In [56]:
import inflect, json, sys, os
import regex as re 
from collections import defaultdict
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString
from datasets import load_dataset

In [2]:
data_train = load_dataset("schema_guided_dstc8", trust_remote_code=True, split="train")
data_val = load_dataset("schema_guided_dstc8", trust_remote_code=True, split="validation")
data_test = load_dataset("schema_guided_dstc8", trust_remote_code=True, split="test")

In [3]:
labels = set()
for data in data_train:
    labels.add(data['services'][0])

In [4]:
labels

{'Banks_1',
 'Buses_1',
 'Buses_2',
 'Calendar_1',
 'Events_1',
 'Events_2',
 'Flights_1',
 'Flights_2',
 'Homes_1',
 'Hotels_1',
 'Hotels_2',
 'Hotels_3',
 'Media_1',
 'Movies_1',
 'Music_1',
 'Music_2',
 'RentalCars_1',
 'RentalCars_2',
 'Restaurants_1',
 'RideSharing_1',
 'RideSharing_2',
 'Services_1',
 'Services_2',
 'Services_3',
 'Travel_1',
 'Weather_1'}

In [5]:
label_set = set(label for label in labels if re.search(r"_1$",label))

In [6]:
label_set

{'Banks_1',
 'Buses_1',
 'Calendar_1',
 'Events_1',
 'Flights_1',
 'Homes_1',
 'Hotels_1',
 'Media_1',
 'Movies_1',
 'Music_1',
 'RentalCars_1',
 'Restaurants_1',
 'RideSharing_1',
 'Services_1',
 'Travel_1',
 'Weather_1'}

In [7]:
def get_user_data(data,i):
    return data['turns']['frames'][i]['slots'][0]['start'],\
            data['turns']['frames'][i]['slots'][0]['exclusive_end'],\
            data['turns']['frames'][i]['slots'][0]['slot'],\
            data['turns']['utterance'][i]
                
def get_system_data(data,i):
    return data['turns']['frames'][i]['slots'][0]['start'],\
            data['turns']['frames'][i]['slots'][0]['exclusive_end'],\
            data['turns']['frames'][i]['actions'][0]['values'],\
            data['turns']['frames'][i]['slots'][0]['slot'],\
            data['turns']['utterance'][i]
                

In [9]:
def get_user(start_idx, end_idx, slot_val, text):
    zipped = list(zip(start_idx, end_idx, slot_val))
    sorted_list = sorted(zipped, key=lambda x:x[0], reverse=True)
    for start, end, value in sorted_list:
        text = text[:start]+ f"[{text[start:end]}]" +f"({value})"+text[end:]
    return text

def get_sys(start_idx, end_idx, slot_name, slot_val, text):
    zipped = list(zip(start_idx, end_idx, slot_name, slot_val))
    sorted_list = sorted(zipped, key=lambda x:x[0], reverse=True)
    for start, end, name, value in sorted_list:
        text = text[:start]+ f"{{{value}}}"+text[end:]
    return text

In [45]:
def format_label(label):
    label = label.split('_')[0].lower()
    p = inflect.engine()
    if re.search(r's$',label):
        label = p.singular_noun(label)
    return label

def fetch(data_file, label,  nlu_data, domain_data, slots, stories):
    count=0

    for data in data_file:
        if data['services'][0] == label:
            intent = format_label(data['services'][0])
            utterence = data['turns']['utterance']

            for i in range(0, len(utterence),2):
                usr_strt_idx, usr_end_idx, usr_slot_val, usr_text  = get_user_data(data,i)
                sys_strt_idx, sys_end_idx, sys_slot_name, sys_slot_val, sys_text = get_system_data(data,i+1)
                user = get_user(usr_strt_idx, usr_end_idx, usr_slot_val, usr_text)
                system = get_sys(sys_strt_idx,sys_end_idx,sys_slot_name,sys_slot_val,sys_text)
                nlu_data[intent].append(user)
                domain_data['utter_'+intent].append(system)
                slots.update(usr_slot_val or [])
                slots.update(sys_slot_val or [])
                stories[intent].update(usr_slot_val or [])
                stories[intent].update(sys_slot_val or [])                
                count+=1
            if count>=34:
                return

In [46]:
nlu_data=defaultdict(list)
domain_data=defaultdict(list)
slots=set()
stories=defaultdict(set)

In [47]:
for label in label_set:
    fetch(data_train, label, nlu_data, domain_data, slots, stories)
    

In [13]:
intents = [format_label(label) for label in label_set]

In [90]:
nlu_yaml = {"version": "3.1", "nlu":[]}
domain_yaml = {"version": "3.1", "intent":intents,"slots":{} ,"responses":{}}
stories_yaml = {"version": "3.1", "stories":[]}

In [91]:
for intent, text in nlu_data.items():
    nlu_format = {
        "intent": intent,
        "examples": LiteralScalarString('\n'.join([f'- {t}' for t in text]) + '\n')
        }
    
    nlu_yaml['nlu'].append(nlu_format)

for intent, text in domain_data.items():
    domain_yaml['responses'][intent]=[{"text": txt} for txt in text]

for slot in slots:
    domain_yaml['slots'][slot]= {"type": "text"}

for story in stories:
    story_format = {
        "story": story+" path",
        "steps":[
            {'intent':'greet'},
            {'action':'utter_greet'},
            {'intent':story},
            {'action':'utter_'+story}
        ]
    }
    stories_yaml['stories'].append(story_format)

yaml = YAML()
yaml.default_flow_style = False
yaml.width = 4096
yaml.indent(mapping=2, sequence=2, offset=2)
yaml.preserve_quotes = True

In [92]:
yaml.block_seq_indent = 0
# with open("/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/data/nlu.yml", "w") as f:
#     yaml.dump(nlu_yaml, f)

with open("/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/data/stories.yml", "w") as f:
    yaml.dump(stories_yaml, f)
    
# yaml.block_seq_indent = 2
# with open("/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/domain.yml", "w") as f:
#     yaml.dump(domain_yaml, f)


In [191]:
def intent_tagging(train_data, stop=10):
    count=0
    for data in train_data:
        # print(data['turns']['frames'][0].keys())
        utterence = data['turns']['utterance']
        for i in range(len(utterence)):
            end_index = data['turns']['frames'][i]['slots'][0]['exclusive_end']
            slot_val = data['turns']['frames'][i]['slots'][0]['slot']
            start_index = data['turns']['frames'][i]['slots'][0]['start']
            slots = data['turns']['frames'][i]['actions'][0]['values']
            print("slot_val", slot_val)
            print(slots)
            print("start_index", start_index, "end_index", end_index)
            print(data['turns']['utterance'][i])
            if len(start_index)>1 and len(slot_val)>1:
                for idx in range(len(start_index)-1):
                    print(data['turns']['utterance'][i][:start_index[idx]] +\
                            # f"[{data['turns']['utterance'][i][start_index[idx]:end_index[idx]]}]"+\
                            f"{{{slot_val[idx]}}}" + \
                            data['turns']['utterance'][i][end_index[idx]:start_index[idx+1]]+ \
                            # f"[{data['turns']['utterance'][i][start_index[idx+1]:end_index[idx+1]]}]" +\
                            f"{{{slot_val[idx+1]}}}"+data['turns']['utterance'][i][end_index[idx+1]:])
            elif len(start_index)>1 and len(slot_val)==1:
                for idx in range(len(start_index)-1):
                    print(data['turns']['utterance'][i][:start_index[idx]] +\
                            # f"[{data['turns']['utterance'][i][start_index[idx]:end_index[idx]]}]"+\
                            f"{{{slot_val[0]}}}" + \
                            data['turns']['utterance'][i][end_index[idx]:start_index[idx+1]]+ \
                            # f"[{data['turns']['utterance'][i][start_index[idx+1]:end_index[idx+1]]}]" +\
                            f"{{{slot_val[0]}}}"+data['turns']['utterance'][i][end_index[idx+1]:])
            elif len(start_index)==1: 
                print(data['turns']['utterance'][i][:start_index[0]] +\
                    # f"[{data['turns']['utterance'][i][start_index[0]:end_index[0]]}]"+\
                    f"{{{slot_val[0]}}}" + \
                    data['turns']['utterance'][i][end_index[0]:])
                
        count+=1
        if count>stop:
            break

In [503]:
test = "The event will start at 10:30 pm and you need to go for a Movie show."
test[24:32]

slot_value = ['event_time', 'event_name']
start_index = [24, 47]
end_index =  [29, 66]
slot_name = ['Dentist appointment', '10 am']
sentence = "The event will start at 10 am and the event is Dentist appointment."

In [372]:
slots = list(zip(start_index, end_index, slot_value, slot_name))

In [492]:
slots = sorted(slots, key=lambda x: x[0], reverse=True)


In [374]:
for start, end, value, slot_name in slots:
        sentence = sentence[:start] + f"{{{value}}}" + sentence[end:]

In [375]:
sentence

'The event will start at {event_time} and the event is {event_name}.'

In [368]:
start_idx = [47, 24]
end_idx = [66, 29]
slot_names = [['Dentist appointment'], ['10 am']]
slot_val = ['event_name', 'event_time']
text = "The event will start at 10 am and the event is Dentist appointment."

output = get_sys(start_idx, end_idx, slot_names, slot_val, text)
print(output)


The event will start at {['10 am']} and the event is {['Dentist appointment']}.


In [194]:
intent_tagging(calendar_data,100)

slot_val []
[['GetEvents']]
start_index [] end_index []
I totally forgot my schedule and appointments of my calendar. Will you find events on my calendar?
slot_val []
[[]]
start_index [] end_index []
It is a pleasure for me, Which date i need to check?
slot_val ['event_date']
[['March 5th']]
start_index [18] end_index [27]
Actually it is on March 5th.
Actually it is on {event_date}.
slot_val ['event_name', 'event_time']
[['Dentist appointment'], ['10 am']]
start_index [47, 24] end_index [66, 29]
The event will start at 10 am and the event is Dentist appointment.
The event will start at 10 am and the event is {event_name}{event_time} and the event is Dentist appointment.
slot_val ['event_date']
[['March 13th'], []]
start_index [96] end_index [106]
Ok Fine.It is a very useful information. Do i have any other events scheduled? Will you find on March 13th?
Ok Fine.It is a very useful information. Do i have any other events scheduled? Will you find on {event_date}?
slot_val ['event_name', '

In [223]:
data_train

Dataset({
    features: ['dialogue_id', 'services', 'turns'],
    num_rows: 16142
})

In [176]:
def filter_data(dataset, domain):
    return dataset.filter(lambda x: any(domain in service for service in x['services']))

In [177]:
calendar_data = filter_data(data_train,'Calendar_1')

Filter:   0%|          | 0/16142 [00:00<?, ? examples/s]

In [593]:
calendar_data[0]['turns']

{'speaker': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
 'utterance': ['I totally forgot my schedule and appointments of my calendar. Will you find events on my calendar?',
  'It is a pleasure for me, Which date i need to check?',
  'Actually it is on March 5th.',
  'The event will start at 10 am and the event is Dentist appointment.',
  'Ok Fine.It is a very useful information. Do i have any other events scheduled? Will you find on March 13th?',
  'The event will start at 10:30 pm and you need to go for a Movie show.',
  "Ok that's great and i need to know whether i am available on that day. Please find also that?",
  'On March 13th, you have an emply slot starting at 8 pm and slot ends at 10:30 pm',
  "Ok that's great and i need to thank you for your wonderful work.",
  'Take care Have a great day'],
 'frames': [{'service': ['Calendar_1'],
   'slots': [{'slot': [], 'start': [], 'exclusive_end': []}],
   'state': [{'active_intent': 'GetEvents',
     'requested_slots': [],
     'slot_values': {'sl

In [397]:
t=0
i=3
for data in calendar_data:
    # print(data['turns']['frames'][0].keys())
    end_index = data['turns']['frames'][i]['slots'][0]['exclusive_end']
    slot_val = data['turns']['frames'][i]['slots'][0]['slot']
    # slot_val = "#####"
    start_index = data['turns']['frames'][i]['slots'][0]['start']
    slots =data['turns']['frames'][i]['actions'][0]['slot']
    print("]]]]]]",data['turns']['utterance'][i])
    # print(slot_val)
    if len(start_index)>1 and len(slot_val)>1:
        for idx in range(len(start_index)-1):
            print(data['turns']['utterance'][i][:start_index[idx]] +\
                    f"[{data['turns']['utterance'][i][start_index[idx]:end_index[idx]]}]"+\
                    f"({slot_val[idx]})" + \
                    data['turns']['utterance'][i][end_index[idx]:start_index[idx+1]]+ \
                    f"[{data['turns']['utterance'][i][start_index[idx+1]:end_index[idx+1]]}]" +\
                    f"({slot_val[idx+1]})"+data['turns']['utterance'][i][end_index[idx+1]:])
    elif len(start_index)>1 and len(slot_val)==1:
        for idx in range(len(start_index)-1):
            print(data['turns']['utterance'][i][:start_index[idx]] +\
                    f"[{data['turns']['utterance'][i][start_index[idx]:end_index[idx]]}]"+\
                    f"({slot_val[0]})" + \
                    data['turns']['utterance'][i][end_index[idx]:start_index[idx+1]]+ \
                    f"[{data['turns']['utterance'][i][start_index[idx+1]:end_index[idx+1]]}]" +\
                    f"({slot_val[0]})"+data['turns']['utterance'][i][end_index[idx+1]:])
    elif len(start_index)==1: 
        print(data['turns']['utterance'][i][:start_index[0]] +\
            f"[{data['turns']['utterance'][i][start_index[0]:end_index[0]]}]"+\
            f"({slot_val[idx]})" + \
            data['turns']['utterance'][i][end_index[idx]:])
    else:
        print('Missing indexes', start_index)
    print('----------##########----------')
    print(data['turns']['utterance'][i][:])
    print('----------##########----------')
    print(data['turns']['frames'][i]['service'])
    print("Starts with ---->",data['turns']['frames'][i]['slots'][0])
    print(data['turns']['frames'][i]['state'])
    print(data['turns']['frames'][i]['actions'])
    print(data['turns']['frames'][i]['service_results'])
    print(data['turns']['frames'][i]['service_call'])   
    print(slot_val) 
    t+=1
    if t>200:
        break

]]]]]] The event will start at 10 am and the event is Dentist appointment.
The event will start at 10 am and the event is [Dentist appointment](event_name)[10 am](event_time) and the event is Dentist appointment.
----------##########----------
The event will start at 10 am and the event is Dentist appointment.
----------##########----------
['Calendar_1']
Starts with ----> {'slot': ['event_name', 'event_time'], 'start': [47, 24], 'exclusive_end': [66, 29]}
[{'active_intent': '', 'requested_slots': [], 'slot_values': {'slot_name': [], 'slot_value_list': []}}]
[{'act': [11, 11], 'slot': ['event_name', 'event_time'], 'canonical_values': [['Dentist appointment'], ['10:00']], 'values': [['Dentist appointment'], ['10 am']]}]
[{'service_results_list': [{'service_slot_name': ['event_date', 'event_location', 'event_name', 'event_time'], 'service_canonical_value': ['2019-03-05', '88 Tully Road #112', 'Dentist appointment', '10:00']}]}]
[{'method': 'GetEvents', 'parameters': {'parameter_slot_name

In [1]:
(['service', 'slots', 'state', 'actions', 'service_results', 'service_call'])

['service', 'slots', 'state', 'actions', 'service_results', 'service_call']