In [118]:
import inflect, json, sys, os
import regex as re 
from collections import defaultdict
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString

In [2]:
# Make the parent directory importable (the notebook imports the
# `intent_classifier` package right after this cell). Guarded so that
# re-running the cell does not keep appending duplicate sys.path entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
# Location of the raw intent dataset. Defaults to the original local path but
# can be overridden through the SUPPORTIQ_DATA_PATH environment variable so the
# notebook can run on machines where this external volume is not mounted.
path = os.environ.get(
    "SUPPORTIQ_DATA_PATH",
    "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/data/data_full.json",
)

In [4]:
from intent_classifier.src.utils.loader import load_data

In [5]:
# Load the dataset at `path`; the project loader's result is unpacked into the
# two objects `train` and `val`.
train, val = load_data(path)

In [6]:
# Display the summary of the `train` split (features and row count).
train

Dataset({
    features: ['input', 'label'],
    num_rows: 15100
})

In [7]:
# Accumulator mapping each intent label to the list of its example texts.
nlu_data = defaultdict(list)

In [8]:
# Bucket every training example's text under its intent label.
for example in train:
    nlu_data[example['label']].append(example['input'])

In [9]:
# Skeleton of the NLU document: a version tag plus an (initially empty) list
# of per-intent entries that later cells append to.
nlu_yaml = {"version": "3.1", "nlu":[]}

In [10]:
# Turn each intent's utterances into one NLU entry whose examples are a single
# "- utterance" bullet block, wrapped in LiteralScalarString so ruamel emits it
# as a literal block scalar.
for intent_name, utterances in nlu_data.items():
    bullet_lines = [f'- {t}' for t in utterances]
    examples_block = LiteralScalarString('\n'.join(bullet_lines) + '\n')
    nlu_yaml['nlu'].append({"intent": intent_name, "examples": examples_block})

# YAML emitter configuration used when dumping the document.
yaml = YAML()
yaml.preserve_quotes = True
yaml.default_flow_style = False
yaml.indent(mapping=2, sequence=2, offset=2)
# Set after indent() on purpose: statement order is kept as in the original
# (presumably this overrides the dash offset chosen above — confirm).
yaml.block_seq_indent = 0

In [11]:
# Write the NLU training document. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the examples are written identically regardless of
# the platform's default locale encoding.
with open(
    "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/data/nlu_train.yml",
    "w",
    encoding="utf-8",
) as f:
    yaml.dump(nlu_yaml, f)

In [12]:
from datasets import load_dataset

In [13]:
# Load the three splits of the schema-guided dialogue dataset, in the order
# train, validation, test.
data_train, data_val, data_test = (
    load_dataset("schema_guided_dstc8", trust_remote_code=True, split=split_name)
    for split_name in ("train", "validation", "test")
)

In [14]:
# Exploration: for the first three dialogues whose first service is Banks_1,
# print all utterances and the annotations of the frame at turn index 9.
i = 0
for data in data_train:
    if data['services'][0] != 'Banks_1':
        continue

    turns = data['turns']
    print("||||---->", turns['utterance'])

    d = turns['frames'][9]
    for tag, field in (
        ("Slots---->", 'slots'),
        ("Service result---->", 'service_results'),
        ("Action---->", 'actions'),
        ("State---->", 'state'),
    ):
        print(tag, d[field])
    print('######')

    i += 1
    if i > 2:
        break

||||----> ["What's my balance?", 'In checking or savings?', 'In checking.', 'Your checking account has $5,118.77.', 'Ok, I want to transfer some money.', 'To whom?', 'To Amir.', 'And what amount would you like to transfer?', 'Send $1,630.', "Please confirm: Transfer $1,630 from your checking account to Amir's checking account.", 'Confirmed.', 'Your transfer is complete.', "Thanks, what's my balance?", 'Your checking account has $3,488.77.', 'Ok, bye.', 'Have a nice day.']
Slots----> [{'slot': ['amount', 'recipient_account_name'], 'start': [25, 62], 'exclusive_end': [31, 66]}]
Service result----> [{'service_results_list': []}]
Action----> [{'act': [2, 2, 2, 2], 'slot': ['account_type', 'amount', 'recipient_account_name', 'recipient_account_type'], 'canonical_values': [['checking'], ['1630'], ['Amir'], ['checking']], 'values': [['checking'], ['$1,630'], ['Amir'], ['checking']]}]
State----> [{'active_intent': '', 'requested_slots': [], 'slot_values': {'slot_name': [], 'slot_value_list': [

In [16]:
# Gather the distinct first-listed service of every training dialogue.
# `j` ends up holding the number of dialogues visited (0 if there are none).
labels = set()
j = 0
for j, data in enumerate(data_train, start=1):
    labels.add(data['services'][0])

In [17]:
# Display the set of distinct service labels collected above.
labels

{'Banks_1',
 'Buses_1',
 'Buses_2',
 'Calendar_1',
 'Events_1',
 'Events_2',
 'Flights_1',
 'Flights_2',
 'Homes_1',
 'Hotels_1',
 'Hotels_2',
 'Hotels_3',
 'Media_1',
 'Movies_1',
 'Music_1',
 'Music_2',
 'RentalCars_1',
 'RentalCars_2',
 'Restaurants_1',
 'RideSharing_1',
 'RideSharing_2',
 'Services_1',
 'Services_2',
 'Services_3',
 'Travel_1',
 'Weather_1'}

In [148]:
# Reference note: the frame field names that the inspection cells in this
# notebook read from a turn's frame.
(['service', 'slots', 'state', 'actions', 'service_results', 'service_call'])

['service', 'slots', 'state', 'actions', 'service_results', 'service_call']

In [166]:
# Display the summary of the training split (features and row count).
data_train

Dataset({
    features: ['dialogue_id', 'services', 'turns'],
    num_rows: 16142
})

In [25]:
# Prototype extraction for a single service ('Hotels_2'): collect slot
# annotations for every turn and split utterances into user / system banks.
# Stops scanning once at least 34 user/system pairs have been gathered.
count = 0
nlu_bank, domain_bank, stories_bank = [], [], []
entities, values = [], []
for data in data_train:
    if data['services'][0] == 'Hotels_2':
        turns = data['turns']
        utterence = turns['utterance']
        slots = turns['frames']

        # Slot names and canonical values come from the first action entry
        # of each turn's frame.
        for turn_idx in range(len(utterence)):
            first_action = slots[turn_idx]['actions'][0]
            slot_value, canonical_value = (
                first_action['slot'],
                first_action['canonical_values'],
            )
            entities.append(slot_value)
            values.append(canonical_value)

        # Even positions go to nlu_bank, the following odd position to
        # domain_bank.
        for user_idx in range(0, len(utterence), 2):
            user, system = utterence[user_idx], utterence[user_idx + 1]
            nlu_bank.append(user)
            domain_bank.append(system)
            count += 1
    if count >= 34:
        break

In [89]:
# Keep only the service labels that end in "_1".
label_set = {label for label in labels if re.search(r"_1$", label)}

In [90]:
# Display the filtered set of "_1" service labels.
label_set

{'Banks_1',
 'Buses_1',
 'Calendar_1',
 'Events_1',
 'Flights_1',
 'Homes_1',
 'Hotels_1',
 'Media_1',
 'Movies_1',
 'Music_1',
 'RentalCars_1',
 'Restaurants_1',
 'RideSharing_1',
 'Services_1',
 'Travel_1',
 'Weather_1'}

[]

In [144]:
def format_label(label):
    """Convert a service label such as ``'Banks_1'`` into an intent name.

    The part before the first underscore is lower-cased; if it ends in ``s``
    it is singularised with ``inflect`` (``'Banks_1'`` -> ``'bank'``).

    Args:
        label: Service label string, e.g. ``'Hotels_1'``.

    Returns:
        The lower-cased (and, where applicable, singular) base name as a
        string. If ``inflect`` does not consider the word a plural, the
        lower-cased base name is returned unchanged.
    """
    base = label.split('_')[0].lower()
    if not re.search(r's$', base):
        return base

    singular = inflect.engine().singular_noun(base)
    # singular_noun() returns False when the word is not recognised as a
    # plural; fall back to the base name instead of leaking False to callers.
    return singular if singular else base

def fetch(data_file, label,  nlu_data, domain_data, stories_data, entities, values, max_pairs=34):
    """Collect user/system utterance pairs and slot annotations for one service.

    Scans ``data_file`` for dialogues whose first service equals ``label`` and
    appends, in place:

    * every even-positioned utterance to ``nlu_data[intent]``,
    * the utterance that follows it to ``domain_data['utter_' + intent]``,
    * for every turn, the slot names and canonical values of the first action
      entry of that turn's frame to ``entities`` and ``values``.

    ``intent`` is ``format_label(label)``.

    Args:
        data_file: Iterable of dialogue records with ``'services'`` and
            ``'turns'`` (``'utterance'`` and ``'frames'`` lists) entries.
        label: Service label to select, compared with ``services[0]``.
        nlu_data: Mapping of intent -> list of user utterances
            (must create missing lists on access, e.g. ``defaultdict(list)``).
        domain_data: Mapping of response name -> list of system utterances
            (same requirement as ``nlu_data``).
        stories_data: Unused; kept so existing callers keep working.
        entities: List extended with per-turn slot-name lists.
        values: List extended with per-turn canonical-value lists.
        max_pairs: Stop after the dialogue in which the number of collected
            pairs reaches this value. The check runs once per matching
            dialogue, so the final count may exceed it.

    Returns:
        None. All results are written into the passed-in containers.
    """
    # The intent depends only on `label`, so compute it once.
    intent = format_label(label)
    response_key = 'utter_' + intent
    pairs_collected = 0

    for dialogue in data_file:
        if dialogue['services'][0] != label:
            continue

        turns = dialogue['turns']
        utterances = turns['utterance']
        frames = turns['frames']

        for turn_index in range(len(utterances)):
            first_action = frames[turn_index]['actions'][0]
            entities.append(first_action['slot'])
            values.append(first_action['canonical_values'])

        # zip() drops a trailing utterance that has no partner, so a dialogue
        # with an odd number of utterances no longer raises IndexError.
        for user, system in zip(utterances[::2], utterances[1::2]):
            nlu_data[intent].append(user)
            domain_data[response_key].append(system)
            pairs_collected += 1

        if pairs_collected >= max_pairs:
            return

In [None]:
# Fresh, independent accumulators for the extraction run.
nlu_data, domain_data = defaultdict(list), defaultdict(list)
stories_data, entities, values = [], [], []

In [146]:
# Collect examples for every selected service. Iterating in sorted order keeps
# the generated data in the same order across kernel restarts; plain set
# iteration order for strings can differ between Python processes.
for label in sorted(label_set):
    fetch(data_train, label, nlu_data, domain_data, stories_data, entities, values)
    

In [189]:
# Intent names for the domain file, derived from the selected service labels.
# Sorted so the list order is reproducible between runs (set iteration order
# for strings is not stable across Python processes).
intents = [format_label(label) for label in sorted(label_set)]

In [None]:
# Skeleton of the NLU document; per-intent entries are appended later.
nlu_yaml = {"version": "3.1", "nlu": []}
# Skeleton of the domain document. Rasa's domain format lists intents under
# the plural key "intents"; the singular "intent" is not a domain section.
domain_yaml = {"version": "3.1", "intents": intents, "responses": {}}

In [191]:
# NLU entries: one per intent, with the examples rendered as a "- utterance"
# bullet block wrapped in LiteralScalarString.
for intent_name, utterances in nlu_data.items():
    bullets = '\n'.join([f'- {t}' for t in utterances]) + '\n'
    nlu_yaml['nlu'].append(
        {"intent": intent_name, "examples": LiteralScalarString(bullets)}
    )

# Domain responses: every collected system utterance becomes one
# {"text": ...} variation under its response name.
for response_name, replies in domain_data.items():
    domain_yaml['responses'][response_name] = [{"text": reply} for reply in replies]

# YAML emitter configuration; the large width avoids wrapping long lines.
yaml = YAML()
yaml.width = 4096
yaml.preserve_quotes = True
yaml.default_flow_style = False
yaml.indent(mapping=2, sequence=2, offset=2)
# Kept after indent(), exactly as in the original statement order.
yaml.block_seq_indent = 0

In [192]:
# Write the generated NLU and domain documents. The encoding is pinned to
# UTF-8 so the files do not depend on the platform's default locale encoding.
with open(
    "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/data/nlu_I.yml",
    "w",
    encoding="utf-8",
) as f:
    yaml.dump(nlu_yaml, f)

with open(
    "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/rasa/data/domain_II.yml",
    "w",
    encoding="utf-8",
) as f:
    yaml.dump(domain_yaml, f)

In [245]:
# Exploration: for the first two dialogues, print two character spans of the
# utterance at turn index 5 followed by every field of that turn's frame.
# The spans 14:28 and 68:76 are the slot start / exclusive_end offsets that the
# printed 'slots' field reports for the first dialogue.
# (The available frame fields can be listed with
#  data['turns']['frames'][0].keys().)
t = 0
for t, data in enumerate(data_train, start=1):
    utterance_5 = data['turns']['utterance'][5]
    print(utterance_5[14:28])
    print(utterance_5[68:76])

    frame_5 = data['turns']['frames'][5]
    for field in ('service', 'slots', 'state', 'actions', 'service_results', 'service_call'):
        print(frame_5[field])

    if t > 1:
        break

71 Saint Peter
San Jose
['Restaurants_1']
[{'slot': ['restaurant_name', 'city'], 'start': [14, 68], 'exclusive_end': [28, 76]}]
[{'active_intent': '', 'requested_slots': [], 'slot_values': {'slot_name': [], 'slot_value_list': []}}]
[{'act': [11, 11], 'slot': ['restaurant_name', 'city'], 'canonical_values': [['71 Saint Peter'], ['San Jose']], 'values': [['71 Saint Peter'], ['San Jose']]}]
[{'service_results_list': [{'service_slot_name': ['city', 'cuisine', 'has_live_music', 'phone_number', 'price_range', 'restaurant_name', 'serves_alcohol', 'street_address'], 'service_canonical_value': ['San Jose', 'American', 'False', '408-971-8523', 'moderate', '71 Saint Peter', 'True', '71 North San Pedro Street']}, {'service_slot_name': ['city', 'cuisine', 'has_live_music', 'phone_number', 'price_range', 'restaurant_name', 'serves_alcohol', 'street_address'], 'service_canonical_value': ['San Jose', 'American', 'False', '408-261-5787', 'moderate', 'Bazille', 'True', '2400 Forest Avenue']}, {'service_

In [236]:
# Reference note: the frame field names printed by the inspection cell above.
(['service', 'slots', 'state', 'actions', 'service_results', 'service_call'])

['service', 'slots', 'state', 'actions', 'service_results', 'service_call']