In [1]:
from datasets import load_dataset
import inflect
import regex as re
import json

In [2]:
# Load the train / validation / test splits of "schema_guided_dstc8", in that
# order, and bind each one to its own module-level name.
data_train, data_val, data_test = (
    load_dataset("schema_guided_dstc8", trust_remote_code=True, split=split_name)
    for split_name in ("train", "validation", "test")
)

In [3]:
def format_label(label):
    """Normalise a raw service label into a lowercase, singular domain name.

    The text before the first underscore is kept and lower-cased (for example
    ``"Hotels_1"`` becomes ``"hotels"``).  If that prefix ends in ``s`` it is
    handed to ``inflect`` to be singularised.

    Args:
        label: Raw service name string, optionally suffixed with ``_<n>``.

    Returns:
        The normalised domain string.  When ``inflect`` does not recognise the
        prefix as a plural noun, the lower-cased prefix is returned unchanged.
    """
    label = label.split('_')[0].lower()
    if re.fullmatch(r'\w+s', label):
        # The engine is only built when a candidate plural is actually seen.
        singular = inflect.engine().singular_noun(label)
        # singular_noun() returns False (not a string) for words it does not
        # consider plural; keep the original label instead of returning False.
        if singular:
            label = singular
    return label

def format_data_sgd(data):
    """Turn SGD-style dialogues into ``{"input", "output"}`` training pairs.

    Utterances are consumed two at a time: the utterance at an even index is
    used as the prompt and the one right after it as the target.
    (Assumes turns alternate user/system starting with the user -- TODO
    confirm against the dataset card.)  The prompt is prefixed with the
    normalised name of the first service listed in that turn's frame.

    Args:
        data: Dataset object exposing ``features`` with exactly the keys
            ``dialogue_id``, ``services`` and ``turns``; iterating it yields
            dialogues whose ``turns`` hold parallel ``utterance`` and
            ``frames`` sequences.

    Returns:
        Tuple ``(dataset, label_list)``: ``dataset`` is a list of
        ``{"input": str, "output": str}`` dicts and ``label_list`` is the set
        of domain names that were seen.

    Raises:
        ValueError: If ``data.features`` does not have the expected keys.
    """
    if not data.features.keys()=={'dialogue_id', 'services', 'turns'}:
        raise ValueError("Data is not in correct format, please check the format")
    dataset, label_list = [], set()
    # Raw service label -> normalised domain, so each distinct label is only
    # normalised once instead of once per turn.
    domain_by_service = {}
    for d in data:
        turns = d['turns']
        utterance = turns['utterance']
        frames = turns['frames']

        # Stop at len - 1 so a dialogue with an odd number of utterances does
        # not raise IndexError on utterance[i + 1]; the unpaired trailing
        # utterance is skipped.
        for i in range(0, len(utterance) - 1, 2):
            label = frames[i]['service'][0]
            if label not in domain_by_service:
                domain_by_service[label] = format_label(label)
            domain = domain_by_service[label]
            user = f"[Domain: {domain}] User: {utterance[i]}"
            system = utterance[i+1]
            dataset.append({"input":user, 'output':system})
            label_list.add(domain)

    return dataset, label_list

In [4]:
# Convert each split into prompt/response pairs plus the set of domains seen,
# processing train, validation and test in that order.
(
    (train_dataset, labels_train),
    (val_dataset, labels_val),
    (test_dataset, labels_test),
) = (format_data_sgd(split_data) for split_data in (data_train, data_val, data_test))

In [5]:
def filter_data(dataset, domain):
    """Keep only the examples that mention ``domain`` in one of their services.

    Matching is by substring: an example is kept when ``domain`` occurs inside
    at least one entry of its ``'services'`` field.

    Args:
        dataset: Object exposing ``filter(predicate)``; each example must have
            a ``'services'`` entry that is an iterable of strings.
        domain: Substring to look for in the service names.

    Returns:
        Whatever ``dataset.filter`` returns for the matching examples.
    """
    def _mentions_domain(example):
        for service_name in example['services']:
            if domain in service_name:
                return True
        return False

    return dataset.filter(_mentions_domain)

In [16]:
# Create the S3 client that the upload cell below uses (s3.upload_file).
import boto3
s3 = boto3.client('s3')

In [26]:
# Local file that gets uploaded to S3 in the next cell.
# NOTE(review): hard-coded absolute path on an external volume; the same
# location is repeated as `path` in the cell that writes val.json.
path_dir = '/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/data/val.json'

In [27]:
# Upload the local val.json to bucket 'gen-ai-repository' under the key
# 'finetuning/flan-t5/data/val.json'.
# NOTE(review): the file at path_dir is produced by convert_into_json in a cell
# that appears later in this notebook, so on a fresh Restart & Run All this
# upload runs before the file has been (re)written.
s3.upload_file(Filename=path_dir, Bucket='gen-ai-repository', Key='finetuning/flan-t5/data/val.json')

In [15]:
def convert_into_json(path,data):
    """Serialise ``data`` as JSON (3-space indent) into the file at ``path``.

    Args:
        path: Destination file path; the file is opened in text write mode,
            so an existing file is overwritten.
        data: Any JSON-serialisable object.

    Returns:
        None.
    """
    with open(path, 'w') as out_file:
        serialised = json.dumps(data, indent=3)
        out_file.write(serialised)

In [16]:
# Write the validation prompt/response pairs (val_dataset) to val.json.
# This is the same file location that path_dir points at for the S3 upload.
path = "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/data/val.json"
convert_into_json(path,val_dataset)

In [8]:
# Append the parent of the current working directory to sys.path --
# presumably so the project's `src` package imported in the next cell can be
# resolved; confirm the notebook is always launched from its own folder.
import sys, os
sys.path.append(os.path.abspath('..'))

In [9]:
from src.utils.loader import tokenize_data

In [10]:
from transformers import T5Tokenizer

In [11]:
# Load the tokenizer for the "google/flan-t5-base" checkpoint; it is passed to
# tokenize_data in a later cell.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")  # or "t5-small", "t5-large", etc.

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [18]:
# Run the project's tokenize_data helper over the data directory for a seq2seq
# task, capping sequences at 64 tokens. Only the first two of the four
# returned values are kept (presumably the tokenized train and validation
# sets -- TODO confirm against src.utils.loader).
train, val, _, _ = tokenize_data(
    path='/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/data',
    task_type='seq2seq',
    tokenizer=tokenizer,
    max_length=64,
)

Map:   0%|          | 0/164982 [00:00<?, ? examples/s]

Map:   0%|          | 0/164982 [00:00<?, ? examples/s]

In [None]:
# Load the train split of the 'pfb30/multi_woz_v22' dataset; it is converted to
# prompt/response pairs by format_data_multi below.
data_multi_woz = load_dataset('pfb30/multi_woz_v22',trust_remote_code=True, split='train')

def format_data_multi(data):
    """Turn single-domain dialogues into ``{"input", "output"}`` training pairs.

    A dialogue is used only when the frame of its first turn lists exactly one
    service; that service name becomes the domain tag for every pair built
    from the dialogue.  Utterances are consumed two at a time: the one at an
    even index is the prompt and the one right after it is the target.
    (Assumes turns alternate user/system starting with the user -- TODO
    confirm against the dataset card.)

    Args:
        data: Dataset object exposing ``features`` with exactly the keys
            ``dialogue_id``, ``services`` and ``turns``; iterating it yields
            dialogues whose ``turns`` hold ``utterance`` and ``frames``
            sequences.

    Returns:
        Tuple ``(dataset, label_list)``: ``dataset`` is a list of
        ``{"input": str, "output": str}`` dicts and ``label_list`` is the set
        of domain names that were used.

    Raises:
        ValueError: If ``data.features`` does not have the expected keys.
    """
    if not data.features.keys()=={'dialogue_id', 'services', 'turns'}:
        raise ValueError("Data is not in correct format, please check the format")
    dataset, label_list = [], set()
    for d in data:
        utterance = d['turns']['utterance']
        try:
            services = d['turns']['frames'][0]['service']
            # Skip multi-domain dialogues and dialogues with no service.
            if len(services) != 1:
                continue
            domain = services[0]
        except (KeyError, IndexError, TypeError):
            # Missing or malformed frame data: skip the dialogue (best effort),
            # but let unrelated errors such as KeyboardInterrupt propagate.
            continue
        label_list.add(domain)
        # Stop at len - 1 so a dialogue with an odd number of utterances does
        # not raise IndexError on utterance[i + 1]; the unpaired trailing
        # utterance is skipped.
        for i in range(0, len(utterance) - 1, 2):
            user = f"[Domain: {domain}] User: {utterance[i]}"
            system = utterance[i+1]
            dataset.append({"input":user, 'output':system})

    return dataset, label_list

# Build prompt/response pairs and the set of domains from the MultiWOZ train split.
# NOTE(review): `dataset_mutli` looks like a typo for "dataset_multi"; the name
# is left unchanged here because later cells may refer to it.
dataset_mutli, labels_multi = format_data_multi(data_multi_woz)