In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Dialogue parsing model for task-oriented dialogue systems
Using the pre-trained BART model (note: the code cells below load `t5-base` instead).
Your model should take the user utterance as input and predict the parsed output in the given
format. As a starting point, you can treat the task as sequence generation, where you
generate the parsed output given the user input. You can then fine-tune BART on the task-specific data.
You are expected to decide how to represent inputs and outputs to pre-trained language models (PLMs) during training and inference.

All train, evaluation and sample test data are given here. All files use utf-8 encoding.
1. There are 31k training samples given in the train.jsonl file. The format of the data is
jsonl where each line represents a training sample serialized in json format.
This includes incorporating a combination of different input fields such as input, history, user_lists, etc.
2. There are 9.2k evaluation samples given in the dev.jsonl file. It follows the same JSONL format
as the training data, except for the “pattern” field.
3. Sample test input and output data are given in the sample_test.jsonl and
sample_output.txt files. Following a realistic setting where the linguistic pattern for a
sample is not known beforehand, the field “pattern” is not available in the test data.
These i/o formats are to be followed strictly.

The fields of each JSONL entry are as follows
(intents and slot-value pairs are explained under “Sample output” below):
1. input: the user utterances
2. history: a list of past user and system utterances
3. user_lists: List of the user's lists. A user list is a named collection of items curated by the
user. See the example below.
4. user_notes: List of the user's notes. A user note consists of a title and text. See the example
below.
5. user_contacts: List of user's contacts
6. output: String indicating parsing output
7. pattern: Linguistic pattern in the user utterance

Sample output:
Consider the output string “Send_digital_object ( medium « Message » recipient
Personal_contact ( person « my mom » ) )”. Here, Send_digital_object is the intent, and
(medium, Message) and (recipient, Personal_contact (person, my mom)) are slot-value pairs.
Note that the data also contains nested slot-value pairs, as can be seen for (person, my mom).
There are a total of 34 intents and 303 slot types in the dataset.

Sample input:
{"input": "Email um please email sarah@gmail.com", "history": [{"user_query": "Could you create a list for me named Costco", "response_text": "Sorry, it looks like you already have a list with that name. Do you still want to make a new one?"}, {"user_query": "Add pears and bananas on it", "response_text": "Got it, I added pears and bananas."}, {"user_query": "apples", "response_text": "Alright, what do you want to call it?"}, {"user_query": "berries", "response_text": "Sure, I made a list called \"Berries\" and added apples."}, {"user_query": "bread", "response_text": "Got it, what do you want to call it?"}], "user_lists": [{"name": "shopping", "items": ["bananas", "shoes"]}, {"name": "office supplies", "items": ["backpacks", "peanuts", "pens"]}, {"name": "nut", "items": ["almond", "acorn"]}, {"name": "Kroger shopping", "items": ["red wine vinegar"]}, {"name": "photo", "items": ["grad pictures"]}, {"name": "fruit salad", "items": ["strawberries"]}], "user_notes": [{"name": "Grocery List", "content": ""}, {"name": "pick up Kennedi at 12", "content": ""}], "user_contacts": ["Eman", "Berry", "Vera", "Clarence", "Phil Leeper", "Toya", "HR", "Kade", "Ariana", "Dr Robertson"], "output": "Send_digital_object ( medium \u00ab email \u00bb recipient Contactable_entity ( contact_id Email_address ( id_form \u00ab sarah@gmail.com \u00bb ) ) )", "pattern": "disfluency"}


In [None]:
# Importing libraries
import argparse
import json
import os
import time
from pathlib import Path
from typing import List, Optional, Dict
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from transformers import (
    T5ForConditionalGeneration,
    BartModel,
    BartForConditionalGeneration,
    T5Tokenizer,
    BartTokenizer,
    BartConfig,
    AdamW,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

In [None]:
# Report the installed PyTorch version so that runs can be reproduced.
print(torch.__version__)

In [None]:
def load_data(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines are not records; skipping them
            # avoids a spurious JSONDecodeError on otherwise valid files.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

In [None]:
# Locations of the training and evaluation splits in the Kaggle input mount.
train_path = "/kaggle/input/assistant-data/train.jsonl"
dev_path = "/kaggle/input/assistant-data/dev.jsonl"

# Read both splits fully into memory (training split first, then dev).
train_data, dev_data = (load_data(split_path) for split_path in (train_path, dev_path))

In [None]:
# Peek at the first raw line of the training file to inspect the record layout.
# The context manager closes the handle as soon as the line has been read, and
# the encoding is explicit because the data files are UTF-8.
with open(train_path, "r", encoding="utf-8") as f:
    first_line = next(f, None)  # None when the file is empty
if first_line is not None:
    print(first_line)

In [None]:
# Number of samples per batch for both the training and the dev DataLoader.
batch_size = 4
# Token budget for model inputs; the tokenizer pads/truncates inputs to this length.
max_input_length = 512
# Token budget for targets; also used as the generation length cap at evaluation time.
max_output_length = 128

In [None]:

# Define the model.
# The active backbone is T5-base; the BART checkpoints below are kept only as
# commented-out alternatives.
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
# #model = BartModel.from_pretrained("facebook/bart-large")
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

model = T5ForConditionalGeneration.from_pretrained("t5-base")
#model = BartModel.from_pretrained("facebook/bart-large")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Token-level cross-entropy that ignores positions holding the padding id.
# NOTE(review): nothing else in the visible notebook calls `loss_fn` (the
# training loop reads `outputs.loss` from the model) - confirm it is still needed.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Define the device: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
loss_fn.to(device)

In [None]:
def preprocess_input(data_sample):
    """Flatten one data sample into the single source string fed to the model.

    The utterance comes first, followed by the dialogue history, the user's
    lists, notes and contacts, each introduced by a bracketed marker
    (``[history]``, ``[user_lists]``, ``[user_notes]``, ``[user_contacts]``).

    Args:
        data_sample: Mapping with the keys ``input``, ``history``,
            ``user_lists``, ``user_notes`` and ``user_contacts``.

    Returns:
        The serialized input text.
    """
    utterance = data_sample['input']

    # One "user_query: ... response_text: ..." fragment per past dialogue turn.
    turns = []
    for turn in data_sample['history']:
        turns.append(f"user_query: {turn['user_query']}. response_text: {turn['response_text']}.")
    history_part = ' '.join(turns)

    # One "<list name>: item, item." fragment per user list.
    lists = []
    for entry in data_sample['user_lists']:
        lists.append(f"{entry['name']}: {', '.join(entry['items'])}.")
    lists_part = ' '.join(lists)

    # One "<note name>: <content>." fragment per user note.
    notes = []
    for note in data_sample['user_notes']:
        notes.append(f"{note['name']}: {note['content']}.")
    notes_part = ' '.join(notes)

    contacts_part = ', '.join(data_sample['user_contacts'])

    return f"{utterance} [history] {history_part} [user_lists] {lists_part} [user_notes] {notes_part} [user_contacts] {contacts_part}"

def preprocess_output(data_sample):
    """Build the training target: the parse string followed by its pattern tag.

    Args:
        data_sample: Mapping with the keys ``output`` and ``pattern``.

    Returns:
        ``"<output> [pattern] <pattern>"``, where an empty pattern is
        replaced by the label ``fluency``.
    """
    parse_text = data_sample["output"]
    tag = data_sample["pattern"]
    # Samples without a pattern label are given an explicit one.
    tag = "fluency" if tag == "" else tag
    return f"{parse_text} [pattern] {tag}"

In [None]:
# Count the training samples whose "pattern" field is empty.
# `l` holds the 1-based positions of those samples in `train_data`
# (position 1 is the first sample), so entries line up with line numbers
# in a file that has one record per line.
l = [position for position, sample in enumerate(train_data, start=1) if sample["pattern"] == ""]
print(len(l))

In [None]:
def preprocessor_inp(data):
    """Turn raw training samples into parallel lists of source and target text.

    Args:
        data: Iterable of sample mappings accepted by ``preprocess_input``
            and ``preprocess_output``.

    Returns:
        ``(inp, out)``: two lists of equal length, where ``out[k]`` is the
        target string for ``inp[k]``.
    """
    inp, out = [], []
    for sample in data:
        inp.append(preprocess_input(sample))
        # Exactly one target per input: appending a second entry here would
        # leave `out` twice as long as `inp` and misalign every pair.
        out.append(preprocess_output(sample))
    return inp, out

def preprocessor_dev(data):
    """Turn raw evaluation samples into parallel lists of source and gold text.

    The gold target is the bare ``output`` field; no pattern tag is appended.

    Args:
        data: Iterable of sample mappings accepted by ``preprocess_input``
            that also carry an ``output`` key.

    Returns:
        ``(inp, out)``: two lists of equal length.
    """
    pairs = [(preprocess_input(sample), sample["output"]) for sample in data]
    inp = [source for source, _ in pairs]
    out = [target for _, target in pairs]
    return inp, out

def preprocessor_inp(data):
    """Turn raw training samples into parallel lists of source and target text.

    Args:
        data: Iterable of sample mappings accepted by ``preprocess_input``
            and ``preprocess_output``.

    Returns:
        ``(inp, out)``: two lists of equal length, where ``out[k]`` is the
        pattern-tagged target for ``inp[k]``.
    """
    pairs = [(preprocess_input(sample), preprocess_output(sample)) for sample in data]
    inp = [source for source, _ in pairs]
    out = [target for _, target in pairs]
    return inp, out

In [None]:
# Build the source/target text pairs for both splits. Dev targets are the bare
# "output" strings, as returned by preprocessor_dev.
X_train, Y_train = preprocessor_inp(train_data)
X_dev, Y_dev = preprocessor_dev(dev_data)

In [None]:
# Settings shared by every tokenizer call: pad each sequence to its length
# budget, truncate anything longer, and return PyTorch tensors.
_encode_kwargs = dict(padding='max_length', truncation=True, return_tensors='pt')

# Sources use the input budget, targets the output budget.
X_train_tokenized = tokenizer(X_train, max_length=max_input_length, **_encode_kwargs)
Y_train_tokenized = tokenizer(Y_train, max_length=max_output_length, **_encode_kwargs)
X_dev_tokenized = tokenizer(X_dev, max_length=max_input_length, **_encode_kwargs)
Y_dev_tokenized = tokenizer(Y_dev, max_length=max_output_length, **_encode_kwargs)

In [None]:
# Scratch check: draw 10 distinct random indices from range(1024), print them,
# and display the attention mask of the tokenized training targets.
rand_rows = torch.randperm(1024)[:10]
print(rand_rows)
Y_train_tokenized["attention_mask"]

In [None]:
class dataset(Dataset):
    """Tensor-backed sequence-to-sequence dataset built from tokenizer encodings.

    Args:
        input_encodings: Mapping with ``input_ids`` and ``attention_mask``
            tensors, one row per sample.
        output_encodings: Mapping whose ``input_ids`` tensor holds the target
            token ids, one row per sample.
        size: When non-zero, keep a random subset of at most ``size`` rows
            (the same rows are taken from inputs and targets). ``0`` keeps
            every row in its original order.
    """
    def __init__(self, input_encodings, output_encodings, size = 0):
        self.size = size
        if size:
            # Random row selection; yields fewer than `size` rows when the
            # encodings hold fewer samples than requested.
            rows = torch.randperm(len(output_encodings["input_ids"]))[:size]
            self.input_ids = input_encodings["input_ids"][rows]
            self.attention_mask = input_encodings["attention_mask"][rows]
            self.outputs = output_encodings["input_ids"][rows]
        else:
            self.input_ids = input_encodings["input_ids"]
            self.attention_mask = input_encodings["attention_mask"]
            self.outputs = output_encodings["input_ids"]
    def __len__(self):
        # Report the number of rows actually stored. Returning the requested
        # `size` would over-report when `size` exceeds the available samples
        # and make index-based access run past the end of the tensors.
        return len(self.input_ids)
    def __getitem__(self,idx):
        # Each field is moved to the module-level `device` when it is fetched.
        # Key order is input_ids, attention_mask, labels.
        item = dict()
        item["input_ids"] = self.input_ids[idx].to(device)
        item["attention_mask"] = self.attention_mask[idx].to(device)
        item["labels"] = self.outputs[idx].to(device)
        return item

In [None]:
# Wrap the tokenized splits; `size` is left at its default of 0, so every row is kept.
train_dataset = dataset(X_train_tokenized, Y_train_tokenized)
dev_dataset = dataset(X_dev_tokenized, Y_dev_tokenized)

In [None]:
# Shuffle only the training batches. The dev loader keeps the dataset order so
# that evaluation outputs come back in a stable, reproducible order and can be
# inspected by index against the dev split.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

In [None]:
#saved_model = torch.load("/kaggle/input/todp-2/model1.pth") 
#model.load_state_dict(saved_model)

In [None]:
# Optimisation hyper-parameters.
lr = 5e-5
num_epochs = 4

# `lr` is passed through so that editing the constant above actually changes
# the optimiser's learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)

# The training loop steps the scheduler once per batch, so the schedule spans
# every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
def test(model, dev_dataloader):
    """Generate predictions for every batch and decode them next to the gold targets.

    Relies on the module-level ``tokenizer`` and ``max_output_length``.

    Args:
        model: Sequence-to-sequence model exposing ``eval()`` and ``generate()``.
        dev_dataloader: Iterable of batches, each a mapping with the keys
            ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        ``(gold_outputs, pred_outputs)``: two lists of decoded strings in the
        same sample order. Anything from `` [pattern]`` onwards is stripped
        from each prediction; gold strings are decoded as-is.
    """
    model.eval()
    pred_outputs = []
    gold_outputs = []
    with torch.no_grad():
        for batch in dev_dataloader:
            # Read fields by name rather than relying on dict insertion order.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            # Pass the attention mask explicitly so padded input positions are
            # ignored by the encoder during generation.
            batch_outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=2,
                min_length=0,
                max_length=max_output_length,
                pad_token_id=tokenizer.pad_token_id,
            )
            predictions = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            # The model is trained to emit "<parse> [pattern] <pattern>";
            # only the parse part is scored.
            predictions = [pred.split(" [pattern]")[0] for pred in predictions]
            pred_outputs.extend(predictions)
            gold_outputs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    return gold_outputs, pred_outputs

In [None]:
import sys


def parse(tokens):
    """Convert a whitespace-tokenized parse string into a nested dict.

    A segment without parentheses is read as a run of ``key « value »`` pairs
    and becomes a flat ``{key: value}`` dict. A segment with parentheses is
    read as ``outer_key ( ... )`` and becomes ``{outer_key: {...}}``, where
    the inner dict is assembled by recursively parsing each top-level
    ``key « value »`` pair and each nested ``key ( ... )`` group.

    Example: the tokens of
    ``Send_digital_object ( medium « Message » recipient Personal_contact ( person « my mom » ) )``
    yield ``{"Send_digital_object": {"medium": "Message",
    "recipient Personal_contact": {"person": "my mom"}}}``.

    Args:
        tokens: List of string tokens (the caller in this file passes
            ``pred.split()``).

    Returns:
        A dict as described above.

    Raises:
        AssertionError: If a segment without "(" contains ")", or if a
            segment containing "(" does not end with ")".
    """
    if "(" not in tokens:
        # Flat segment: no nesting allowed, so a stray ")" is malformed.
        assert ")" not in tokens
        ret = dict()
        # `start` marks where the current key begins, `mid` where its "«" sits.
        start = 0
        mid = 0
        for ii, tok in enumerate(tokens):
            if tok == "«":
                mid = ii
            elif tok == "»":
                # Key is everything before "«", value everything between the quotes.
                key = ' '.join(tokens[start:mid])
                val = ' '.join(tokens[mid + 1:ii])
                ret[key] = val
                start = mid = ii + 1
        return ret

    # Nested segment: everything before the first "(" is the outer key.
    st = tokens.index("(")
    outer_key = ' '.join(tokens[0:st])
    assert tokens[-1] == ")", " ".join(tokens)

    # `level` is the parenthesis depth relative to the outer group;
    # `last` is the start of the child segment currently being scanned.
    level = 0
    last = st + 1
    ret = dict()
    # Scan the tokens strictly between the outer "(" and the final ")".
    for ii in range(st + 1, len(tokens) - 1, 1):
        tok = tokens[ii]
        if tok == "»" and level == 0:
            # A top-level `key « value »` pair just ended.
            rr = parse(tokens[last:ii + 1])
            ret.update(rr)
            last = ii + 1
        elif tok == "(":
            level += 1
        elif tok == ")":
            level -= 1
            if level == 0:
                # A nested `key ( ... )` group just closed.
                rr = parse(tokens[last:ii + 1])
                ret.update(rr)
                last = ii + 1

    return {outer_key: ret}


def load_jsonl(fname):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Args:
        fname: Path of the file to read; each line holds one JSON document.

    Returns:
        A list with one decoded object per line, in file order.
    """
    with open(fname, 'r', encoding='utf-8') as fp:
        return [json.loads(raw.strip()) for raw in fp]


def per_sample_metric(gold, pred):
    """Score one prediction against its gold parse string.

    Args:
        gold: Gold parse string.
        pred: Predicted parse string.

    Returns:
        A dict with three 0/1 entries, in this order:
        ``accuracy`` (exact string match), ``intent_accuracy`` (the text
        before the first "(" matches after stripping whitespace) and
        ``parsing_accuracy`` (``parse(pred.split())`` completes without
        raising).
    """
    def get_intent(text):
        # The intent is everything before the first "(".
        return text.split('(', 1)[0].strip()

    ret = dict()
    ret['accuracy'] = int(gold == pred)
    ret['intent_accuracy'] = int(get_intent(gold) == get_intent(pred))

    try:
        parse(pred.split())
        parse_correct = 1
    except Exception:
        # Any ordinary failure means the prediction is not well-formed.
        # KeyboardInterrupt / SystemExit are deliberately not caught, so a
        # long evaluation can still be interrupted.
        parse_correct = 0
    ret['parsing_accuracy'] = parse_correct

    return ret


def compute_metrics(golds, preds):
    """Average the per-sample metrics over a set of predictions.

    Args:
        golds: List of gold parse strings.
        preds: List of predicted parse strings, aligned with ``golds``.

    Returns:
        A dict mapping each metric name produced by ``per_sample_metric`` to
        its mean over all samples. An empty dict is returned when there are
        no samples.

    Raises:
        AssertionError: If ``golds`` and ``preds`` differ in length.
    """
    assert len(golds) == len(preds), "Different number of samples in data and prediction."

    # Nothing to average: avoid indexing an empty list / dividing by zero.
    if len(golds) == 0:
        return dict()

    metrics = [per_sample_metric(gold, pred) for gold, pred in zip(golds, preds)]
    total = len(golds)
    final_metrics = dict()
    # Metric names are taken from the first sample's result.
    for key in list(metrics[0].keys()):
        final_metrics[key] = sum(met[key] for met in metrics) / total

    return final_metrics


In [None]:
def train(model, train_dataloader, dev_dataloader, num_epochs, scheduler, optimizer):
    """Fine-tune ``model`` and report dev metrics after every epoch.

    Relies on the module-level ``test`` and ``compute_metrics`` helpers.

    Args:
        model: Sequence-to-sequence model whose forward call accepts
            ``labels`` and returns an object with a ``loss`` attribute.
        train_dataloader: Sized iterable of training batches, each a mapping
            with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        dev_dataloader: Evaluation batches, passed on to ``test``.
        num_epochs: Number of passes over ``train_dataloader``.
        scheduler: Learning-rate scheduler, stepped once per batch.
        optimizer: Optimiser, stepped once per batch.

    Returns:
        None. The mean training loss and the dev metrics are printed.
    """
    # Padding id used to mask the loss; None (no `config.pad_token_id` on the
    # model) leaves the labels untouched.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    for epoch in tqdm(range(num_epochs)):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        model.train()
        total_loss = 0
        for batch in train_dataloader:
            # Read fields by name rather than relying on dict insertion order.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            if pad_token_id is not None:
                # Label positions set to -100 are excluded from the loss, so
                # padding does not contribute to it. masked_fill returns a new
                # tensor; the batch's own labels stay intact.
                labels = labels.masked_fill(labels == pad_token_id, -100)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        print(f"Training loss: {total_loss / len(train_dataloader)}")

        # Generation-based evaluation on the dev split.
        gold_outputs, pred_outputs = test(model, dev_dataloader)
        print("Validating")
        print(compute_metrics(gold_outputs, pred_outputs))

In [None]:
# Fine-tune for `num_epochs` epochs; dev metrics are printed after every epoch.
train(model, train_dataloader, dev_dataloader, num_epochs, scheduler, optimizer)

In [None]:
# Sanity check: shape of the input ids of a single (unbatched) dev item.
dev_dataset[0]["input_ids"].shape

In [None]:
# Sanity check: fetch one dev item and view its input ids with a leading
# dimension of 1.
batch = dev_dataset[0]
input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
input_ids.view(1,-1).shape

In [None]:
#gold_outputs, pred_outputs = test(model, dev_dataloader)

In [None]:
#len(pred_outputs)

In [None]:
#print(gold_outputs[0])

In [None]:
#print(pred_outputs[0])

In [None]:
#print(compute_metrics(gold_outputs, pred_outputs))

In [None]:
# Persist the model's state dict (not the whole model object) to "model1.pth"
# in the current working directory.
torch.save(model.state_dict(), "model1.pth")