In [1]:
from transformers import AutoTokenizer, BartForConditionalGeneration, T5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import json
import transformers

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def read_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8: the records contain non-ASCII characters
    # (e.g. the « » delimiters), so the platform default must not be relied on.
    with open(file_path, 'r', encoding='utf-8') as fd:
        # Stream line by line instead of materialising the whole file first.
        for line in fd:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            records.append(json.loads(line))
    return records

In [4]:
# Root of the dataset mount; change this single constant to run the notebook elsewhere.
DATA_DIR = '/kaggle/input/dialogue-system'
train_set = read_data(f'{DATA_DIR}/train.jsonl')
dev_set = read_data(f'{DATA_DIR}/dev.jsonl')

In [5]:
# Display the first validation record to inspect which fields each sample carries.
dev_set[0]

{'input': 'Email my my dad with directions to the to the wedding.',
 'history': [{'user_query': 'create a list name Garage Sale',
   'response_text': 'Sorry, it looks like you already have a list with that name. Do you still want to make a new one?'},
  {'user_query': 'add organize stuff and prepare flyers ',
   'response_text': 'OK, what do you want to call it?'},
  {'user_query': 'Garage Sale',
   'response_text': 'Sure, I made a list called "Garage Sale" and added those two things.'}],
 'user_lists': [{'name': 'Good Girl Names', 'items': ['Autumn', 'Summer']},
  {'name': 'fruit salad', 'items': ['strawberries']}],
 'user_notes': [{'name': "don't forget to pick up milk", 'content': ''},
  {'name': 'pick up milk', 'content': ''}],
 'user_contacts': ['Dad ',
  'my dad',
  'Dad',
  'My Honey',
  'James Keenan',
  'Stacy Willis',
  'Dr. Jerkins',
  'Publix Bakery',
  'Michelle',
  'my office',
  'Elizabeth',
  'Laurie',
  'Vet'],
 'output': 'Send_digital_object ( medium « Email » object 

In [6]:
# Tokenizer for the "t5-small" checkpoint; shared (as a global) by collate,
# val_accuracy and generate_file.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
def process_data(data):
    """Reduce each record to its (input, output) text pair.

    Args:
        data: Iterable of mappings that each provide 'input' and 'output' keys.

    Returns:
        A list of ``(input, output)`` tuples in the same order as ``data``.
    """
    return [(record['input'], record['output']) for record in data]

# Keep only the (input, output) text pairs; the other record fields
# (history, user lists, notes, contacts) are not used for training here.
train = process_data(train_set)
dev = process_data(dev_set)

In [8]:
def collate(batch):
    """Tokenize a batch of (input, output) text pairs for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        batch: Sequence of ``(input_text, output_text)`` tuples.

    Returns:
        ``(X, Y)``: the tokenizer outputs for the inputs and for the targets,
        both padded to the longest sequence in the batch. In ``Y['input_ids']``
        every padding position is replaced by -100 so the tensor can be passed
        as ``labels`` with padding excluded from the loss.
    """
    sources, targets = zip(*batch)
    X = tokenizer(list(sources), padding=True, return_tensors='pt')
    Y = tokenizer(list(targets), padding=True, return_tensors='pt')
    # Ask the tokenizer for its pad id instead of hard-coding 0, so the label
    # masking stays correct if the tokenizer is ever swapped.
    Y['input_ids'][Y['input_ids'] == tokenizer.pad_token_id] = -100
    return X, Y

In [9]:
# Training configuration.
EPOCHS = 15        # maximum number of passes over train_loader in train_model
PATIENCE = 20      # validation checks without a new best loss before train_model stops early
BATCH_SIZE = 64
# Only the training loader shuffles; dev_loader keeps dataset order, which
# val_accuracy depends on when it indexes dev_set alongside the batches.
train_loader = DataLoader(train, batch_size = BATCH_SIZE, shuffle = True, collate_fn=collate)
dev_loader = DataLoader(dev, batch_size = BATCH_SIZE, collate_fn=collate)

In [10]:
# Pull one batch through the loader/collate pipeline.
# NOTE(review): `a` is not used in any later cell shown here — looks like a leftover smoke test.
a = next(iter(train_loader))

In [11]:
# Show the tokenizer's padding id (collate replaces padding in the labels with -100).
tokenizer.pad_token_id

0

In [12]:
# Pretrained t5-small placed on the selected device, optimised with AdamW.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
# Linear schedule with no warmup. Its length is (EPOCHS+10) epochs' worth of
# steps, while train_model runs at most EPOCHS epochs.
# NOTE(review): presumably the extra 10 epochs keep the learning rate from
# decaying all the way to zero by the end of training — confirm the intent.
lr_scheduler = transformers.get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=(EPOCHS+10)*len(train_loader))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
def train_model(train_loader, model):
    """Fine-tune `model` with periodic validation, checkpointing and early stopping.

    Besides its two parameters, the function relies on notebook globals:
    `optimizer`, `lr_scheduler`, `dev_loader`, `dev_set`, `device`, `EPOCHS`,
    `PATIENCE`, and the helpers `val_loop` and `val_accuracy`.

    Every `step_check` optimizer steps the mean training loss is printed and
    the validation loss is computed; a new best validation loss saves the
    weights to '/kaggle/working/aib222684_model'. After `PATIENCE` consecutive
    checks without a new best, training stops. At the end of each completed
    epoch, exact-match validation accuracy is computed and the best-accuracy
    weights are saved to '/kaggle/working/t5-acc.pth'.

    Args:
        train_loader: Iterable of `(X, Y)` batches as produced by `collate`.
        model: Model called as `model(input_ids, attention_mask, labels=...)`
            whose output exposes `.loss`.

    Returns:
        None.
    """
    step_check = 50                 # run a validation check every 50 optimizer steps
    step_count = 0                  # steps since the last check (not reset at epoch boundaries)
    counter = 0                     # consecutive checks without a new best validation loss
    val_best = float('inf')
    val_best_acc = -float('inf')
    running_loss = 0.0              # sum of training losses since the last check
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch+1}:')
        print('-'*20)
        for batch, (X,Y) in enumerate(train_loader):
            # Re-enable train mode on every batch: val_loop switches the model to eval mode.
            model.train()
            optimizer.zero_grad()
            
            input_ids = X['input_ids'].to(device)
            attention_mask = X['attention_mask'].to(device)
            # Despite the name, this tensor is passed as `labels` below.
            decoder_input_ids = Y['input_ids'].to(device)
            outputs = model(input_ids, attention_mask, labels=decoder_input_ids)
            loss = outputs.loss
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()     # the schedule advances once per batch
            
            step_count+=1
            if step_count==step_check:
                print(f'Train_loss: {(running_loss/step_count):>5f}')
                val_loss = val_loop(dev_loader, model)
                if val_loss < val_best:
                    print(f'Val loss decreased: {val_best:7f} ----> {val_loss:7f}')
                    val_best = val_loss
                    # Checkpoint of the lowest validation loss seen so far.
                    torch.save(model.state_dict(), '/kaggle/working/aib222684_model')
                    counter=0
                else:
                    counter+=1
                if counter == PATIENCE:
                    break
                step_count = 0
                running_loss = 0.0
        # The inner break only leaves the batch loop; repeat the test to leave the epoch loop too.
        if counter == PATIENCE:
            break
        # End-of-epoch exact-match accuracy, tracked and checkpointed separately from the loss.
        val_acc = val_accuracy(dev_loader, model, dev_set)
        print(f'Validation accuracy: {val_acc:7f}')
        if val_acc > val_best_acc:
            val_best_acc = val_acc
            torch.save(model.state_dict(), '/kaggle/working/t5-acc.pth')
           

In [14]:
def val_accuracy(dev_loader, model, dev_set):
    """Compute exact-match accuracy of beam-search generations against references.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        dev_loader: Iterable of ``(X, Y)`` batches; only ``X`` is used. It must
            yield samples in the same order as ``dev_set``, because predictions
            are matched to references by running index.
        model: Model exposing ``eval()`` and ``generate(...)``.
        dev_set: Sequence of records whose ``'output'`` field is the reference string.

    Returns:
        Fraction of ``dev_set`` records whose decoded generation equals the
        reference exactly; ``0.0`` when ``dev_set`` is empty.
    """
    model.eval()
    total = len(dev_set)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    idx = 0
    correct = 0
    with torch.no_grad():
        for X, _ in dev_loader:
            input_ids = X["input_ids"].to(device)
            # Batches are padded, so hand the mask to generate() explicitly,
            # matching how the training forward pass is called.
            attention_mask = X["attention_mask"].to(device)
            generated = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,
                min_length=0,
                max_length=150,
            )
            predictions = tokenizer.batch_decode(
                generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            for prediction in predictions:
                if prediction == dev_set[idx]['output']:
                    correct += 1
                idx += 1
    return correct / total

In [15]:
def val_loop(dataloader, model):
    """Return the mean per-batch loss of `model` over `dataloader`.

    Uses the notebook-level ``device``. Leaves the model in eval mode.

    Args:
        dataloader: Sized iterable of ``(X, Y)`` batches as produced by
            ``collate``; ``Y['input_ids']`` is passed to the model as ``labels``.
        model: Model called as ``model(input_ids, attention_mask, labels=...)``
            whose output exposes ``.loss``.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` has no batches.
    """
    model.eval()
    total_loss = 0.0
    # Evaluation needs no gradients; without no_grad every forward pass would
    # build an autograd graph for nothing.
    with torch.no_grad():
        for X, Y in dataloader:
            input_ids = X['input_ids'].to(device)
            attention_mask = X['attention_mask'].to(device)
            labels = Y['input_ids'].to(device)
            output = model(input_ids, attention_mask, labels=labels)
            total_loss += output.loss.item()
    return total_loss / len(dataloader)

In [16]:
# Start fine-tuning; progress (training loss, validation loss, accuracy) is printed below.
train_model(train_loader, model)

Epoch 1:
--------------------
Train_loss: 3.171859
Val loss decreased:     inf ----> 1.397334
Train_loss: 1.415407
Val loss decreased: 1.397334 ----> 0.636142
Train_loss: 0.922962
Val loss decreased: 0.636142 ----> 0.431073
Train_loss: 0.693979
Val loss decreased: 0.431073 ----> 0.324726
Train_loss: 0.553870
Val loss decreased: 0.324726 ----> 0.263594
Train_loss: 0.477244
Val loss decreased: 0.263594 ----> 0.222820
Train_loss: 0.407483
Val loss decreased: 0.222820 ----> 0.196596
Train_loss: 0.356708
Val loss decreased: 0.196596 ----> 0.166489
Train_loss: 0.324005
Val loss decreased: 0.166489 ----> 0.146739
Validation accuracy: 0.570643
Epoch 2:
--------------------
Train_loss: 0.288944
Val loss decreased: 0.146739 ----> 0.134109
Train_loss: 0.260854
Val loss decreased: 0.134109 ----> 0.125554
Train_loss: 0.249552
Val loss decreased: 0.125554 ----> 0.116227
Train_loss: 0.234483
Val loss decreased: 0.116227 ----> 0.108967
Train_loss: 0.221432
Val loss decreased: 0.108967 ----> 0.098709
T

In [16]:
# Take the first validation batch and rebuild a fresh t5-small whose weights are
# replaced by the checkpoint train_model saves on each new best validation loss.
X, Y = next(iter(dev_loader))
model_test = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads in a CPU-only session.
model_test.load_state_dict(torch.load('/kaggle/working/aib222684_model', map_location=device))

<All keys matched successfully>

In [17]:
# Generate for the sample batch with beam search. The batch is padded, so the
# attention mask is passed explicitly, matching the training forward pass.
input_ids = X["input_ids"].to(device)
attention_mask = X["attention_mask"].to(device)
result = model_test.generate(input_ids, attention_mask=attention_mask, num_beams=5, min_length=0, max_length=100)

In [20]:
# Decode the generated ids back to text, dropping special tokens and leaving spacing untouched.
tokenizer.batch_decode(result, skip_special_tokens=True, clean_up_tokenization_spaces=False)

['Send_digital_object ( medium « Email » object Electronic_message ( content « directions to the wedding » ) recipient Personal_contact ( person « my dad » ) )',
 'Send_digital_object ( format « video » recipient Personal_contact ( person « John » ) )',
 'Send_digital_object ( medium « text message » recipient Personal_contact ( person « Lauren » ) )',
 'Send_digital_object ( cc Personal_contact ( person « John » ) medium « Email » recipient Personal_contact ( person « Troy » ) )',
 'Send_digital_object ( medium « message » provider « Snapchat » )',
 'Send_digital_object ( medium « message » recipient Organization_contact ( organization « Fundido » ) )',
 'Send_digital_object ( attachment « ETA » medium « text » recipient Contactable_entity ( contact_id Phone_number ( id_form « 384-384-3948 » ) ) )',
 'Send_digital_object ( medium « text message » recipient Personal_contact ( person « Zander » ) )',
 'Send_digital_object ( medium « email » recipient Personal_contact ( person « Meliza »

In [21]:
def generate_file(fileName, dataloader, model):
    """Write one decoded beam-search prediction per line to `fileName`.

    Uses the notebook-level ``tokenizer`` and ``device``. An existing file at
    ``fileName`` is overwritten.

    Args:
        fileName: Path of the text file to create.
        dataloader: Iterable of ``(X, Y)`` batches; only ``X`` is used.
        model: Model exposing ``eval()`` and ``generate(...)``.

    Returns:
        None.
    """
    model.eval()
    # The `with` block closes the file even if generation raises part-way.
    # UTF-8 is explicit because predictions contain non-ASCII characters (« »).
    with torch.no_grad(), open(fileName, 'w', encoding='utf-8') as fd:
        for X, _ in dataloader:
            input_ids = X["input_ids"].to(device)
            # Batches are padded, so pass the mask explicitly, matching the
            # training forward pass.
            attention_mask = X["attention_mask"].to(device)
            # NOTE(review): max_length is 100 here but 150 in val_accuracy — confirm
            # whether the two are meant to differ.
            generated = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,
                min_length=0,
                max_length=100,
            )
            predictions = tokenizer.batch_decode(
                generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            fd.writelines(prediction + '\n' for prediction in predictions)

In [22]:
# Write the reloaded checkpoint's predictions for the validation set, one per line.
generate_file('pred_file.txt', dev_loader, model_test)

In [25]:
# Show the tokenizer's padding id again (same inspection as in an earlier cell).
tokenizer.pad_token_id

0

In [None]:
# Exact-match accuracy on the validation set.
# NOTE(review): this scores `model` (the object passed to train_model), not
# `model_test` (the reloaded checkpoint) — confirm which one is intended.
val_accuracy(dev_loader, model, dev_set)