In [90]:
import jsonlines, argparse, random, os, sys
from collections import defaultdict
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
import numpy as np
import transformers
from tqdm import tqdm
from transformers import TrainingArguments
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, set_seed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [91]:
args = {
    'train_path': 'data/train.jsonl',
    'dev_path': 'data/dev.jsonl',
    'model': 'gpt2',
    'model_path': 'cs1190444_cs1190673_model',
    'num_epochs': 10,
    'batch_size': 32,
    'init_lr': 1e-5,
    'use_random_split': False
}

In [92]:
class TODDataset(Dataset):
    def __init__(self, data_paths, tokenizer, aux_input=False):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_mask = []
        for data_path in data_paths:
            print('Loading data from {}'.format(data_path))
            with jsonlines.open(data_path) as reader:
                for obj in reader:
                    input_raw = self.get_input(obj) if not aux_input else self.get_aux_input(obj)
                    output_raw = self.get_output(obj)
                    text = "<bos> " + input_raw.strip() + " <sep> " + output_raw.strip() + " <eos>"
                    encoded_input = tokenizer(text, return_tensors='pt', return_attention_mask=True)
                    self.input_ids.append(encoded_input['input_ids'][0])
                    self.attention_mask.append(encoded_input['attention_mask'][0])
        print("Number of examples: {}".format(len(self.input_ids))) 
    def __len__(self):

        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx]}
    
    def get_input(self, obj):
        input_str = obj['input']
        return input_str

    def get_aux_input(self, obj):
        input_str = obj['input']
        history_str = ' '.join([x['user_query'] + ' ' + x['response_text'] for x in obj['history']])
        user_lists_str = ' '.join([l['name'] + ' ' + ' '.join(l['items']) for l in obj['user_lists']])
        user_notes_str = ' '.join([n['name'] + ' ' + n['content'] for n in obj['user_notes']])
        user_contacts_str = ' '.join(obj['user_contacts'])
        return input_str + ' ' + history_str + ' ' + user_lists_str + ' ' + user_notes_str + ' ' + user_contacts_str

    def get_output(self, obj):
        output_str = obj['output']
        return output_str
    
def collate_fn(batch):
    # Group samples by length
    batch = sorted(batch, key=lambda x: x['input_ids'].size(0))
    
    # Get input_ids and attention_mask tensors
    input_ids = torch.nn.utils.rnn.pad_sequence([x['input_ids'] for x in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence([x['attention_mask'] for x in batch], batch_first=True, padding_value=0)
    
    return {"input_ids": input_ids, "attention_mask": attention_mask}

In [93]:
def train_epoch(epoch, model, optimizer, tokenizer, train_data):
    num_batches = len(train_data)
    running_loss = 0
    with tqdm(total=num_batches, desc="Training", unit="batch", leave=False) as pbar:
        for _, batch in enumerate(train_data):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask = attention_mask, labels=input_ids)
            loss = outputs[0]
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            pbar.set_description(f"Epoch: {epoch}")
            pbar.update(1)
        print(f"Training loss: {running_loss / num_batches:.4f}")
        torch.save(model.state_dict(), args['model_path'])

def train(model, tokenizer, optimizer, train_data, dev_data, args):
    model.train()
    print("Training started ...")
    for epoch in range(args['num_epochs']):
        train_epoch(epoch, model, optimizer, tokenizer, train_data)

## Main function

In [94]:

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(args)

if (args['model'].startswith('gpt2')):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '<pad>', 
                                    'bos_token': '<bos>',
                                    'eos_token': '<eos>'})
    tokenizer.add_tokens(['<sep>'])
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)
if (not args['use_random_split']):
    train_dataset = TODDataset([args['train_path']], tokenizer)
    dev_dataset = TODDataset([args['dev_path']], tokenizer)

train_data = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, collate_fn=collate_fn)
print("Loaded Train Dataset with {} batches".format(len(train_dataset)))
dev_data = DataLoader(dev_dataset, batch_size=args['batch_size'], shuffle=True, collate_fn=collate_fn)
print("Loaded Dev Dataset with {} batches".format(len(dev_dataset)))

{'train_path': 'data/train.jsonl', 'dev_path': 'data/dev.jsonl', 'model': 'gpt2', 'model_path': 'cs1190444_cs1190673_model', 'num_epochs': 10, 'batch_size': 32, 'init_lr': 1e-05, 'use_random_split': False}
Loading data from data/train.jsonl
Number of examples: 30993
Loading data from data/dev.jsonl
Number of examples: 9272
Loaded Train Dataset with 30993 batches
Loaded Dev Dataset with 9272 batches


In [95]:
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=args['init_lr'])
train(model, tokenizer, optimizer, train_data, dev_data, args)

Training started ...


Epoch: 0: 100%|█████████▉| 968/969 [02:28<00:00,  6.35batch/s]

Training loss: 2.3956


Epoch: 1: 100%|█████████▉| 968/969 [02:28<00:00,  7.00batch/s]

Training loss: 1.0220


Epoch: 2: 100%|█████████▉| 968/969 [02:29<00:00,  6.54batch/s]

Training loss: 0.8577


Epoch: 3: 100%|██████████| 969/969 [02:29<00:00,  6.31batch/s]

Training loss: 0.7783


Epoch: 4: 100%|█████████▉| 968/969 [02:29<00:00,  6.50batch/s]

Training loss: 0.7247


Epoch: 5: 100%|█████████▉| 968/969 [02:29<00:00,  5.91batch/s]

Training loss: 0.6868


Epoch: 6: 100%|█████████▉| 968/969 [02:28<00:00,  6.59batch/s]

Training loss: 0.6569


Epoch: 7: 100%|█████████▉| 968/969 [02:29<00:00,  6.71batch/s]

Training loss: 0.6284


Epoch: 8: 100%|█████████▉| 968/969 [02:28<00:00,  6.76batch/s]

Training loss: 0.6118


Epoch: 9: 100%|█████████▉| 968/969 [02:28<00:00,  6.69batch/s]

Training loss: 0.5950


                                                              

In [102]:
test = "Change my Facebook status to say to say time for a road trip!"
test = "<bos> " + test.strip() + " <sep> "
encoded_input = tokenizer(test, return_tensors='pt', return_attention_mask=True)
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)
outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=40, do_sample=True,top_p=0.95, num_return_sequences=5)

outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for output in outputs:
    print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Change my Facebook status to say to say time for a road trip! <sep> Post_message ( message « time for a road trip » provider « Facebook » )
Change my Facebook status to say to say time for a road trip! <sep> Post_message ( provider « Facebook » )
Change my Facebook status to say to say time for a road trip! <sep> Post_message ( message « time for a road trip » provider « Facebook » )
Change my Facebook status to say to say time for a road trip! <sep> Post_message ( message « to say time for a road trip! » provider « Facebook » )
Change my Facebook status to say to say time for a road trip! <sep> Post_message ( message « time for a road trip » provider « Facebook » )
