In [1]:
import numpy as np
import os
import signal
import json
import torch

from parlai.core.agents import create_agent, create_agent_from_shared, get_agent_module
from parlai.core.worlds import create_task
from parlai.core.params import ParlaiParser
from parlai.core.utils import Timer, round_sigfigs, warn_once
from parlai.core.logs import TensorboardLogger
from parlai.scripts.build_dict import build_dict, setup_args as setup_dict_args
from parlai.core.distributed_utils import (
    sync_object, is_primary_worker, all_gather_list, is_distributed, num_workers
)
from parlai.scripts.build_pytorch_data import get_pyt_dict_file
from parlai.scripts.train_model import *

from transformers import Seq2SeqTrainer,Seq2SeqTrainingArguments, EncoderDecoderModel, BertTokenizerFast

In [2]:
task = "personachat"
model = "seq2seq"
batch_size = 1
lr = 1e-2
hidden_size = 128
args = f"""-m parlai.scripts.train_model -m {model} 
           -t {task} 
           -mf '/tmp/model' 
           -bs {batch_size} 
           -lr {lr} 
           -hs {hidden_size}"""

In [3]:
# Create dictionary of arguments
opt = setup_args().parse_args(args.split())

[ Main ParlAI Arguments: ] 
[  batchsize: 1 ]
[  datapath: C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\data ]
[  datatype: train ]
[  download_path: C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\downloads ]
[  hide_labels: False ]
[  image_mode: raw ]
[  multitask_weights: [1] ]
[  numthreads: 1 ]
[  show_advanced_args: False ]
[  task: personachat ]
[ ParlAI Model Arguments: ] 
[  dict_class: parlai.core.dict:DictionaryAgent ]
[  init_model: None ]
[  model: seq2seq ]
[  model_file: '/tmp/model' ]
[ Training Loop Arguments: ] 
[  dict_build_first: True ]
[  display_examples: False ]
[  eval_batchsize: None ]
[  evaltask: None ]
[  load_from_checkpoint: False ]
[  max_train_time: -1 ]
[  num_epochs: -1 ]
[  save_after_valid: False ]
[  save_every_n_secs: -1 ]
[  validation_cutoff: 1.0 ]
[  validation_every_n_epochs: -1 ]
[  validation_every_n_secs: -1 ]
[  validation_max_exs: -1 ]
[  validation_metric: accuracy ]
[  validation_metric_mode: None ]
[

In [4]:
# Agent is essentially an object for our 'Seq2Seq' model
agent = create_agent(opt)
# World is a BatchWorld object that stores the data and task at hand
# It contains a list (worlds) of DialogPartnerWorld objects
world = create_task(opt, agent)

[ no model with opt yet at: '/tmp/model'(.opt) ]
Dictionary: loading dictionary from '/tmp/model'.dict
[ num words =  18745 ]
[ Using CUDA ]




[ Loading existing model params from '/tmp/model' ]
[creating task(s): personachat]
[loading fbdialog data:C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\data\Persona-Chat\personachat\train_self_original.txt]


  warn_once("LR scheduler is different from saved. Starting fresh!")


Below I show how we run matches with "worlds" and "agents". I have made print statements inside parlai.core.worlds and modules to show what's happening internally. 

It seems agent 0 is simply data extraction and agent 1 is the model (receives observation from agent 0). The data is ordered such that batch 1 is persona + first dialogue line and batch two just provides the next observed lines (but as you can see from the input lines being printed for agent 1 the previous dialogue are still in the sequence). I'm pretty sure everything is saved in agent.batch_observations.

You can look in data/train_both_original.txt for the dialogue. 

# Preprocess Bert Data

In [6]:
train_text = []
train_labels = []
b = 0

# This is how we loop through batches
with world:
    while world.get_total_epochs() < 1:
        b += 1
        # do one example / batch of examples
        world.parley()
        
        train_text.append(world.agents[1].observation['text'])
        train_labels.append(world.agents[1].observation['labels_choice'])
        
        #print("Input: ", world.agents[1].observation['text'])
        #print("Label: ", world.agents[1].observation['labels_choice'])
        
        if (b % 1000) == 0:
            print(f"Index = {b}")

Index = 1000
Index = 2000
Index = 3000
Index = 4000
Index = 5000
Index = 6000
Index = 7000
Index = 8000
Index = 9000
Index = 10000
Index = 11000
Index = 12000
Index = 13000
Index = 14000
Index = 15000
Index = 16000
Index = 17000
Index = 18000
Index = 19000
Index = 20000
Index = 21000
Index = 22000
Index = 23000
Index = 24000
Index = 25000
Index = 26000
Index = 27000
Index = 28000
Index = 29000
Index = 30000
Index = 31000
Index = 32000
Index = 33000
Index = 34000
Index = 35000
Index = 36000
Index = 37000
Index = 38000
Index = 39000
Index = 40000
Index = 41000
Index = 42000
Index = 43000
Index = 44000
Index = 45000
Index = 46000
Index = 47000
Index = 48000
Index = 49000
Index = 50000
Index = 51000
Index = 52000
Index = 53000
Index = 54000
Index = 55000
Index = 56000
Index = 57000
Index = 58000
Index = 59000
Index = 60000
Index = 61000
Index = 62000
Index = 63000
Index = 64000
Index = 65000


In [9]:
# Save to avoid reprocessing in the future
import csv 
with open('data//Persona-Chat//bert-personachat//train_text.csv','w') as f:
    wr = csv.writer(f)
    wr.writerows(train_text)

with open('data//Persona-Chat//bert-personachat//train_labels.csv','w') as f:
    wr = csv.writer(f)
    wr.writerows(train_labels)

# Dataset Class

In [7]:
class PersonachatDataset(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer):
        self.text = tokenizer(text)
        self.labels = tokenizer(labels)

    def __getitem__(self, idx):
        if len(self.text[idx]) > 512:
            return  __getitem__(self, idx+1)
        item = {key: torch.tensor(val[idx]) for key, val in self.text.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [31]:
import datasets
from datasets import Dataset
dataset = Dataset.from_dict({"text" : train_text,
                             "response" : train_labels})
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Training Model

In [40]:
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("distilbert-base-uncased", "bert-base-uncased")
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
bert2bert.config.max_length = opt['attention_length']
bert2bert.config.min_length = 1
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer

In [33]:
encoder_max_length = 512
decoder_max_length = opt['attention_length']

def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["response"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
    # We have to make sure that the PAD token is ignored
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [34]:
dataset = dataset.select(range(32))
# batch_size = 16
batch_size=4

dataset = dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["text" ,"response"]
)

HBox(children=(FloatProgress(value=0.0, max=16400.0), HTML(value='')))




In [None]:
dataset

In [35]:
# Training and eval batch size
batch_size = 1
# Eval on validation data every 
eval_steps = 4

training_args = Seq2SeqTrainingArguments(
    output_dir="bert-models",
    predict_with_generate=True,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #fp16=True, 
    logging_steps=2,
    save_steps=10
    #eval_steps=eval_steps
)

In [38]:
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=dataset,
    #eval_dataset=val_data,
)

In [39]:
trainer.train()

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 6.07 GiB already allocated; 1.81 MiB free; 6.38 GiB reserved in total by PyTorch)