In [None]:
# Quietly (-q) install Hugging Face transformers; the import cell below takes the
# tokenizer, model class and AdamW from it.
# NOTE(review): no version is pinned, so a re-run may install a different release
# than the one this notebook was written against — consider pinning.
! pip -q install transformers

In [None]:
# Install pytorch-ignite, allowing pre-release builds (--pre); the import cell below
# takes Engine, the metrics and the handlers from it.
# NOTE(review): no version is pinned, so a re-run may install a different release
# than the one this notebook was written against — consider pinning.
!pip install --pre pytorch-ignite

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
# Mount Google Drive at /content/drive/ so later cells can read the training data
# and write run outputs under it.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import torch
import gc 
import json
import random
import math
import logging
import os
import socket
from pprint import pformat
from datetime import datetime
from itertools import chain
from collections import defaultdict
from transformers import AutoModelWithLMHead, AutoTokenizer, AdamW, WEIGHTS_NAME, CONFIG_NAME

from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint, global_step_from_engine
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler

In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The tokenizer and the language-model weights come from the same pretrained checkpoint.
pretrained_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModelWithLMHead.from_pretrained(pretrained_name)
model.to(device)

In [None]:
# Special tokens that structure every training sequence. The list is read positionally
# elsewhere in this notebook: build_inputs unpacks the first four entries as
# (bos, eos, speaker1, speaker2) and the padding step uses the last entry, so keep the order.
# The casing of "[CLS]" matches the cls_token registered below.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "[CLS]", "<pad>"]
# Mapping passed to tokenizer.add_special_tokens: the standard special-token attributes
# plus the speaker markers and the extra label-like tokens.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'cls_token': '[CLS]', 'pad_token': '<pad>', 
                         'additional_special_tokens': ['<speaker1>', '<speaker2>', 'anythingElse', 'none', 'greet', 'nothingElse', 'listProjects', 'helpYou', 'listTasks', 'thanks', 'onIt'],
                        }

# Vocabulary size before the special tokens are registered. len(tokenizer) is used
# instead of len(tokenizer.encoder) because not every tokenizer class exposes `.encoder`.
orig_num_tokens = len(tokenizer)
num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
if num_added_tokens > 0:
    # Grow the embedding matrix to the tokenizer's full size so every new token id has a row.
    model.resize_token_embeddings(new_num_tokens=len(tokenizer))

In [None]:
# AdamW (imported from transformers) over all model parameters. The learning rate
# 6.25e-5 is also the starting value of the PiecewiseLinear schedule built in train().
# NOTE(review): this cell runs after the token-embedding resize cell; keep that order so
# the optimizer is built from the resized model's parameters.
optimizer = AdamW(model.parameters(), lr=6.25e-5, correct_bias=True)

In [None]:
def tokenize(obj):
    """Recursively replace every string in a nested structure with its token ids.

    Strings are split by the global `tokenizer` and mapped to ids; dicts keep their
    keys and have their values converted; any other iterable becomes a list of
    converted items.
    """
    if isinstance(obj, dict):
        return {key: tokenize(value) for key, value in obj.items()}
    if not isinstance(obj, str):
        return [tokenize(item) for item in obj]
    pieces = tokenizer.tokenize(obj)
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
def build_inputs(history, reply, tokenizer, lm_labels=True, with_eos=True):
    """Build one model input from two segments: the dialogue history and the last reply.

    Args:
        history: list of utterances, each a list of token ids, oldest first.
        reply: list of token ids for the reply that follows the history.
        tokenizer: object whose `convert_tokens_to_ids` maps the special-token strings to ids.
        lm_labels: when true, the reply tokens are kept as language-modelling targets;
            when false, every label position is masked with -100.
        with_eos: when true, the eos id is appended to the reply.

    Returns:
        dict with the keys "input_ids", "token_type_ids" and "lm_labels", each a flat
        list of the same length.
    """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-2])
    sequence = [[bos]] + history + [reply + ([eos] if with_eos else [])]
    # Prefix every utterance after the bos segment with a speaker token. The parity is
    # anchored on the sequence length, so the final segment (the reply) always gets speaker1.
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    # One type id per token, alternating by segment index (the bos segment is index 0).
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    #instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    # Start fully masked; -100 is also the label padding value used by pad_dataset.
    instance["lm_labels"] = [-100] * len(instance["input_ids"])
    if lm_labels:
        # Mask the whole history and the reply's leading speaker token; keep the rest of the reply.
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance

In [None]:
def pad_dataset(dataset, padding=0):
    """Right-pad every sequence in the dataset to the length of the longest "input_ids" entry.

    "input_ids" and "token_type_ids" are filled with `padding`; "lm_labels" is filled
    with -100. The dict is updated in place and also returned.
    """
    target_len = max(map(len, dataset["input_ids"]))
    for field in ("input_ids", "lm_labels", "token_type_ids"):
        fill = -100 if field == "lm_labels" else padding
        dataset[field] = [row + [fill] * (target_len - len(row)) for row in dataset[field]]
    return dataset

In [None]:
# Read the raw train and test dialogues from Drive.
with open('/content/drive/My Drive/Chatbot/fullTrain.json') as train_file:
    train = json.load(train_file)
with open('/content/drive/My Drive/Chatbot/fullTest.json') as test_file:
    test = json.load(test_file)

# Token-id versions of both splits, with the same nesting as the raw data.
tokenTrain = tokenize(train)
tokenTest = tokenize(test)

In [None]:
# Peek at the raw (untokenized) data: the whole history of example 4 and the last
# three history entries of example 1.
# NOTE(review): the later `def train()` cell rebinds the name `train` to the training
# function, so this cell only works before that cell has been run.
train['history'][4], train['history'][1][-3:]

In [None]:
# Per-split column store: datasets[split][input_name] is a list with one entry per example.
datasets = {"train": defaultdict(list), "test": defaultdict(list)}

# Each split is iterated over its own length, so the train and test sets may differ in size.
for split_name, tokenized in (("train", tokenTrain), ("test", tokenTest)):
    histories, replies = tokenized["history"], tokenized["reply"]
    if len(histories) != len(replies):
        raise ValueError(
            "%s split has %d histories but %d replies" % (split_name, len(histories), len(replies)))

    for history, reply in zip(histories, replies):
        #### Only the last 3 occurrences (the db has only the last 4 so...)
        instance = build_inputs(history[-3:], reply, tokenizer, True)

        for input_name, input_array in instance.items():
            datasets[split_name][input_name].append(input_array)

In [None]:
# Column order of the tensors in each split; train() unpacks batches in this same order.
MODEL_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

# Id of the last special token ("<pad>"), used to fill input_ids and token_type_ids.
pad_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])

tensor_datasets = {"train": [], "test": []}
for split, columns in datasets.items():
    # pad_dataset pads in place, so `datasets` itself ends up padded as well.
    padded = pad_dataset(columns, padding=pad_id)
    tensor_datasets[split].extend(torch.tensor(padded[name]) for name in MODEL_INPUTS)

In [None]:
# Bundle each split's tensors so that one index yields one (input_ids, lm_labels, token_type_ids) row.
train_dataset = TensorDataset(*tensor_datasets["train"])
test_dataset = TensorDataset(*tensor_datasets["test"])

# One example per batch; only the training split is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    sampler=None,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    sampler=None,
)

In [None]:
def make_logdir(model_name: str):
    """Return a run directory path under the Drive `runs` folder.

    The final path component is "<timestamp>_<hostname>_<model_name>", e.g.
    Sep22_19-45-59_gpu-7_gpt2. The directory itself is not created here.
    """
    # Code copied from ignite repo
    stamp = datetime.now().strftime('%b%d_%H-%M-%S')
    run_name = '_'.join([stamp, socket.gethostname(), model_name])
    return os.path.join('/content/drive/My Drive/Chatbot/runs', run_name)

In [None]:
# Housekeeping before training: run Python's garbage collector, then ask torch to
# empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
def train():
    """Fine-tune the notebook's global `model` with ignite and evaluate it on the test split.

    Uses names defined in earlier cells: `model`, `optimizer`, `device`, `tokenizer`,
    `train_loader`, `test_loader` and `make_logdir`. Runs `n_epochs` (2) epochs with
    gradients accumulated over 8 batches and a learning rate that decays linearly to zero.
    Evaluation runs before training starts and after every epoch. A progress bar,
    TensorBoard logs, per-epoch checkpoints, the model config and the tokenizer files are
    written under the directory returned by `make_logdir("gpt2")`. Returns nothing.
    """
    # The optimizer steps once every this many batches; each loss is scaled to match.
    gradient_accumulation_steps = 8

    def update(engine, batch):
        """Run one training batch; step the optimizer on every accumulation boundary."""
        model.train()
        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        # Same column order as MODEL_INPUTS used to build the tensor datasets.
        input_ids, lm_labels, token_type_ids = batch
        # Index the output rather than tuple-unpacking it: indexing works whether the model
        # returns a plain tuple or a dict-like output object (unpacking the latter yields keys).
        outputs = model(input_ids, token_type_ids=token_type_ids, labels=lm_labels)
        lm_loss = outputs[0]
        loss = lm_loss / gradient_accumulation_steps
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        if engine.state.iteration % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        # Note: this is the scaled loss, so the running "loss" metric is 1/8 of the raw LM loss.
        return loss.item()
    
    trainer = Engine(update)
    
    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        """Return (logits, labels), shifted by one position and flattened, for the metrics."""
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(device) for input_tensor in batch)
            input_ids, lm_labels, token_type_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            outputs = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = outputs[0]
            
            # Position t predicts token t+1: drop the last logit and the first label.
            lm_logits_flat_shifted = lm_logits[:, :-1].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[:, 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted), (lm_labels_flat_shifted)
        
    evaluator = Engine(inference)
    
    n_epochs = 2
    eval_before_start = True
    
    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(test_loader))
    if n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(test_loader))
    if eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(test_loader))
    
    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, 6.25e-5), (n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    
        
    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    # NOTE(review): the loss skips labels equal to -100 via ignore_index, but Accuracy is
    # given the same labels unfiltered — confirm that masked positions are meant to count.
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0], x[1])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))}
    #Accuracy(output_transform=lambda x: (x[0][1], x[1][1]) Accuracy(thresholded_output_transform)
    # The "average_*" names are aliases of the same metric objects.
    metrics.update({"average_nll": metrics["nll"],
                    "average_accuracy": metrics["accuracy"]})
    
    # Perplexity is exp of the average negative log-likelihood.
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)
    
    # -1 means "not distributed"; the checks below keep the main-process guard.
    local_rank = -1
    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if local_rank in [-1, 0]:
        gc.collect()
        torch.cuda.empty_cache()
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir("gpt2")
        tb_logger = TensorboardLogger(log_dir)
        
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer)), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        #torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)
    
    try:
        # Run the training
        trainer.run(train_loader, max_epochs=n_epochs)
        
        # On the main process: rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
        if local_rank in [-1, 0] and n_epochs > 0:
            os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
    finally:
        # Close the tensorboard logger even when training or the rename fails, or when n_epochs is 0.
        if local_rank in [-1, 0]:
            tb_logger.close()

In [None]:
# Module-level logger; in this notebook it is only referenced from a commented-out
# line inside train().
logger = logging.getLogger(__name__)

# Start fine-tuning with the model, optimizer and data loaders built in the cells above.
train()