<a href="https://colab.research.google.com/github/be-mich/LIGN-167-Project/blob/main/Base_model_(George_Bot).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook creates a base model which will be used in comparison with the other models. 

In [None]:
#setup 
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [None]:
# all the imports

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

!pip install transformers

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    #SGD, #try other optimizer- not available in transformers
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter
# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[?25l[K     |                                | 10 kB 25.0 MB/s eta 0:00:01[K     |▏                               | 20 kB 30.8 MB/s eta 0:00:01[K     |▎                               | 30 kB 34.2 MB/s eta 0:00:01[K     |▍                               | 40 kB 37.6 MB/s eta 0:00:01[K     |▌                               | 51 kB 28.7 MB/s eta 0:00:01[K     |▌                               | 61 kB 32.1 MB/s eta 0:00:01[K     |▋                               | 71 kB 28.7 MB/s eta 0:00:01[K     |▊                               | 81 kB 29.9 MB/s eta 0:00:01[K     |▉                               | 92 kB 32.1 MB/s eta 0:00:01[K     |█                               | 102 kB 30.7 MB/s eta 0:00:01[K     |█                               | 112 kB 30.7 MB/s eta 0:00:01[K     |█                               | 122 kB 30.7 MB/s eta 0:00:01[K     |█▏                              | 133 kB 30.7

In [None]:
# Args to allow for easy conversion of the python script to a notebook
class Args():
    """Plain attribute container holding every setting the training script reads.

    The attributes mirror the command-line flags of the original script
    (paths, model identifiers, batch sizes, optimizer settings, logging and
    checkpoint cadence, and distributed/fp16 switches). Other code in this
    notebook attaches further attributes (for example ``n_gpu`` and
    ``device``) at runtime.
    """

    def __init__(self):
        # Paths and model identifiers
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        # Which phases to run
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        # Batch sizes and optimisation hyper-parameters
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        # Logging and checkpointing cadence (in optimisation steps)
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        # Runtime switches
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

    def __repr__(self):
        # Without this, logging the args object only prints its memory address.
        params = ", ".join("{}={!r}".format(key, value) for key, value in sorted(vars(self).items()))
        return "Args({})".format(params)

args = Args()

Import the Seinfeld scripts data from Google Drive (previously obtained from Kaggle).

In [None]:
! gdown https://drive.google.com/uc?id=1wSKl_642G6lsk4AmrYXpa_8XWb4hcY_B

In [None]:
seinfeld_scripts = pd.read_csv("./scripts.csv")

In [None]:
seinfeld_scripts.head()

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0
1,1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0
2,2,GEORGE,Are you through?,1.0,S01E01,1.0
3,3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
4,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0


In [None]:
#make df
type(seinfeld_scripts)

pandas.core.frame.DataFrame

In [None]:
#check for empty dialogue
seinfeld_scripts[seinfeld_scripts['Dialogue'].isnull()==True]


Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
13529,13529,** Pies - Just in case you did not know what t...,,7.0,S04E07,4.0
14938,14938,"[On the bright side, Kramer and George arrive ...",,12.0,S04E12,4.0
18353,18353,(The show begins. There are three different se...,,24.0,S04E24,4.0
18354,18354,(Jerry's doing his stand-up routine at a comed...,,24.0,S04E24,4.0
18355,18355,"We see the title 'Jerry', then, sitting at the...",,24.0,S04E24,4.0
42939,42939,Definitions of several items in the Chicken Ro...,,8.0,S08E08,8.0
45847,45847,HAL,,18.0,S08E18,8.0
49651,49651,Notice,,8.0,S09E08,9.0
50013,50013,The definition of Sari or Saree is,,8.0,S09E08,9.0
53142,53142,MONTAGE,,18.0,S09E18,9.0


In [None]:
#drop rows with missing values

seinfeld_scripts = seinfeld_scripts.dropna().reset_index(drop=True)
print(len(seinfeld_scripts))

54606


Clean up data 

In [None]:
#can get rid of some columns- we only need character and dialogue
seinfeld_scripts = seinfeld_scripts.drop(columns=['Unnamed: 0','EpisodeNo','SEID','Season'])

In [None]:
seinfeld_scripts.head()

Unnamed: 0,Character,Dialogue
0,JERRY,Do you know what this is all about? Do you kno...
1,JERRY,"(pointing at Georges shirt) See, to me, that b..."
2,GEORGE,Are you through?
3,JERRY,"You do of course try on, when you buy?"
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall..."


In [None]:
len(seinfeld_scripts)

54606

In [None]:
sum(seinfeld_scripts.Character == 'GEORGE')

9708

In [None]:
NAME = 'GEORGE'

Restructure the dataframe so that each row has one of George's lines as the response and the 7 preceding lines of dialogue as context. The context can include dialogue from other characters, but the response is always one of George's lines.

In [None]:
# Context window of size 7: every row holds one response plus the 7 lines before it.
n = 7

# For each line spoken by NAME (skipping those with fewer than n predecessors),
# collect the line itself followed by the n previous lines, newest first.
contexted = [
    [seinfeld_scripts.Dialogue[k] for k in range(i, i - 1 - n, -1)]
    for i in seinfeld_scripts[seinfeld_scripts.Character == NAME].index
    if i >= n
]

# One 'response' column, then 'context', 'context/0', ..., 'context/5'.
columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]

formatted_df = pd.DataFrame.from_records(contexted, columns=columns)
    

In [None]:
formatted_df.sample(6)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
4496,"Oh, here we go...","You know, I was just thinking. The four of us ...","Oh, yeah. You wish you had this coat.","You're pretty comfortable up there eh, Bubble ...","Hehehehehe (higher tone laugh, though still qu...","Jerry get off of me. Get off of ME! Get off, G...",Where's the heat in this car? Come on Elaine w...,Uh huh.
2590,"(Quickly making it up) Well, possibly. I wrote...","What about you, George? Have you written anyth...","Yeah, it's one of my favorites.","The parakeet in the mirror. That's a good one,...","(To Jerry, laughing about one of his bits) The...",No.,You really think so?,Better.
5863,"(to Jimmy) Wh...Where d'you get'em?""","""Yeah! They isolate the muscles. The muscle ha...","""Plyometric?""","""They're Plyometric.(sp?)""","""Oh yeah! Jimmy couldn't jump at all before he...","""Yeah,yeah yeah yeah! I've seen these.....they...","""These are Jimmy's training shoes.""","""Yeah"""
4975,It became very clear to me sitting out there t...,(pointing to napkins; to Elaine) Could I get n...,Why did it all turn out like this for me? I ha...,What is it that isn't working?,"It's not working, Jerry. It's just not working...","Oh, the beach.",I went to the beach. (Jerry and Elaine exchang...,Speaking of having it all ... Where were you?
9585,"They're all the same. You go, you sit, you eat...",Not this movie.,I've been to the movies.,To see a movie.,Nah - what for?,So what are you doing later? You want to go to...,You know what? I don't think so. I'm going to ...,It's up to you.
5611,"I guess she did. He's gone, now I'm the man.","Wow, she rearranged her whole life for you.",That's right. It's just me and her.,So she kicked him out of the apartment.,Oh. Oh!,I asked Scott to move out.,"Well, look who's here.","They don't need any water, so you don't have t..."


In [None]:
#see if i have  na

formatted_df.isnull().values.any()

False

In [None]:
#preprocess tokenizer
!pip install transformers
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")



In [None]:
#shuffle and split data

from sklearn.model_selection import train_test_split
# Seed the shuffle with the notebook-wide seed so the train/test partition
# (and therefore the reported perplexity) is the same after a kernel restart.
trn_df, test_df = train_test_split(formatted_df, test_size = 0.2, shuffle=True, random_state=args.seed)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
8260,Whatever!!. . . . Oh oh oh this is unbelievabl...,"Hey George, you want this cup holder mounted o...",That is what I want.,If that is what you want.,"Yeah, like that.",Like that?,Maybe like this.,That thick?
5566,I love it! This is fantastic! Look at this cou...,Do you like it?,Wow.,"Well, here we are. This is the place.","Yeah, well, if you're getting him anything for...","Oh, he did some dental work for me and he didn...",Why'd you get him a gift anyway?,"I don't know, I guess I'll just get invited up..."
3348,Would you like a Pepto-Bismol? I keep them in ...,What?,Excuse me for a second. (Gets up and runs to t...,What?,"Alright. The script. Now, I've read this thing...","Aw, that's a fun age. (Jerry looks at George d...","Yeah, she just turned fifteen last week.","Oh, you have a daughter?"
3032,Fine.,All right. I'll get pea pods and you can't hav...,"All right, look. I'm getting the Chow Fung. Yo...",Who says I want a noodle?,What kind of noodle do you want?,Well I don't want a big flat noodle.,It's a big flat noodle.,"What do you mean, a broad noodle?"
1981,I thought were a team.,You think I'm going to spend my life with some...,I went to the hardware store interview.,I bin thinkin about it. You got no job. You go...,Why?,It's over buddy. Done. Finished. So long. Good...,Why? Is there anything wrong?,I would prefa' it if ya' didn'.


In [None]:
# create dataset suitable for our model
def construct_conv(row, tokenizer, eos = True):
    """Flatten one conversation row into a single list of token ids.

    Every item of ``row`` is encoded with ``tokenizer.encode``; the encoded
    items are then concatenated in reverse order, each followed by
    ``tokenizer.eos_token_id``.

    ``eos`` is accepted but not consulted: the end-of-sequence id is always
    appended.
    """
    encoded_turns = [tokenizer.encode(turn) for turn in row]

    token_ids = []
    for turn_ids in reversed(encoded_turns):
        token_ids.extend(turn_ids)
        token_ids.append(tokenizer.eos_token_id)
    return token_ids

class ConversationDataset(Dataset):
    """Dataset of tokenized conversations, one flattened token sequence per row of ``df``.

    Features are built with ``construct_conv`` and pickled under
    ``args.cache_dir``. The cache file name includes a fingerprint of ``df`` so
    that different frames (e.g. the training and evaluation splits) never share
    a cache entry; previously both used the same file, so reading the cache with
    ``args.overwrite_cache = False`` could return features of the wrong split.

    Args:
        tokenizer: tokenizer used to encode each utterance.
        args: settings object; ``cache_dir``, ``model_type`` and
            ``overwrite_cache`` are read.
        df: frame whose rows are conversations (iterated with ``iterrows``).
        block_size: only used (after subtracting the tokenizer's special-token
            budget) as part of the cache file name; sequences are not truncated.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        import hashlib

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # The cache directory may not exist yet when this is the first thing to use it.
        os.makedirs(directory, exist_ok=True)

        # Content fingerprint of the frame (values and index) to keep caches of
        # different splits apart.
        row_hashes = pd.util.hash_pandas_object(df, index=True)
        fingerprint = hashlib.sha256(row_hashes.values.tobytes()).hexdigest()[:16]
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + fingerprint
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE(review): pickle.load executes arbitrary code from the file; only
            # safe because the cache is written by this notebook itself.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Number of cached conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the token ids of conversation ``item`` as a 1-D long tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build the ConversationDataset for one split.

    ``df_val`` is used when ``evaluate`` is truthy, ``df_trn`` otherwise.
    """
    if evaluate:
        split_df = df_val
    else:
        split_df = df_trn
    return ConversationDataset(tokenizer, args, split_df)


def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators with ``args.seed``.

    When ``args.n_gpu`` is positive the CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so at most ``args.save_total_limit`` remain.

    Does nothing when the limit is unset, zero or negative, or when the number
    of existing checkpoints does not exceed it.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Oldest first, so the surplus to remove sits at the front of the list.
    existing = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(existing) - limit
    if surplus <= 0:
        return

    for stale in existing[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale))
        shutil.rmtree(stale)

In [None]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")



Downloading:   0%|          | 0.00/335M [00:00<?, ?B/s]

Now for the training and evaluation functions. Training uses the AdamW optimizer, which is also used for all future models. The training function is where the DataLoader is constructed, which will be experimented with later.

In [None]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn=None, df_val=None) -> Tuple[int, float]:
    """Train the model.

    Args:
        args: settings object; batch size, optimizer, schedule, logging,
            checkpointing, fp16 and distributed attributes are read, and
            ``train_batch_size`` (and possibly ``num_train_epochs``) is written.
        train_dataset: dataset yielding 1-D token-id tensors.
        model: language model to fine-tune.
        tokenizer: tokenizer matching ``model``; saved alongside checkpoints.
        df_trn, df_val: optional frames forwarded to ``evaluate`` when
            ``args.evaluate_during_training`` is set. Without ``df_val`` the
            in-training evaluation is skipped with a warning.

    Returns:
        ``(global_step, average_loss)`` where the average is the accumulated
        training loss divided by ``global_step`` (``0.0`` if no step was taken).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pads every sequence of the batch to the longest one.
        # NOTE(review): the padded batch is used as both inputs and labels below,
        # so padded positions also contribute to the loss - presumably related to
        # the stray punctuation mentioned in the chat section; confirm before changing.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    # data loader
    # base model is RandomSampler
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    #optimizer = SGD(optimizer_grouped_parameters, lr=args.learning_rate, momentum=0.9)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<int>", so it is not a numbered checkpoint.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            # Batches padded to more than 1024 tokens are skipped entirely.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        if df_val is None:
                            # evaluate() needs the data frames; without them the call
                            # used to fail with a TypeError.
                            logger.warning(
                                "evaluate_during_training is set but no df_val was passed to train(); "
                                "skipping evaluation."
                            )
                        else:
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() is the supported accessor; get_lr() is deprecated for this use.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run in which no optimisation step happened (e.g. the
    # dataloader produced no batch), which used to raise ZeroDivisionError.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the evaluation split.

    The evaluation dataset is built with ``load_and_cache_examples(...,
    evaluate=True)``. The mean of the per-batch losses is exponentiated to get
    the perplexity, which is also written to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: settings object; ``output_dir``, ``per_gpu_eval_batch_size``,
            ``n_gpu`` and ``device`` are read, ``eval_batch_size`` is written.
        model: language model to evaluate.
        tokenizer: tokenizer used to build and pad the examples.
        df_trn, df_val: data frames forwarded to ``load_and_cache_examples``.
        prefix: sub-directory (and log label) for the results file.

    Returns:
        ``{"perplexity": tensor}``.

    Raises:
        ValueError: if no batch could be evaluated.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Pads every sequence of the batch to the longest one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # NOTE(review): drop_last=True leaves out a final partial batch, so up to
    # eval_batch_size - 1 examples are not scored. Kept as-is for comparability.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate (do not wrap a model that is already wrapped)
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        # Same length cut-off the training loop applies to over-long batches.
        if inputs.shape[1] > 1024:
            continue
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(
            "No evaluation batch was processed: the evaluation set has fewer examples than "
            "one batch ({}) or every batch was skipped.".format(args.eval_batch_size)
        )

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # A non-empty prefix names a sub-directory that may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

Main code runner

In [None]:
# Main runner

def main(df_trn, df_val):
    """Fine-tune the model on ``df_trn`` and evaluate it on ``df_val``.

    Builds a fresh ``Args``, selects the device, loads config/tokenizer/model,
    optionally trains (saving the result to ``args.output_dir``) and optionally
    evaluates one or all checkpoints.

    Args:
        df_trn: training data frame, one conversation per row.
        df_val: evaluation data frame with the same layout.

    Returns:
        Dict mapping ``"<metric>_<step>"`` to the evaluation value; empty when
        evaluation is disabled.

    Raises:
        ValueError: when resuming is requested but no checkpoint exists, or when
            the output directory is non-empty and overwriting is not allowed.
    """
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Honour args.no_cuda and fall back to the CPU when no GPU is available
    # instead of unconditionally requesting "cuda".
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    # AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    # Log the actual settings rather than the object's memory address.
    logger.info("Training/evaluation parameters %s", vars(args))

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Suffix for the result keys; empty when only one checkpoint is evaluated.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [None]:
main(trn_df, test_df)

02/25/2022 22:19:08 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7fb64696cd50>
02/25/2022 22:19:08 - INFO - __main__ -   Creating features from dataset file at cached
02/25/2022 22:19:32 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
02/25/2022 22:19:33 - INFO - __main__ -   ***** Running training *****
02/25/2022 22:19:33 - INFO - __main__ -     Num examples = 7764
02/25/2022 22:19:33 - INFO - __main__ -     Num Epochs = 3
02/25/2022 22:19:33 - INFO - __main__ -     Instantaneous batch size per GPU = 4
02/25/2022 22:19:33 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 4
02/25/2022 22:19:33 - INFO - __main__ -     Gradient Accumulation steps = 1
02/25/2022 22:19:33 - INFO - __main__ -     Total optimization steps = 5823


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1941 [00:00<?, ?it/s]



Iteration:   0%|          | 0/1941 [00:00<?, ?it/s]

02/25/2022 22:46:27 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
02/25/2022 22:46:44 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500


Iteration:   0%|          | 0/1941 [00:00<?, ?it/s]

02/25/2022 23:04:28 - INFO - __main__ -    global_step = 5823, average loss = 2.1712543867757472
02/25/2022 23:04:28 - INFO - __main__ -   Saving model checkpoint to output-small
02/25/2022 23:04:42 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
02/25/2022 23:04:44 - INFO - __main__ -   Creating features from dataset file at cached
02/25/2022 23:04:59 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
02/25/2022 23:04:59 - INFO - __main__ -   ***** Running evaluation  *****
02/25/2022 23:04:59 - INFO - __main__ -     Num examples = 1941
02/25/2022 23:04:59 - INFO - __main__ -     Batch size = 4


Evaluating:   0%|          | 0/485 [00:00<?, ?it/s]

02/25/2022 23:06:03 - INFO - __main__ -   ***** Eval results  *****
02/25/2022 23:06:03 - INFO - __main__ -     perplexity = tensor(7.1949)


{'perplexity_': tensor(7.1949)}

Now let's chat and generate a conversation.
  I added a "bad words list" where I restrict the model from outputting "!" in an attempt to prevent the random outputting of punctuation. This issue is resolved in later notebooks.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
model = AutoModelForCausalLM.from_pretrained('output-small')

# Token-id sequences the model is not allowed to generate ("bad words list").
# Computed once because it does not change between turns.
bad_words_ids = [tokenizer(bad_word).input_ids for bad_word in ["!"]]

# Running conversation (user turns and bot replies); None until the first reply.
chat_history_ids = None

# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # generate a response while limiting the total chat history to 300 tokens
    chat_history_ids = model.generate(
        bot_input_ids, max_length=300,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=0,       
        do_sample=True, 
        top_k=50, 
        top_p=0.7,
        temperature = 0.8,
        #add bad words list
        bad_words_ids = bad_words_ids
    )
    
    # pretty print last output tokens from bot (everything after the prompt)
    print("GeorgeBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

NameError: ignored