In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [None]:
!pip install transformers
import transformers



In [None]:
# all the imports

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [None]:
df=pd.read_csv("the-office_lines.csv")

In [None]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,Character,Line,Season,Episode_Number
0,0,Michael,All right Jim. Your quarterlies look very goo...,1,1
1,1,Jim,"Oh, I told you. I couldn’t close it. So…",1,1
2,2,Michael,So you’ve come to the master for guidance? Is...,1,1
3,3,Jim,"Actually, you called me in here, but yeah.",1,1
4,4,Michael,"All right. Well, let me show you how it’s don...",1,1
5,5,Michael,"[on the phone] Yes, I’d like to speak to your...",1,1
6,6,Michael,"I’ve, uh, I’ve been at Dunder Mifflin for 12 ...",1,1
7,7,Pam,Well. I don’t know.,1,1
8,8,Michael,"If you think she’s cute now, you should have ...",1,1
9,9,Pam,What?,1,1


In [None]:
sum(df["Character"]=="Stanley")

750

In [None]:
sum(df["Season"]==1)+sum(df["Season"]==2)+sum(df["Season"]==3)+sum(df["Season"]==4)
#df=df.head(21879)

21879

In [None]:
sum(df["Character"]=="Michael")

11806

In [None]:
sum(df["Character"]=="Pam")

5264

In [None]:
sum(df["Character"]=="Dwight")

7393

In [None]:
sum(df["Character"]=="Andy")

3933

In [None]:
# The character Michael has 11806 total lines in the whole script (9 seasons)
#For 4 seasons Michael has 6376 total lines

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=["Season"], errors="ignore")

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=["Episode_Number"], errors="ignore")

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
df

Unnamed: 0,Character,Line
0,Michael,All right Jim. Your quarterlies look very goo...
1,Jim,"Oh, I told you. I couldn’t close it. So…"
2,Michael,So you’ve come to the master for guidance? Is...
3,Jim,"Actually, you called me in here, but yeah."
4,Michael,"All right. Well, let me show you how it’s don..."
...,...,...
58716,Creed,It all seems so very arbitrary. I applied for...
58717,Meredith,I just feel lucky that I got a chance to shar...
58718,Phyllis,I’m happy that this was all filmed so I can r...
58719,Jim,I sold paper at this company for 12 years. My...


In [None]:
# dataframe is all cleaned and we have only characters and lines now

In [None]:
# Switch to short, lower-case column names and list what is left in the frame.
df = df.rename(columns={"Character": "name", "Line": "line"})
for column_name in df.columns:
    print(column_name)

name
line


In [None]:
CHARACTER_NAME="Michael"

In [None]:
# Context window: every response is paired with the n lines spoken just before it.
n = 7

# One record per line spoken by CHARACTER_NAME: the line itself first, followed by
# the n previous lines from most recent to oldest (n + 1 entries in total).
# Lines with fewer than n predecessors are skipped.
contexted = [
    [df.line[j] for j in range(i, i - 1 - n, -1)]
    for i in df[df.name == CHARACTER_NAME].index
    if i >= n
]

# 'response', 'context', then 'context/0' ... 'context/<n-2>' for the older lines.
columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)


In [None]:
df.sample(10)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
7820,Hey hey… what’s up Chuck?,Here’s the thing. What I wish for you is th...,"Oh, agreed, mmm, except…","And you are not going to, either.",Obviously.,I am not gonna do this.,"Alright, Michael Scott Paper Company. You wa...",Okay. [keeps eyes open]
533,"Nah, it’s not. it’s spppplllibbb",Regular coffee is fine.,It’s like a slang for Starbucks. They’re all ...,What?,Wait a second. I should have spotted another ...,I guess a cup of coffee would be great.,"So if you need anything else, something to ma...","This is my conference room. So please, uh, ma..."
10731,[in an old man mobster voice] Hey. Hey. I got...,Sunday church service… it’s been a few years....,[handing out cards] For all your paper and pr...,Does the Nard-dog want Nard-pups? Yeah. I wan...,"[looking at Cece, under his breath] Ah, man. ...",I invited everyone in the office because it’s...,"Oh, yes. [mimicking smoking] Doobie-doobie do...",Hope you brought your pipes. We’re about to s...
5026,"Oh hey, I know you… Elizabeth?",Hello Michael.,He’s happy because he’s insane. You know what...,Look how happy he is.,"No, don’t give him… just, did you hear anythi...",[feeding a squirrel] I’m giving him a peanut.,"OK everybody, listen up. Thank you for coming...","I don’t know what your deal is, but he’s mine..."
8051,"[Michael, Ryan and Pam sitting at restaurant,...",Damn it.,I had fish yesterday.,Cooper’s.,Let me take you and your whole company out fo...,I do too.,I want a truce.,"If you want a truce, I will give you a truce."
9478,What is the world’s largest ocean?,Yes.,[entering on a Segway Scooter] I see you’ve m...,Hello.,[Dwight using a robotic voice over the PA sys...,Merry Christmas Erin! Take it away boys! [dan...,Twelve drummers drumming. [marching drum band...,Oh my God!
4387,No!,Michael—,I got you… jade earrings.,"Oh, God.",Six percent? After all we’ve been through?,Right now we can offer you a 6% raise.,Pippity poppity.,"Hi, Toby. [clears throat] First— [Michael cle..."
1934,Thank you. Did you get all dark meat like I ...,Here you go.,Where is my cornbread?,What is wrong with,"Yes, you did. What is wrong with you?","No, I didn’t.",You just said “part of your duties are to” so...,What?
2452,Yeah.,Michael?,He’s kidding. Dwight was kidding and I don’t ...,This is karma because of what he did to Jenni...,Yes.,Are we out of jobs?,What the hell is going on here?,"He will never act again. Also, this branch is..."
10194,"Yeah, I just remembered that I have to go to ...","I am, but I, I gave my clubs away. I swear to...",[to Michael] You’re gonna wanna look at the d...,Yeah I think it’ll be a nice trip. We’re gonn...,I love [leans over Michael’s desk to look at ...,"Okay, weirdo.",[loudly] I need you to sign this! So bad!,"Oh, my God. Look at how cheap street level ro..."


In [None]:
# Fix random_state so the train/validation split (and therefore the reported
# perplexity) is reproducible after a kernel restart.
trn_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
2735,Yeah. Sorry you had to hear that. It was a lo...,Has she been on the phone the whole time?,"Okay, well, I really think that… [dial tone b...","[on the phone] Michael, I’m gonna get going. ...","No? okay, fine.","It’s not me, either.","No, no! Your instinct. That… Your gut reactio...",Excuse me?
9508,They call it Scranton!,in the Electric City!,So check out how we live,"Yo, Mike, our town is dope and pretty.",[extended Lazy Scranton video] Sittin’ in my ...,Shut up. Shut up.,wants to live.,Shut –
8326,"Welcome, welcome! Cafe Disco. I am Michael Sc...",Oh yeah. you’ve got a knot in your crest. Thi...,Right… mmmm… right there.,Tell me where it hurts.,"That feels good, Dwight.",It’s better than I imagined it!,"I hear it, too, Boss.",Cafe Disco is dead but I can still hear the m...
9719,Thank you! Wish me luck!,"Good luck, Pam!",Good luck!,Here we go!,"Okay, I can’t find my keys! I cannot find m...",[as he holds up the tape measure with his in...,I didn’t know we had a tape measure.,"No, no, no. Dwight, let Jim do that, please."
7850,[yelling from Parking lot] Alright then ever...,"No, no no no. You’re done, Michael.",[to everyone] Well here we are… I would just...,I always thought Michael got a bad rap. He’s...,"Come on man, let’s, let’s go.",Hank? You really think Hank is going to be ...,Okay Michael.,not if you’re starting your own paper compan...


In [None]:
# create dataset suitable for our model
def construct_conv(row, tokenizer, eos = True):
    """Turn one conversation row into a single flat list of token ids.

    Each utterance in ``row`` is encoded with ``tokenizer.encode`` and followed
    by ``tokenizer.eos_token_id``. The encoded utterances are then concatenated
    in the reverse of their order in ``row``, so the first element of ``row``
    ends up last in the returned sequence.

    Args:
        row: Iterable of utterance strings.
        tokenizer: Object exposing ``encode(text)`` (returning a list of ids)
            and ``eos_token_id``.
        eos: Accepted for interface compatibility; the body does not read it.

    Returns:
        A flat list of token ids.
    """
    encoded_turns = [tokenizer.encode(utterance) + [tokenizer.eos_token_id] for utterance in row]
    conv = []
    for turn in reversed(encoded_turns):
        conv.extend(turn)
    return conv

class ConversationDataset(Dataset):
    """Dataset of tokenised conversations built from a context/response DataFrame.

    Every row of ``df`` is flattened into one list of token ids by
    ``construct_conv``. The resulting list of examples is pickled under
    ``args.cache_dir`` and reloaded from there on later constructions unless
    ``args.overwrite_cache`` is set.

    Args:
        tokenizer: Tokenizer used to encode the utterances.
        args: Settings object; ``cache_dir``, ``model_type`` and
            ``overwrite_cache`` are read.
        df: DataFrame with one conversation per row.
        block_size: Nominal block size; after subtracting the tokenizer's
            special-token allowance it is only used in the cache file name.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        import hashlib  # local import: only needed to fingerprint the cache key

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # open(..., "wb") below does not create missing parent directories.
        os.makedirs(directory, exist_ok=True)

        # Key the cache on the frame's content. Without this the training and
        # validation sets share one file name, so with overwrite_cache=False the
        # evaluation run would silently load the cached *training* features.
        fingerprint = hashlib.sha1(
            pd.util.hash_pandas_object(df, index=True).values.tobytes()
        ).hexdigest()[:10]
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + fingerprint
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle can execute arbitrary code on load; only point
            # args.cache_dir at a directory whose contents you created yourself.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of cached conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ``ConversationDataset`` for either the training or validation frame.

    Args:
        args: Settings object forwarded to ``ConversationDataset``.
        tokenizer: Tokenizer forwarded to ``ConversationDataset``.
        df_trn: Training DataFrame, used when ``evaluate`` is false.
        df_val: Validation DataFrame, used when ``evaluate`` is true.
        evaluate: Selects which of the two frames is wrapped.

    Returns:
        The dataset for the selected frame; ``block_size`` is left at the
        ``ConversationDataset`` default.
    """
    selected_frame = df_val if evaluate else df_trn
    return ConversationDataset(tokenizer, args, selected_frame)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so at most ``args.save_total_limit`` remain.

    Does nothing when ``args.save_total_limit`` is unset, zero or negative, or
    when the number of checkpoints is already within the limit. Ordering comes
    from ``_sorted_checkpoints``; the entries at the front of that list are
    removed with ``shutil.rmtree``.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    ordered = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(ordered) - limit
    if surplus <= 0:
        return

    for checkpoint in ordered[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

In [None]:
#BUILDING A MODEL

In [None]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM
# (imported above) is its replacement for causal language models such as DialoGPT.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")



In [None]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

# Configs
# Module-level logger used by the dataset, checkpoint, training and evaluation helpers.
logger = logging.getLogger(__name__)

# Keys of transformers' LM-head model mapping and the `model_type` string of each.
# NOTE(review): neither constant appears to be referenced elsewhere in this notebook —
# confirm before removing.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [None]:
# Args to allow for easy conversion of a Python script to a notebook
class Args:
    """Plain attribute bag holding the run settings (stands in for CLI arguments).

    Further attributes are attached at run time by the code that consumes it:
    ``n_gpu`` and ``device`` in ``main``, ``train_batch_size`` in ``train`` and
    ``eval_batch_size`` in ``evaluate``.
    """

    def __init__(self):
        # Model, tokenizer and on-disk locations
        self.output_dir = "output-small"
        self.model_type = "gpt2"
        self.model_name_or_path = "microsoft/DialoGPT-small"
        self.config_name = "microsoft/DialoGPT-small"
        self.tokenizer_name = "microsoft/DialoGPT-small"
        self.cache_dir = "cached"
        self.block_size = 512
        # Which phases to run
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        # Batching and optimisation
        self.per_gpu_train_batch_size = 1
        self.per_gpu_eval_batch_size = 1
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 4
        self.max_steps = -1  # values <= 0 mean "derive the step count from num_train_epochs"
        self.warmup_steps = 0
        # Logging and checkpointing
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None  # None disables checkpoint rotation
        self.eval_all_checkpoints = False
        # Runtime behaviour
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1  # -1 selects the non-distributed code paths
        self.fp16 = False
        self.fp16_opt_level = "O1"


args = Args()

In [None]:
#TRAIN AND EVALUATE

In [None]:
def train(
    args,
    train_dataset,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    df_trn=None,
    df_val=None,
) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` with a causal LM loss.

    Args:
        args: Settings object (see ``Args``). ``train_batch_size`` is written
            to it, and ``num_train_epochs`` is overwritten when ``max_steps``
            is positive.
        train_dataset: Dataset yielding 1-D tensors of token ids.
        model: Model to train; it must already be on ``args.device``.
        tokenizer: Tokenizer matching the model; saved next to each checkpoint.
        df_trn: Optional training DataFrame, forwarded to ``evaluate`` when
            ``args.evaluate_during_training`` is set.
        df_val: Optional validation DataFrame for in-training evaluation. When
            it is omitted, in-training evaluation is skipped with a warning.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the summed
        per-step loss divided by ``global_step`` (``0.0`` if no step ran).

    Raises:
        ImportError: if ``args.fp16`` is set and apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Use the public `pad_token_id` accessor rather than the private `_pad_token`
        # attribute; without a pad token pad_sequence falls back to padding with 0.
        # NOTE(review): the padded batch is used as both inputs and labels below, so
        # with batch sizes > 1 the padding positions also contribute to the loss.
        if tokenizer.pad_token_id is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # In-training evaluation needs the validation frame. Previously evaluate() was
    # called without it and raised TypeError as soon as the option was enabled.
    evaluate_in_loop = args.local_rank == -1 and args.evaluate_during_training
    if evaluate_in_loop and df_val is None:
        logger.warning(
            "evaluate_during_training is set but no df_val was passed to train(); "
            "skipping in-training evaluation."
        )
        evaluate_in_loop = False

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>", so this is a fresh fine-tuning run.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            # Batches longer than 1024 tokens are skipped rather than truncated.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if evaluate_in_loop:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, df_trn, df_val)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() reports the rate set by the latest scheduler.step();
                    # get_lr() is only meant to be called from inside step().
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against ZeroDivisionError when no optimisation step was performed
    # (e.g. an empty dataloader).
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the validation frame.

    The validation dataset is built with ``load_and_cache_examples``, the mean
    LM loss over all evaluated batches is exponentiated, and the result is
    written to ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: Settings object; ``eval_batch_size`` is written to it.
        model: Model to evaluate; it must already be on ``args.device``.
        tokenizer: Tokenizer used to build the dataset and pad batches.
        df_trn: Training DataFrame (only forwarded to the dataset loader).
        df_val: Validation DataFrame that is evaluated.
        prefix: Optional sub-directory / label for the results file.

    Returns:
        ``{"perplexity": <0-dim torch tensor>}``.

    Raises:
        ValueError: if no batch could be evaluated.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Use the public `pad_token_id` accessor rather than the private `_pad_token`
        # attribute; without a pad token pad_sequence falls back to padding with 0.
        if tokenizer.pad_token_id is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    skipped_batches = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        # Apply the same 1024-token cut-off that train() uses, so an over-long
        # conversation is skipped instead of aborting the evaluation run.
        if inputs.shape[1] > 1024:
            skipped_batches += 1
            continue
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if skipped_batches:
        logger.warning("  Skipped %d evaluation batch(es) longer than 1024 tokens", skipped_batches)
    if nb_eval_steps == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError("No evaluation batches were processed; the evaluation dataset is empty or was skipped entirely.")

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # With a non-empty prefix the results live in a sub-directory that may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [None]:
# Main runner

def main(df_trn, df_val):
    """Run the configured fine-tuning and/or evaluation pipeline.

    A fresh ``Args`` object is created; depending on its ``do_train`` and
    ``do_eval`` flags the model is fine-tuned on ``df_trn``, saved to
    ``args.output_dir`` and evaluated on ``df_val``.

    Args:
        df_trn: Training DataFrame (one conversation per row).
        df_val: Validation DataFrame with the same layout.

    Returns:
        Dict of evaluation results keyed ``"<metric>_<global_step>"``; empty
        when evaluation is disabled.

    Raises:
        ValueError: if ``should_continue`` is set but no checkpoint exists, or
            if the output directory is non-empty and may not be overwritten.
    """
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Honour args.no_cuda and fall back to the CPU when no GPU is present
    # instead of unconditionally requesting "cuda".
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    # AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement
    # for causal language models.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    # Log the actual settings; the bare object only prints "<Args object at 0x...>".
    logger.info("Training/evaluation parameters %s", vars(args))

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only used to disambiguate keys when several checkpoints are evaluated.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [None]:
main(trn_df, val_df)

02/17/2022 00:35:29 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f01f18665d0>
02/17/2022 00:35:29 - INFO - __main__ -   Creating features from dataset file at cached
02/17/2022 00:35:45 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
02/17/2022 00:35:45 - INFO - __main__ -   ***** Running training *****
02/17/2022 00:35:45 - INFO - __main__ -     Num examples = 10620
02/17/2022 00:35:45 - INFO - __main__ -     Num Epochs = 4
02/17/2022 00:35:45 - INFO - __main__ -     Instantaneous batch size per GPU = 1
02/17/2022 00:35:45 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
02/17/2022 00:35:45 - INFO - __main__ -     Gradient Accumulation steps = 1
02/17/2022 00:35:45 - INFO - __main__ -     Total optimization steps = 42480


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10620 [00:00<?, ?it/s]

02/17/2022 00:41:50 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
02/17/2022 00:42:05 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500
02/17/2022 00:48:13 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-7000
02/17/2022 00:48:27 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-7000
02/17/2022 00:54:36 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-10500
02/17/2022 00:54:50 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-10500


Iteration:   0%|          | 0/10620 [00:00<?, ?it/s]

02/17/2022 01:00:57 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-14000
02/17/2022 01:01:10 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-14000
02/17/2022 01:07:16 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-17500
02/17/2022 01:07:29 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-17500
02/17/2022 01:13:37 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-21000
02/17/2022 01:13:51 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-21000


Iteration:   0%|          | 0/10620 [00:00<?, ?it/s]

02/17/2022 01:19:52 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-24500
02/17/2022 01:19:55 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-24500
02/17/2022 01:25:59 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-28000
02/17/2022 01:26:03 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-28000
02/17/2022 01:32:05 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-31500
02/17/2022 01:32:09 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-31500


Iteration:   0%|          | 0/10620 [00:00<?, ?it/s]

02/17/2022 01:38:11 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-35000
02/17/2022 01:38:15 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-35000
02/17/2022 01:44:21 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-38500
02/17/2022 01:44:25 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-38500
02/17/2022 01:50:31 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-42000
02/17/2022 01:50:35 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-42000
02/17/2022 01:51:25 - INFO - __main__ -    global_step = 42480, average loss = 2.2371146610739823
02/17/2022 01:51:25 - INFO - __main__ -   Saving model checkpoint to output-small
02/17/2022 01:51:35 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
02/17/2022 01:51:38 - INFO - __main__ -   Creating features from dataset file at cached
0

Evaluating:   0%|          | 0/1181 [00:00<?, ?it/s]

02/17/2022 01:52:07 - INFO - __main__ -   ***** Eval results  *****
02/17/2022 01:52:07 - INFO - __main__ -     perplexity = tensor(6.8116)


{'perplexity_': tensor(6.8116)}

In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is its
# replacement for causal language models. 'output-small' is the directory the
# fine-tuned weights were saved to by main().
model = AutoModelForCausalLM.from_pretrained('output-small')



In [None]:
# Let's chat for 15 lines
# `max_length` in generate() bounds prompt + reply together. The prompt is the whole
# accumulated chat history, so the bound must be large enough to hold it; with the
# previous value of 200 the history overran the limit after a few turns.
MAX_CHAT_TOKENS = 1000

chat_history_ids = None  # start every run of this cell with an empty history
for step in range(15):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # generate a response while limiting the total chat history to MAX_CHAT_TOKENS tokens
    chat_history_ids = model.generate(
        bot_input_ids, max_length=MAX_CHAT_TOKENS,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )

    # pretty print the newly generated tokens (everything after the prompt)
    print("Michael: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))