# Building a Discord Bot that Talks like Chandler Bing from Friends

## Installing and Importing Necessary Libraries

In [None]:
# Mount Google Drive inside the Colab VM; a later cell changes the working
# directory to a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip -q install transformers

[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
[K     |████████████████████████████████| 6.8 MB 39.2 MB/s 
[K     |████████████████████████████████| 895 kB 47.9 MB/s 
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
[?25h

In [None]:
# Work from the root of the mounted Drive so that relative paths used below
# (kaggle.json, the dataset archive, 'cached', 'output-small') resolve there.
import os
os.chdir("/content/drive/My Drive")

In [None]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

## Getting Data from Kaggle

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Download only friends_quotes.csv from the dataset; per the output below it is
# stored as friends_quotes.csv.zip and skipped when a newer local copy exists.
!kaggle datasets download ryanstonebraker/friends-transcript -f "friends_quotes.csv"

friends_quotes.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# NOTE(review): the file is a zip archive, so `head` prints raw compressed bytes
# rather than CSV rows; the readable preview is `data.head()` after loading.
!head friends_quotes.csv.zip

PK-    ��PH�r���������  friends_quotes.csv  �"]     ~     ��ْ�X�%�~��#�H^����&)3��J��*���@r�D�0�b�H�O������`�`�=�R��؃�˗�u�ʋ��$e>w�+��S��Y%U�������_y1��K�y��|̳do�O��k��Ue�&�3���X��u�
7(�,�VI���<�\���x�����2_�hY��h���6�V����\���g�v�n�7X��0�������K^W�LT�x�^�Ǳ̫*���w�!m����d���n���U��Sy�S���Q�,WUđ���A7���h��2���.��U��L���ɥ�R����ܤ?|j�����ԝ��8����.���*N��"Ϻ_�2��h0���e��y6�i��J.Z`���X�"���h����9y7��){�(�W2o����wE��},s�~�b7�绨pi�s,���Q�݂73���|~�V~�T�<ڸ|���Z�:O�LF-�:��e��n�~�����]^g�2J�{g����n<|9�h_pWe�E��u4�g�Q����+g�<O�|���j�����N�r�(_�Y�p�+�x�s+3���N~��2�*N�]���n.����P6���_Ɂq�j}��rY۹\x-�xu�Cʩ�d$i�����f�������f�gn<�Kr����mVVN�/�ۓ�͓��UL�΅on:����l����t?��r����E�%Ŋ�QY��N�k�9��(�Ӳ�%��S���>�?x4q�����ai���#kL�M�C�&������n�k��I��7�o.9^�ߓ{g�,k�ٷ��\��x|K�[醝�ȃ�EV�'X��.Z��CCy�d����3����4)��j���Y̸�3��|�A��s�g^���z�΋*Y$n~}HdI��:����cPie���N��+�5���D{�+]����� �72Y����Sf�U	4���OԿU!�p.U	5�]&{|�rs�bL2d|,��,��T^��ʵ̡H�ʕU�����i�HXپ\�'�v24

In [None]:
# Read the transcript table straight from the downloaded .zip archive.
data = pd.read_csv('friends_quotes.csv.zip')

In [None]:
# Preview the raw columns before trimming.
data.head()

Unnamed: 0,author,episode_number,episode_title,quote,quote_order,season
0,Monica,1.0,Monica Gets A Roommate,There's nothing to tell! He's just some guy I ...,0.0,1.0
1,Joey,1.0,Monica Gets A Roommate,"C'mon, you're going out with the guy! There's ...",1.0,1.0
2,Chandler,1.0,Monica Gets A Roommate,"All right Joey, be nice. So does he have a hum...",2.0,1.0
3,Phoebe,1.0,Monica Gets A Roommate,"Wait, does he eat chalk?",3.0,1.0
4,Phoebe,1.0,Monica Gets A Roommate,"Just, 'cause, I don't want her to go through w...",4.0,1.0


In [None]:
# Keep only the speaker and the spoken line; episode/season metadata is not used below.
unused_columns = ['episode_number', 'episode_title', 'quote_order', 'season']
data = data.drop(columns=unused_columns)

In [None]:
# Confirm that only `author` and `quote` remain.
data.head()

Unnamed: 0,author,quote
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."


In [None]:
# Rename to the column names the context-building cell expects (`Character`, `Line`).
data = data.rename(columns={'author': 'Character', 'quote': 'Line'})

In [None]:
# Confirm the renamed columns.
data.head()

Unnamed: 0,Character,Line
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."


In [None]:
# Number of lines spoken by Chandler in the transcript.
(data.Character == 'Chandler').sum()

7488

In [None]:
# Character whose lines become the training responses.
CHARACTER_NAME = 'Chandler'

In [None]:
# Number of preceding lines kept as conversational context for each response.
n = 7

# For every line spoken by the chosen character (skipping the first `n` rows,
# which do not have a full history), collect the line itself followed by the
# `n` lines before it, most recent first.
character_rows = data[data.Character == CHARACTER_NAME].index
contexted = [
    [data.Line[j] for j in range(i, i - 1 - n, -1)]
    for i in character_rows
    if not i < n
]

# One column for the response, then `n` context columns (most recent first).
columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [None]:
# Inspect the first few response/context rows.
df.head(6)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,"Alright, so I'm back in high school, I'm stand...",Sounds like a date to me.,"Okay, everybody relax. This is not even a date...","Just, 'cause, I don't want her to go through w...","Wait, does he eat chalk?","All right Joey, be nice. So does he have a hum...","C'mon, you're going out with the guy! There's ...",There's nothing to tell! He's just some guy I ...
1,"Then I look down, and I realize there's a phon...","Oh, yeah. Had that dream.","Alright, so I'm back in high school, I'm stand...",Sounds like a date to me.,"Okay, everybody relax. This is not even a date...","Just, 'cause, I don't want her to go through w...","Wait, does he eat chalk?","All right Joey, be nice. So does he have a hum..."
2,That's right.,Instead of...?,"Then I look down, and I realize there's a phon...","Oh, yeah. Had that dream.","Alright, so I'm back in high school, I'm stand...",Sounds like a date to me.,"Okay, everybody relax. This is not even a date...","Just, 'cause, I don't want her to go through w..."
3,"All of a sudden, the phone starts to ring. Now...",No.,Never had that dream.,That's right.,Instead of...?,"Then I look down, and I realize there's a phon...","Oh, yeah. Had that dream.","Alright, so I'm back in high school, I'm stand..."
4,"Finally, I figure I'd better answer it, and it...",And they weren't looking at you before?!,"All of a sudden, the phone starts to ring. Now...",No.,Never had that dream.,That's right.,Instead of...?,"Then I look down, and I realize there's a phon..."
5,Cookie?,I just feel like someone reached down my throa...,"Are you okay, sweetie?","This guy says hello, I wanna kill myself.",(mortified) Hi.,"Finally, I figure I'd better answer it, and it...",And they weren't looking at you before?!,"All of a sudden, the phone starts to ring. Now..."


In [None]:
# Hold out 10% of the examples for validation. The split is seeded (same value
# as Args.seed) so that re-running the notebook yields the same partition.
trn_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
1371,"Look, Im telling you this is just like my par...",Why would you start again after chewing all th...,"Well, actually, yesterday I was smoking again....",Youre smoking again?!,(jumps back and points at the cigarette) Oh my...,Chandler!!,Chandler what are you doing?!,Very bad.
4931,"Hey, yknow what we can do? Yknow, now that w...","Here, let me make the milk, Im up anyway.",It got interesting! Damn you Oprah!,With a wok? (Chandlers holding a wok.) I thou...,"Shhhhhhhhh!!! (Monica enters) Im sorry, I tho...","(To Joey) I know, I wasnt finished. (Joey mot...",Ross. I was thinking we could just go down the...,"(standing at the edge of the roof) Yeah, I gue..."
496,"Hey, you guys all know what you want to do.",Career counselor?,"Well, I have an appointment to see Dr. Robert ...","Where are you going, Mr. Suity-Man?","No. But don't worry, I'm sure they're still th...",Can you see my nipples through this shirt?,OK. (pause) Oh Monica! Guess what!,"Thanks, Phoebe. But I just don't really see my..."
5469,Were on a semi-first name basis.,Oh then you know each other.,Yeah!,Hey-hey! Bing? Was that Bob from six you were ...,(laughs) Toby!,"If I see him, Ill ask.",Hey! Hows my pal Toby doing today?,Hey Bob.
2022,Were you so late because you were burring this...,"Okay, it was just me and her at the back of th...","(just Phoebe) No, answer his.","Oh, which museum?","Oh, yeah! How did you meet her?","Oh, on a date. Yeah, I met this girl on the tr...",Where were you?,15? (Joey nods again) Your personal best! (Ros...


In [None]:
# creating dataset suitable for our model
def construct_conv(row, tokenizer, eos = True):
    """Turn one dataframe row into a flat list of token ids.

    Each utterance in ``row`` is encoded with ``tokenizer.encode`` and followed
    by ``tokenizer.eos_token_id``. The utterances are then emitted in reverse
    order of ``row``, so for rows shaped (response, newest context, ...,
    oldest context) the result reads oldest context first and response last.

    ``eos`` is accepted for compatibility but is not used.
    """
    encoded_turns = []
    for utterance in row:
        encoded_turns.append(tokenizer.encode(utterance) + [tokenizer.eos_token_id])

    token_ids = []
    for turn in reversed(encoded_turns):
        token_ids.extend(turn)
    return token_ids

class ConversationDataset(Dataset):
    """Dataset of tokenised conversations built from a response/context dataframe.

    Every row of ``df`` is converted with ``construct_conv`` into one flat list
    of token ids. The resulting list of examples is pickled under
    ``args.cache_dir`` and reloaded from there unless ``args.overwrite_cache``
    is set.

    Args:
        tokenizer: tokenizer used to encode the utterances.
        args: configuration object; ``cache_dir``, ``model_type`` and
            ``overwrite_cache`` are read.
        df: dataframe whose rows are passed to ``construct_conv``.
        block_size: only used to name the cache file; examples are not
            truncated to this length.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # The row count is part of the file name so that the training and the
        # validation split do not share (and silently reuse) one cache file
        # when args.overwrite_cache is False.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + str(len(df))
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE(review): pickle executes arbitrary code on load; only reuse
            # cache files that were written by this notebook.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            # Make sure the cache directory exists before writing into it.
            os.makedirs(directory, exist_ok=True)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of cached conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
# Caching of tokenised data, seeding, and checkpoint housekeeping

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ConversationDataset from the validation frame when ``evaluate``
    is true, otherwise from the training frame."""
    if evaluate:
        source_df = df_val
    else:
        source_df = df_trn
    return ConversationDataset(tokenizer, args, source_df)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # checking if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

## Building Model

In [None]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

# DialoGPT is a causal language model, so it is loaded with AutoModelForCausalLM;
# AutoModelWithLMHead is deprecated in transformers. The import above still
# brings in AutoModelWithLMHead because later cells reference that name.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/335M [00:00<?, ?B/s]

In [None]:
# configs
# Module-level logger used by the dataset, training and evaluation code.
logger = logging.getLogger(__name__)

# Config classes registered for language-model heads, and their model-type strings.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [None]:
class Args():
    """Plain container for the training/evaluation settings used by this notebook.

    The attributes stand in for command-line arguments; ``main`` creates a
    fresh instance and adds runtime values (such as ``device`` and ``n_gpu``)
    to it.
    """

    def __init__(self):
        # Model, tokenizer and output/cache locations.
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        # Which phases to run.
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        # Batch sizes and optimisation hyper-parameters.
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 4
        self.max_steps = -1  # values <= 0 mean "train for num_train_epochs"
        self.warmup_steps = 0
        # Logging and checkpointing cadence (in optimisation steps).
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None  # None disables checkpoint rotation
        self.eval_all_checkpoints = False
        # Runtime switches.
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1  # -1 means no distributed training
        self.fp16 = False
        self.fp16_opt_level = 'O1'

    def __repr__(self):
        """Show every setting, so logging the object is informative."""
        fields = ", ".join(
            "{}={!r}".format(name, value) for name, value in sorted(vars(self).items())
        )
        return "Args({})".format(fields)

args = Args()

## Training and Evaluating

In [None]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn=None, df_val=None) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` as a causal language model.

    Args:
        args: configuration object. ``args.train_batch_size`` is set here, and
            ``args.num_train_epochs`` is overwritten when ``args.max_steps > 0``.
            ``args.n_gpu`` and ``args.device`` must already be set.
        train_dataset: dataset yielding 1-D tensors of token ids.
        model: model to train; it is expected to be on ``args.device`` already.
        tokenizer: tokenizer matching ``model``; saved next to each checkpoint.
        df_trn, df_val: optional dataframes forwarded to ``evaluate`` when
            ``args.evaluate_during_training`` is set. Without ``df_val`` the
            in-training evaluation is skipped with a warning.

    Returns:
        ``(global_step, average_loss)``; the average is 0.0 when no
        optimisation step was taken.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Inputs are padded with the tokenizer's pad id (0 when it has none).
        # Labels are padded with -100, the value ignored by the LM loss, so that
        # padded positions do not train the model to emit the padding token.
        pad_id = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id
        inputs = pad_sequence(examples, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(examples, batch_first=True, padding_value=-100)
        return inputs, labels

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # taking care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # preparing optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # checking if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # loading in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # checking if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # setting global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>", so this is a fresh fine-tuning run.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # adding for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # skipping past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = batch
            # Batches longer than 1024 tokens are skipped.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # updating learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        if df_val is None:
                            logger.warning(
                                "evaluate_during_training is set but no validation dataframe was "
                                "passed to train(); skipping evaluation."
                            )
                        else:
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() returns the rate applied by the most recent scheduler step.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # saving model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # taking care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Avoid a ZeroDivisionError when no optimisation step was taken.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

# Evaluation of a model on the validation dataframe

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the validation dataframe.

    Args:
        args: configuration object; ``output_dir``, ``per_gpu_eval_batch_size``,
            ``n_gpu`` and ``device`` are read and ``eval_batch_size`` is set.
        model: model to evaluate, expected to be on ``args.device`` already.
        tokenizer: tokenizer used to build the evaluation dataset.
        df_trn, df_val: dataframes passed to ``load_and_cache_examples``; with
            ``evaluate=True`` the dataset is built from ``df_val``.
        prefix: sub-directory of ``args.output_dir`` that receives
            ``eval_results.txt``; also shown in the log messages.

    Returns:
        ``{"perplexity": tensor}``, the exponential of the mean batch loss.

    Raises:
        ValueError: if the evaluation dataset yields no batches.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Inputs are padded with the tokenizer's pad id (0 when it has none).
        # Labels are padded with -100, the value ignored by the LM loss, so that
        # padding does not count towards the reported perplexity.
        pad_id = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id
        inputs = pad_sequence(examples, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(examples, batch_first=True, padding_value=-100)
        return inputs, labels

    eval_sampler = SequentialSampler(eval_dataset)
    # drop_last=False: every validation example contributes to the metric.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last=False
    )

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # evaluation
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = batch
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError("Evaluation dataset is empty; cannot compute perplexity.")

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    # The results file lives in a per-prefix sub-directory; create it if needed.
    os.makedirs(os.path.join(eval_output_dir, prefix), exist_ok=True)
    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [None]:
# Main runner

def main(df_trn, df_val):
    """Fine-tune DialoGPT on ``df_trn`` and evaluate it on ``df_val``.

    A fresh ``Args`` instance drives the run. Depending on ``do_train`` and
    ``do_eval`` the function trains the model, saves model, tokenizer and
    arguments to ``args.output_dir``, and evaluates the saved checkpoint(s).

    Returns:
        Dict mapping ``"<metric>_<global_step>"`` to the metric value for every
        evaluated checkpoint (the step part is empty when only one checkpoint
        is evaluated).

    Raises:
        ValueError: if ``should_continue`` is set but no checkpoint exists, or
            if the output directory is non-empty and may not be overwritten.
    """
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Use the GPU only when one is available and not disabled via args.no_cuda.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    # AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    # vars(args) logs the actual settings instead of the default object repr.
    logger.info("Training/evaluation parameters %s", vars(args))

    # training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # creating output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # saving a trained model, configuration and tokenizer using `save_pretrained()`.
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # taking care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # loading a trained model and vocabulary that we have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # reducing logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only used to keep result keys apart when several checkpoints are evaluated.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

## Running the Main Function

In [None]:
# Train on the training split, then evaluate on the held-out split; the
# returned dict of metrics is displayed as the cell output.
main(trn_df, val_df)

02/12/2022 06:08:43 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7fc6c33ec8d0>
02/12/2022 06:08:43 - INFO - __main__ -   Creating features from dataset file at cached
02/12/2022 06:09:06 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
02/12/2022 06:09:07 - INFO - __main__ -   ***** Running training *****
02/12/2022 06:09:07 - INFO - __main__ -     Num examples = 6737
02/12/2022 06:09:07 - INFO - __main__ -     Num Epochs = 4
02/12/2022 06:09:07 - INFO - __main__ -     Instantaneous batch size per GPU = 4
02/12/2022 06:09:07 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 4
02/12/2022 06:09:07 - INFO - __main__ -     Gradient Accumulation steps = 1
02/12/2022 06:09:07 - INFO - __main__ -     Total optimization steps = 6736


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1684 [00:00<?, ?it/s]



Iteration:   0%|          | 0/1684 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1684 [00:00<?, ?it/s]

02/12/2022 06:41:12 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
02/12/2022 06:41:29 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500


Iteration:   0%|          | 0/1684 [00:00<?, ?it/s]

02/12/2022 07:10:54 - INFO - __main__ -    global_step = 6736, average loss = 2.03546781513194
02/12/2022 07:10:54 - INFO - __main__ -   Saving model checkpoint to output-small
02/12/2022 07:11:04 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
02/12/2022 07:11:07 - INFO - __main__ -   Creating features from dataset file at cached
02/12/2022 07:11:08 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
02/12/2022 07:11:08 - INFO - __main__ -   ***** Running evaluation  *****
02/12/2022 07:11:08 - INFO - __main__ -     Num examples = 749
02/12/2022 07:11:08 - INFO - __main__ -     Batch size = 4


Evaluating:   0%|          | 0/187 [00:00<?, ?it/s]

02/12/2022 07:11:40 - INFO - __main__ -   ***** Eval results  *****
02/12/2022 07:11:40 - INFO - __main__ -     perplexity = tensor(6.3723)


{'perplexity_': tensor(6.3723)}

## Loading the Trained Model

In [None]:
# Load the base tokenizer and the fine-tuned weights saved by main() in 'output-small'.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('output-small')



In [None]:
# Let's chat for 4 lines
for step in range(4):
    # encoding the new user input, adding the eos_token and returning a PyTorch tensor
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')


    # appending the new user input tokens to the chat history
    # (on the first turn there is no history yet, so the input is used alone)
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generating a response by sampling; max_length is set to 200 here
    # NOTE(review): an earlier comment said the history is limited to 1000 tokens - confirm the intended limit
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )


    # printing only the newly generated tokens, i.e. everything after the prompt
    print("Chandler: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Hello!
Chandler: Hey!
>> User:How's Joey?
Chandler: Oh, he's doing great. He's just... he's... he... he seems so happy.
>> User:What's a moo point?
Chandler: You know, moo-point.
>> User:How are things with Monica?
Chandler: !!!!!!(to Chandler) That was you?!


## Pushing Model to Hugging Face

In [None]:
# Switch the working directory to /content/ for the Hub upload steps that follow.
os.chdir('/content/')

In [None]:
# Install Git LFS, which the Hub uses to store large files.
# -y answers apt's confirmation prompt so the cell cannot block waiting for input.
!sudo apt-get install -y git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following packages were automatically installed and are no longer required:
  cuda-command-line-tools-10-0 cuda-command-line-tools-10-1
  cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1
  cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1
  cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0
  cuda-cupti-dev-11-0 cuda-documentation-10-0 cuda-documentation-10-1
  cuda-documentation-11-0 cuda-documentation-11-1 cuda-gdb-10-0 cuda-gdb-10-1
  cuda-gdb-11-0 cuda-gpu-library-advisor-10-0 cuda-gpu-library-advisor-10-1
  cuda-libraries-10-0 cuda-libraries-10-1 cuda-libraries-11-0
  cuda-memcheck-10-0 cuda-memcheck-10-1 cuda-memcheck-11-0 cuda-nsight-10-0
  cuda-nsight-10-1 cuda-nsight-11-0 cuda-nsight-11-1 cuda-nsight-compute-10-0
  cuda-nsight-compute-10-1 cuda-nsight-compute-11-0 cuda-nsight-compute-11-1
  cuda-nsight-systems-10-1 cuda-nsight-systems-

In [None]:
# Install the Hugging Face Hub client (provides `huggingface-cli`).
# %pip installs into the running kernel's environment; -q keeps the output short.
# TODO(review): pin a version (huggingface_hub==X.Y.Z) for reproducibility.
%pip install -q huggingface_hub



In [None]:
# Authenticate with the Hugging Face Hub; paste an access token at the prompt.
# On success the CLI reports where it saved the token (see the output below).
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
        (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on you

In [None]:
# Create the model repository on the Hub under the logged-in account
# (the CLI asks for confirmation before creating it).
!huggingface-cli repo create DialoGPT-small-Chandler

[90mgit version 2.17.1[0m
Error: unknown flag: --version

[90mSorry, no usage text found for "git-lfs"[0m

You are about to create [1manweasha/DialoGPT-small-Chandler[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/anweasha/DialoGPT-small-Chandler[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/anweasha/DialoGPT-small-Chandler



In [None]:
# Confirm that login stored a token WITHOUT printing the secret into the notebook.
# NOTE(review): the token this cell used to print was exposed in saved output and
# should be revoked at https://huggingface.co/settings/token.
!test -s /root/.huggingface/token && echo "Hugging Face token found"

Hugging Face token found

In [None]:
# Clone the (still empty) Hub repo without embedding any secret in the URL.
# `huggingface-cli login` saved the credentials in git's credential store but
# warned that this helper is not the configured one; enabling it lets the later
# `git push` authenticate on its own.
# NOTE(review): an access token used to be hardcoded in this command; treat it as
# compromised and revoke it.
!git config --global credential.helper store
!git clone https://huggingface.co/anweasha/DialoGPT-small-Chandler

Cloning into 'DialoGPT-small-Chandler'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [None]:
# Copy the trained model files from Drive into the cloned repo.
# cp -r (instead of mv) keeps the original in Drive, so the model-loading cell
# still works on a re-run and nothing is lost if the push fails.
!cp -r /content/drive/My\ Drive/output-small/* DialoGPT-small-Chandler/

In [None]:
# Enter the cloned repo. The absolute path keeps this cell re-runnable: a
# relative chdir fails once the working directory is already the repo.
os.chdir('/content/DialoGPT-small-Chandler')

In [None]:
# Initialise Git LFS (sets up its git hooks, per the output below).
!git lfs install

Updated git hooks.
Git LFS initialized.


In [None]:
# List the current directory to check which files are now in the repo folder.
!ls

checkpoint-3500   merges.txt		   tokenizer_config.json  vocab.json
config.json	  pytorch_model.bin	   tokenizer.json
eval_results.txt  special_tokens_map.json  training_args.bin


In [None]:
# Print the working directory to confirm we are inside the cloned repo.
!pwd

/content/DialoGPT-small-Chandler


In [None]:
# Show which files git sees as new or changed before staging anything.
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mcheckpoint-3500/[m
	[31mconfig.json[m
	[31meval_results.txt[m
	[31mmerges.txt[m
	[31mpytorch_model.bin[m
	[31mspecial_tokens_map.json[m
	[31mtokenizer.json[m
	[31mtokenizer_config.json[m
	[31mtraining_args.bin[m
	[31mvocab.json[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
# Stage everything in the working tree.
# NOTE(review): this also stages checkpoint-3500/ (including optimizer and
# scheduler state) — confirm the intermediate checkpoint is meant to be published.
!git add .

In [None]:
# Set the author identity (email, then name) that git records on the commit below.
!git config --global user.email "anweashasaha2000@gmail.com"

!git config --global user.name "anweasha"

In [None]:
# Record the staged model files as a single commit.
!git commit -m "Initial commit"

[main 2da0637] Initial commit
 19 files changed, 300705 insertions(+)
 create mode 100644 checkpoint-3500/config.json
 create mode 100644 checkpoint-3500/merges.txt
 create mode 100644 checkpoint-3500/optimizer.pt
 create mode 100644 checkpoint-3500/pytorch_model.bin
 create mode 100644 checkpoint-3500/scheduler.pt
 create mode 100644 checkpoint-3500/special_tokens_map.json
 create mode 100644 checkpoint-3500/tokenizer.json
 create mode 100644 checkpoint-3500/tokenizer_config.json
 create mode 100644 checkpoint-3500/training_args.bin
 create mode 100644 checkpoint-3500/vocab.json
 create mode 100644 config.json
 create mode 100644 eval_results.txt
 create mode 100644 merges.txt
 create mode 100644 pytorch_model.bin
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 training_args.bin
 create mode 100644 vocab.json


In [None]:
# Upload the commit to the Hub repo; large files are transferred through Git LFS.
!git push

Git LFS: (5 of 5 files) 1.88 GB / 1.88 GB
Counting objects: 15, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (14/14), done.
Writing objects: 100% (15/15), 1.09 MiB | 2.22 MiB/s, done.
Total 15 (delta 2), reused 0 (delta 0)
To https://huggingface.co/anweasha/DialoGPT-small-Chandler
   6c4d90b..2da0637  main -> main
