# Installing Requirements

In [None]:
# !pip install transformers

# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForMaskedLM


# Hyperparameters

In [2]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
LEARNING_RATE = 1e-05
NUM_CLASSES = 6

# Processing data

## Creating a dataframe

In [3]:
df = pd.read_csv("../input/friends-dialogues/dialogues_cleaned.csv")
df = df.drop(df[df["person"]=="person"].index)

## Label Encoder

In [5]:
rachel_dlgs = df[df["person"]=="rachel"]["dialogue"].values
monica_dlgs = df[df["person"]=="monica"]["dialogue"].values
ross_dlgs = df[df["person"]=="ross"]["dialogue"].values

In [6]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', mask_toke="[MASK]", sep_token="[SEP]", pad_token="[PAD]")

def tokenize_batch(batch):
    return [tokenizer.convert_tokens_to_ids(sent) for sent in batch]

def untokenize_batch(batch):
    return [tokenizer.convert_ids_to_tokens(sent) for sent in batch]

def detokenize(sent):
    """ Roughly detokenizes (mainly undoes wordpiece) """
    new_sent = []
    for i, tok in enumerate(sent):
        if tok.startswith("##"):
            new_sent[len(new_sent) - 1] = new_sent[len(new_sent) - 1] + tok[2:]
        else:
            new_sent.append(tok)
    return new_sent

CLS = '[CLS]'
SEP = '[SEP]'
MASK = '[MASK]'
mask_id = tokenizer.convert_tokens_to_ids([MASK])[0]
sep_id = tokenizer.convert_tokens_to_ids([SEP])[0]
cls_id = tokenizer.convert_tokens_to_ids([CLS])[0]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [8]:
class Dataset(torch.utils.data.Dataset):    
    def __init__(self, x):          
        self.x = x
     
    def __getitem__(self, idx):
        return tokenizer(self.x[idx])["input_ids"]
        
    def __len__(self):
        return len(self.x)

In [9]:
rachel_ds = Dataset(rachel_dlgs.tolist())
monica_ds = Dataset(monica_dlgs.tolist())
ross_ds = Dataset(ross_dlgs.tolist())


# Model

In [10]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=-1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # recall = recall_score(y_true=labels, y_pred=pred)
    # precision = precision_score(y_true=labels, y_pred=pred)
    # f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy} 

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=VALID_BATCH_SIZE,   # batch size for evaluation
    evaluation_strategy="epoch",
    report_to=None
)


In [16]:
rachel_trainer = Trainer(
    model=model,                 # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=rachel_ds,         # training dataset
    eval_dataset=rachel_ds[:10],
#     compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [17]:
import math
import time

def generate_step(out, gen_idx, top_k=0, sample=False, return_list=True):
    """ Generate a word from from out[gen_idx]
    
    args:
        - out (torch.Tensor): tensor of logits of size batch_size x seq_len x vocab_size
        - gen_idx (int): location for which to generate for
        - top_k (int): if >0, only sample from the top k most probable words
        - sample (Bool): if True, sample from full distribution. Overridden by top_k 
    """
    # print("g", out["logits"].shape)
    logits = out["logits"][:, gen_idx]

    if top_k > 0:
        kth_vals, kth_idx = logits.topk(top_k, dim=-1)
        dist = torch.distributions.categorical.Categorical(logits=kth_vals)
        idx = kth_idx.gather(dim=1, index=dist.sample().unsqueeze(-1)).squeeze(-1)
    elif sample:
        dist = torch.distributions.categorical.Categorical(logits=logits)
        idx = dist.sample().squeeze(-1)
    else:
        idx = torch.argmax(logits, dim=-1)
    return idx.tolist() if return_list else idx
  
  
def get_init_text(seed_text, max_len, batch_size = 1, rand_init=False):
    """ Get initial sentence by padding seed_text with either masks or random words to max_len """
    batch = [seed_text + [MASK] * max_len + [SEP] for _ in range(batch_size)]
    return tokenize_batch(batch)

def printer(sent, should_detokenize=True):
    if should_detokenize:
        sent = detokenize(sent)[1:-1]
    # print(" ".join(sent))


def generate(n_samples, seed_text="[CLS]", batch_size=10, max_len=15, leed_out_len=15,
             sample=True, top_k=100, temperature=1.0, burnin=200, max_iter=500, print_every=1):
    sentences = []
    n_batches = math.ceil(n_samples / batch_size)
    start_time = time.time()
    seed_len = len(seed_text)
    batch = get_init_text(seed_text, max_len, batch_size)
    
    for ii in range(max_len):
        inp = [sent[:seed_len+ii+leed_out_len]+[sep_id] for sent in batch]
        inp = torch.tensor(batch).cuda()
#         torch.tensor(batch)
        out = model(inp)
        # print(seed_len, ii, out.keys())
        idxs = generate_step(out, gen_idx=seed_len+ii, top_k=top_k, sample=sample)
        for jj in range(batch_size):
            batch[jj][seed_len+ii] = idxs[jj]
        
    return untokenize_batch(batch)


In [18]:
history = trainer.train()

***** Running training *****
  Num examples = 1538
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,5.810031
2,No log,6.00104
3,No log,4.507802
4,No log,3.436594
5,No log,4.500379
6,No log,4.900298
7,No log,3.950582
8,No log,3.790515
9,No log,4.436484
10,No log,4.807736


***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




In [19]:
n_samples = 1
batch_size = 5
max_len = 40
top_k = 100
temperature = 1.0
leed_out_len = 5 # max_len
burnin = 250
sample = True
max_iter = 500

# Choose the prefix context
seed_text = "[CLS]".split()
rachel_res = generate(n_samples, seed_text=seed_text, batch_size=batch_size, max_len=max_len,
                      sample=sample, top_k=top_k, temperature=temperature, burnin=burnin, max_iter=max_iter)

In [26]:
print("Generated Rachel's dialogues:")
for sent in rachel_res:
    print(' '.join(sent))

Generated Rachel's dialogues:
[CLS] why don ##t get good form right well said thought said yes knew ross liked conversation ross said say right right said yes said right wrong okay calm calm listen listen listen clearly said listen well listen hear talking conversation [SEP]
[CLS] listen can ##t still getting word back rachel tell ross tell rachel tell ask knew rachel know rachel tell ross tell rachel tell ross went play phone phone rachel saw emma know tell rachel got phone rachel gave back phone [SEP]
[CLS] guys love you guys think chandler ##s scene mean god honey know chandler ##s phoebe hope phoebe even rachel phoebe feel like phoebe ross want know great phoebe think chandler ##s sex god god could help phoebe make love god [SEP]
[CLS] know guys ##t know girls know know play god great much joey ##s emma sandy wan ##s baby joey makes right time maybe joey tries use word anything saying emma make thing something emma feel emma go ##n think something [SEP]
[CLS] man joey what ##s gett

In [27]:
model.save_pretrained("/weights/rachel/")

Configuration saved in /weights/rachel/config.json
Model weights saved in /weights/rachel/pytorch_model.bin
