In [1]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from contractions import contractions_dict 
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer



In [2]:
# 1. Model Definition & set padding
model_name = 'microsoft/dialogpt-small'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [3]:
movie_lines_path = '/Users/aanderson/Downloads/corpus/movie_lines.txt'
movie_conversations_path = '/Users/aanderson/Downloads/corpus/movie_conversations.txt'

In [4]:
lines = open(movie_lines_path, encoding='utf-8', errors='ignore').read().split('\n')
convers = open(movie_conversations_path, encoding='utf-8', errors='ignore').read().split('\n')

**Text Preprocessing**

In [5]:
class TextPreprocessor:
    def __init__(self, max_len=13):
        self.max_len = max_len
        self._compile_regex()

    def _compile_regex(self):
        # Contractions
        self.compiled_patterns = {re.compile(pattern): repl for pattern, repl in contractions_dict.items()}
        # Retain important punctuation
        self.clean_punctuations = re.compile(r'[^a-zA-Z0-9?.!,¿]')

    def clean_text(self, txt):
        txt = txt.lower()
        for pattern, repl in self.compiled_patterns.items():
            txt = pattern.sub(repl, txt)
        txt = self.clean_punctuations.sub(' ', txt)
        return txt.strip()

    def preprocess_data(self, convers, lines):
        exchange = [conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",", "").split() for conver in convers]
        diag = {line.split(' +++$+++ ')[0]: line.split(' +++$+++ ')[-1] for line in lines}
        questions, answers = self._extract_questions_answers(exchange, diag)
        return questions, answers

    def _extract_questions_answers(self, exchange, diag):
        questions, answers = [], []
        for conver in exchange:
            for i in range(len(conver) - 1):
                questions.append(diag.get(conver[i], ''))
                answers.append(diag.get(conver[i + 1], ''))
        sorted_ques = [q for q in questions if len(q.split()) < self.max_len]
        sorted_ans = [a for q, a in zip(questions, answers) if len(q.split()) < self.max_len]
        return sorted_ques, sorted_ans

In [6]:
max_len = 13
max_seq_len = 40

preprocessor = TextPreprocessor(max_len=max_len)
sorted_ques, sorted_ans = preprocessor.preprocess_data(convers, lines)
clean_ques = [preprocessor.clean_text(q) for q in sorted_ques]
clean_ans = [preprocessor.clean_text(a) for a in sorted_ans]

# Trimming answers and lists
clean_ans = [' '.join(ans.split()[:11]) for ans in clean_ans]
clean_ans = clean_ans[:1000]
clean_ques = clean_ques[:1000]

In [7]:
# Before cleaning
print(sorted_ques[:5])
print(sorted_ans[:5])

["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "You're asking me out.  That's so cute. What's your name again?", 'Cameron.', 'Why?']
['Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", 'Forget it.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.']


In [8]:
# After cleaning
print(clean_ques[:5])
print(clean_ans[:5])

['well, i thought we would start with pronunciation, if that is okay with you.', 'not the hacking and gagging and spitting part.  please.', 'you are asking me out.  that is so cute. what is your name again?', 'cameron.', 'why?']
['not the hacking and gagging and spitting part. please.', 'okay... then how bout we try out some french cuisine. saturday?', 'forget it.', 'the thing is, cameron i m at the mercy of a', 'unsolved mystery. she used to be really popular when she started']


In [9]:
# Dataset and dataloader
class DialogDataset(Dataset):
    def __init__(self, clean_ques, clean_ans, tokenizer, max_length):
        self.clean_ques = clean_ques
        self.clean_ans = clean_ans
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.clean_ques)
    
    @staticmethod
    def collate_fn(batch):
        inputs, targets = zip(*batch)
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        targets = pad_sequence(targets, batch_first=True, padding_value=tokenizer.pad_token_id)
        return inputs, targets

    def __getitem__(self, idx):
        question = self.clean_ques[idx]
        answer = self.clean_ans[idx]

        question_tokenized = self.tokenizer(question, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        answer_tokenized = self.tokenizer(answer, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return question_tokenized['input_ids'].squeeze(), answer_tokenized['input_ids'].squeeze()

In [10]:
# Dataloader, split into train and val
BATCH_SIZE = 50

# Split
train_ques, val_ques, train_ans, val_ans = train_test_split(clean_ques, clean_ans, test_size=0.2, random_state=42)

# Dataset
train_dataset = DialogDataset(train_ques, train_ans, tokenizer, max_seq_len)
val_dataset = DialogDataset(val_ques, val_ans, tokenizer, max_seq_len)

# Dataloader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=DialogDataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=DialogDataset.collate_fn)

In [11]:
EPOCHS = 20
LR = 5e-5

# Early stopping
best_val_loss = float('inf')
no_improve = 0
patience = 3  # for example

# Model
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
loss_fn = nn.CrossEntropyLoss()
model.to('cpu')



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    
    #Training
    for batch_num, (batch_inputs, batch_targets) in enumerate(train_dataloader):
        optimizer.zero_grad()
        
        batch_inputs, batch_targets = batch_inputs.to('cpu'), batch_targets.to('cpu')
        
        outputs = model(batch_inputs)
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), batch_targets.view(-1))
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item() 
        
        if batch_num != 0 and batch_num % 30 == 0:  
            print(f"Epoch {epoch}, Batch {batch_num}, Loss: {loss.item()}")

    avg_train_loss = total_loss / len(train_dataloader)  # compute the average loss for the epoch
    
    scheduler.step(avg_train_loss)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in val_dataloader:
            batch_inputs, batch_targets = batch_inputs.to('cpu'), batch_targets.to('cpu')
            outputs = model(batch_inputs)
            loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), batch_targets.view(-1))
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve = 0
    else:
        no_improve += 1
    if no_improve == patience:
        print("Early stopping!")
        break

Epoch 0, Training Loss: 2.5974228978157043, Validation Loss: 1.9105373322963715
Epoch 1, Training Loss: 1.8802273869514465, Validation Loss: 1.668590784072876
Epoch 2, Training Loss: 1.7250807136297226, Validation Loss: 1.599843680858612
Epoch 3, Training Loss: 1.6536614745855331, Validation Loss: 1.5745927095413208
Epoch 4, Training Loss: 1.6098138317465782, Validation Loss: 1.5422749817371368
Epoch 5, Training Loss: 1.5674100741744041, Validation Loss: 1.5329861044883728


In [None]:
def get_response(input_text):
    model.eval()
    
    input_tensor = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=max_seq_len)['input_ids'].to('cpu')
    output_ids = model.generate(input_tensor, max_length=20, num_beams=5, temperature=0.7)
    
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Test
print(get_response("Hello"))