In [1]:
import os
import re

import torch
import torch.nn as nn
import torch.optim as optim
from contractions import contractions_dict
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import GPT2LMHeadModel, GPT2Tokenizer



In [2]:
# 1. Load the pretrained checkpoint and configure padding
model_name = 'microsoft/dialogpt-small'

# Tokenizer first: reuse its end-of-sequence token as the padding token so
# that fixed-length (padded) encodings can be produced later on.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Language-model head on top of the same checkpoint.
model = GPT2LMHeadModel.from_pretrained(model_name)

In [3]:
# Directory holding the corpus files (movie_lines.txt / movie_conversations.txt).
# Set the MOVIE_CORPUS_DIR environment variable to point the notebook at a
# different location; the default is the original author's local download folder.
CORPUS_DIR = os.environ.get('MOVIE_CORPUS_DIR', '/Users/aanderson/Downloads/corpus')
MOVIE_LINES_PATH = os.path.join(CORPUS_DIR, 'movie_lines.txt')
MOVIE_CONVERSATIONS_PATH = os.path.join(CORPUS_DIR, 'movie_conversations.txt')

# Matches utterance IDs: the letter "L" followed by one or more digits.
ID_REGEX = re.compile(r'L[0-9]+')

In [4]:
# Build a lookup table of every utterance, keyed by its line ID.
all_lines = {}
with open(MOVIE_LINES_PATH, encoding='iso-8859-1', errors='ignore') as lines_file:
    for raw_line in lines_file:
        fields = raw_line.split(' +++$+++ ')
        all_lines[fields[0]] = {
            "lineID": fields[0],
            "characterID": fields[1],
            "movieID": fields[2],
            "character": fields[3],
            "text": fields[4].strip(),  # drop surrounding whitespace / newline
        }

# Turn each conversation into (utterance, next utterance) pairs.
pairs = []
with open(MOVIE_CONVERSATIONS_PATH, encoding='iso-8859-1', errors='ignore') as conversations_file:
    for raw_line in conversations_file:
        # The fourth field holds the list of line IDs that make up the conversation.
        utterance_ids = ID_REGEX.findall(raw_line.split(' +++$+++ ')[3])
        # Walk over consecutive IDs: each utterance is paired with its successor.
        for current_id, next_id in zip(utterance_ids, utterance_ids[1:]):
            question = all_lines[current_id]['text']
            answer = all_lines[next_id]['text']
            # Skip pairs where either side is empty after stripping.
            if not (question and answer):
                continue
            pairs.append([question, answer])

In [5]:
# Sanity check: report how many pairs were built and show the first few.
print(f"Total pairs: {len(pairs)}")
for preview_index in range(min(5, len(pairs))):
    print(pairs[preview_index])

Total pairs: 221282
['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]
["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']
['Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
["You're asking me out.  That's so cute. What's your name again?", 'Forget it.']
["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.']


In [6]:
# Keep only the first 1% of the pairs (in file order, not a random sample).
# NOTE: this rebinds `pairs`, so running this cell a second time without
# re-running the loading cell shrinks the list by another factor of 100.
testing_length = int(0.01 * len(pairs))
pairs = pairs[0:testing_length]
print(f"Total pairs after reduction: {len(pairs)}")

Total pairs after reduction: 2212


In [7]:
def preprocess_text(text, contractions=None):
    """Expand contractions in ``text``.

    Each contraction is replaced only when it appears as a whole token (not
    embedded in a longer word, so "it's" inside "Brit's" is left alone), and
    longer contractions take precedence over shorter ones they contain
    ("can't've" wins over "can't"). Replacement is done in a single pass, so
    the result does not depend on the iteration order of the mapping.
    Matching is case-sensitive.

    Args:
        text: The string to process.
        contractions: Optional mapping of contraction -> expansion. Defaults
            to the module-level ``contractions_dict``.

    Returns:
        ``text`` with every known contraction expanded.
    """
    mapping = contractions_dict if contractions is None else contractions
    # Empty keys would match everywhere; ignore them.
    keys = [key for key in mapping if key]
    if not text or not keys:
        return text

    # Longest first so the regex alternation prefers the most specific key.
    keys.sort(key=len, reverse=True)
    # Lookarounds instead of \b because some keys may start or end with an
    # apostrophe, where \b would not behave as a token boundary.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(key) for key in keys) + r")(?!\w)"
    )
    return pattern.sub(lambda match: mapping[match.group(0)], text)

# Apply the text preprocessing to both sides of every (question, answer) pair.
pairs_cleaned = [
    [preprocess_text(side) for side in (q, a)]
    for q, a in pairs
]

In [8]:
class ChatDataset(Dataset):
    """Dataset of (question, answer) text pairs encoded for causal-LM fine-tuning.

    Each item is encoded as ``question<eos>answer<eos>`` so that every turn is
    terminated by the end-of-sequence token, then padded/truncated to
    ``max_length`` tokens.

    Args:
        tokenizer: Tokenizer exposing ``eos_token`` and callable on a string;
            it must return ``input_ids`` and ``attention_mask`` tensors.
        pairs: Sequence of ``[question, answer]`` string pairs.
        max_length: Fixed token length every item is padded/truncated to.
    """

    def __init__(self, tokenizer, pairs, max_length=1024):
        self.tokenizer = tokenizer
        self.pairs = pairs
        self.max_length = max_length

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, targets)`` for pair ``idx``.

        ``targets`` equals ``input_ids`` on real tokens and is ``-100`` on
        padded positions, which is the default ``ignore_index`` of
        ``nn.CrossEntropyLoss``; padding therefore does not contribute to the
        loss. The attention mask (not the token id) is used to find padding
        because the pad token may be the same token as EOS.
        """
        pair = self.pairs[idx]
        eos = self.tokenizer.eos_token
        # Separate and terminate the two turns with EOS.
        text = f"{pair[0]}{eos}{pair[1]}{eos}"
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        targets = input_ids.clone()
        targets[attention_mask == 0] = -100

        return input_ids, attention_mask, targets

In [9]:
# Hold out 10% of the cleaned pairs for validation. The fixed random_state makes
# the split identical across kernel restarts, so validation losses from
# different runs are comparable.
train_pairs, val_pairs = train_test_split(pairs_cleaned, test_size=0.1, random_state=42)

# Training batches are reshuffled every epoch.
train_dataset = ChatDataset(tokenizer, train_pairs)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=40)

# Validation batches are served in a fixed order.
val_dataset = ChatDataset(tokenizer, val_pairs)
val_dataloader = DataLoader(val_dataset, batch_size=40)

In [10]:
# Training hyperparameters
EPOCHS, LR = 20, 5e-5

# Early-stopping bookkeeping: stop once the validation loss has failed to
# improve for `patience` epochs.
patience = 3
no_improve = 0
best_val_loss = float('inf')

# Loss, optimizer and learning-rate schedule
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LR)
# Cut the learning rate by 10x when the monitored value stops decreasing.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=1,
    verbose=True,
)
model.to('cpu')



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
# Fine-tune the model, validating after every epoch and stopping early when
# the validation loss stops improving.

# Keep batches on whatever device the model's parameters live on.
device = next(model.parameters()).device


def _next_token_loss(logits, targets, masks):
    """Cross-entropy for next-token prediction.

    The logits at position t are scored against the token at position t + 1
    (a causal LM predicts the *next* token), and positions whose attention
    mask is 0 (padding) are set to -100 so that ``loss_fn`` -- an
    ``nn.CrossEntropyLoss`` with its default ``ignore_index`` -- skips them.
    """
    labels = targets.masked_fill(masks == 0, -100)
    shifted_logits = logits[:, :-1, :].contiguous()
    shifted_labels = labels[:, 1:].contiguous()
    return loss_fn(
        shifted_logits.view(-1, shifted_logits.size(-1)),
        shifted_labels.view(-1),
    )


for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    # Training
    for batch_num, (batch_inputs, batch_masks, batch_targets) in enumerate(train_dataloader):
        optimizer.zero_grad()

        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_targets = batch_targets.to(device)

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        loss = _next_token_loss(outputs.logits, batch_targets, batch_masks)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_num != 0 and batch_num % 30 == 0:
            print(f"Epoch {epoch}, Batch {batch_num}, Loss: {loss.item()}")

    # Average loss
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch_inputs, batch_masks, batch_targets in val_dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_masks = batch_masks.to(device)
            batch_targets = batch_targets.to(device)

            outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
            loss = _next_token_loss(outputs.logits, batch_targets, batch_masks)

            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Drive the plateau scheduler with the validation loss: that is the
    # quantity early stopping monitors, and the training loss keeps falling
    # even while the model overfits.
    scheduler.step(avg_val_loss)

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve = 0
    else:
        no_improve += 1
    # >= rather than == so the check still fires if the counter was already
    # at or past `patience` when this cell is re-run.
    if no_improve >= patience:
        print("Early stopping!")
        break

In [None]:
def get_response(input_text, model, tokenizer, num_beams=5, max_length=50):
    """Generate the model's reply to ``input_text`` using beam search.

    The prompt is terminated with the tokenizer's EOS token before generation
    (DialoGPT-style models use EOS to mark the end of a dialogue turn; without
    it the model tends to continue the user's sentence instead of answering),
    and only the newly generated tokens are decoded, so the returned string
    does not repeat the prompt.

    Args:
        input_text: The user's utterance.
        model: A causal language model exposing ``eval()``, ``parameters()``
            and ``generate()``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        num_beams: Beam width for beam search.
        max_length: Maximum number of prompt tokens kept (longer prompts are
            truncated) and, separately, the maximum number of tokens generated
            for the reply, so a long prompt can no longer use up the whole
            generation budget.

    Returns:
        The decoded reply with special tokens removed.
    """
    model.eval()
    device = next(model.parameters()).device

    prompt_ids = tokenizer.encode(
        input_text, return_tensors='pt', truncation=True, max_length=max_length
    )
    # Append EOS after truncation so the end-of-turn marker is never cut off.
    eos = torch.tensor([[tokenizer.eos_token_id]], dtype=prompt_ids.dtype)
    input_tensor = torch.cat([prompt_ids, eos], dim=-1).to(device)

    # Generate response using beam search
    output_ids = model.generate(
        input_tensor,
        # Single unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(input_tensor),
        max_new_tokens=max_length,
        num_beams=num_beams,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output starts with the prompt; keep only what was generated after it.
    reply_ids = output_ids[0][input_tensor.shape[-1]:]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)

    return response

In [None]:
# Smoke-test the chatbot with a single prompt.
input_query = "How are you?"
response = get_response(input_text=input_query, model=model, tokenizer=tokenizer)
print(response)