In [21]:
import re
import torch
import spacy
import torch.nn as nn
import torch.optim as optim
from contractions import contractions_dict
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

In [43]:
#!pip install contractions
#!pip install transformers

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
device = torch.device("cuda")

In [25]:
# 1. Model Definition & set padding
model_name = 'microsoft/dialogpt-medium'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

In [26]:
movie_lines_path = '/content/drive/MyDrive/AAI510/movie_lines.txt'
movie_conversations_path = '/content/drive/MyDrive/AAI510/movie_conversations.txt'

In [27]:
lines = open(movie_lines_path, encoding='utf-8', errors='ignore').read().split('\n')
convers = open(movie_conversations_path, encoding='utf-8', errors='ignore').read().split('\n')

**Text Preprocessing**

In [28]:
class TextPreprocessor:
    def __init__(self, max_len=13):
        self.max_len = max_len
        self._compile_regex()

    def _compile_regex(self):
        # Contractions
        self.compiled_patterns = {re.compile(pattern): repl for pattern, repl in contractions_dict.items()}
        # Retain important punctuation
        self.clean_punctuations = re.compile(r'[^a-zA-Z0-9?.!,¿]')

    def clean_text(self, txt):
        txt = txt.lower()
        for pattern, repl in self.compiled_patterns.items():
            txt = pattern.sub(repl, txt)
        txt = self.clean_punctuations.sub(' ', txt)
        return txt.strip()

    def preprocess_data(self, convers, lines):
        exchange = [conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",", "").split() for conver in convers]
        diag = {line.split(' +++$+++ ')[0]: line.split(' +++$+++ ')[-1] for line in lines}
        questions, answers = self._extract_questions_answers(exchange, diag)
        return questions, answers

    def _extract_questions_answers(self, exchange, diag):
        questions, answers = [], []
        for conver in exchange:
            for i in range(len(conver) - 1):
                questions.append(diag.get(conver[i], ''))
                answers.append(diag.get(conver[i + 1], ''))
        sorted_ques = [q for q in questions if len(q.split()) < self.max_len]
        sorted_ans = [a for q, a in zip(questions, answers) if len(q.split()) < self.max_len]
        return sorted_ques, sorted_ans

In [29]:
max_len = 30
max_seq_len = 60

preprocessor = TextPreprocessor(max_len=max_len)
sorted_ques, sorted_ans = preprocessor.preprocess_data(convers, lines)
clean_ques = [preprocessor.clean_text(q) for q in sorted_ques]
clean_ans = [preprocessor.clean_text(a) for a in sorted_ans]

# Trimming answers and lists
clean_ans = [' '.join(ans.split()[:max_len - 2]) for ans in clean_ans]
clean_ans = clean_ans[:3000]
clean_ques = clean_ques[:3000]

In [46]:
# Before cleaning
print(sorted_ques[:5])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "You're asking me out.  That's so cute. What's your name again?", "No, no, it's my fault -- we didn't have a proper introduction ---"]


In [47]:
print(sorted_ans[:5])

["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", 'Forget it.', 'Cameron.']


In [31]:
# After cleaning
print(clean_ques[:5])
print(clean_ans[:5])

['can we make this quick?  roxanne korrine and andrew barrett are having an incredibly horrendous public break  up on the quad.  again.', 'well, i thought we would start with pronunciation, if that is okay with you.', 'not the hacking and gagging and spitting part.  please.', 'you are asking me out.  that is so cute. what is your name again?', 'no, no, it is my fault    we did not have a proper introduction']
['well, i thought we would start with pronunciation, if that is okay with you.', 'not the hacking and gagging and spitting part. please.', 'okay... then how bout we try out some french cuisine. saturday? night?', 'forget it.', 'cameron.']


In [32]:
class DialogDataset(Dataset):
    def __init__(self, clean_ques, clean_ans, tokenizer, max_length):
        self.clean_ques = clean_ques
        self.clean_ans = clean_ans
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.clean_ques)

    @staticmethod
    def collate_fn(batch):
        inputs, targets, input_masks, target_masks, input_token_type_ids, target_token_type_ids = zip(*batch)
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        targets = pad_sequence(targets, batch_first=True, padding_value=tokenizer.pad_token_id)
        input_masks = pad_sequence(input_masks, batch_first=True, padding_value=0)
        target_masks = pad_sequence(target_masks, batch_first=True, padding_value=0)
        input_token_type_ids = pad_sequence(input_token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        target_token_type_ids = pad_sequence(target_token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        return inputs, targets, input_masks, target_masks, input_token_type_ids, target_token_type_ids

    def __getitem__(self, idx):
        question = self.clean_ques[idx]
        answer = self.clean_ans[idx]

        question_tokenized = self.tokenizer(question, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt', return_token_type_ids=True)
        answer_tokenized = self.tokenizer(answer, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt', return_token_type_ids=True)

        return (
            question_tokenized['input_ids'].squeeze(),
            answer_tokenized['input_ids'].squeeze(),
            question_tokenized['attention_mask'].squeeze(),
            answer_tokenized['attention_mask'].squeeze(),
            question_tokenized.get('token_type_ids', torch.tensor([])).squeeze(),  # Return empty tensor if not present
            answer_tokenized.get('token_type_ids', torch.tensor([])).squeeze()
        )

In [50]:
# Create an instance of the DialogDataset class
dataset = DialogDataset(clean_ques, clean_ans, tokenizer, max_seq_len)

# Create a DataLoader
batch_size = 32  # You can adjust the batch size as needed
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=DialogDataset.collate_fn)

# Get a sample of 5 questions and answers pairs
sample_size = 5
sample_questions = []
sample_answers = []

for batch in dataloader:
    inputs, targets, _, _, _, _ = batch
    for i in range(sample_size):
        question = tokenizer.decode(inputs[i], skip_special_tokens=True)
        answer = tokenizer.decode(targets[i], skip_special_tokens=True)
        sample_questions.append(question)
        sample_answers.append(answer)
    if len(sample_questions) >= sample_size:
        break

# Print the sample questions and answers
for i in range(sample_size):
    print(f"Question {i+1}: {sample_questions[i]}")
    print(f"Answer {i+1}: {sample_answers[i]}\n")


Question 1: hey.  i m a professional.
Answer 1: women like that have a way of turning professionals into amateurs.

Question 2: he say anything about the summons i tried to give him? sonofabitch would not accept it.
Answer 2: wade, that was not smarch. going out right after the man s father in law shot himself. let it go. call it a favor to me.

Question 3: how did you get a tux at the last minute?
Answer 3: it is scurvy s. his date got convicted. where did you get the dress?

Question 4: how are you feeling, fernando?
Answer 4: not bad.

Question 5: too bad about frank, is not it?
Answer 5: yes, it is.



In [33]:
# Dataloader, split into train and val
BATCH_SIZE = 80

# Split
train_ques, val_ques, train_ans, val_ans = train_test_split(clean_ques, clean_ans, test_size=0.2, random_state=42)

# Dataset
train_dataset = DialogDataset(train_ques, train_ans, tokenizer, max_seq_len)
val_dataset = DialogDataset(val_ques, val_ans, tokenizer, max_seq_len)

# Dataloader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=DialogDataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=DialogDataset.collate_fn)

In [34]:
EPOCHS = 20
LR = 5e-5

# Early stopping
best_val_loss = float('inf')
no_improve = 0
patience = 4

# Model
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
loss_fn = nn.CrossEntropyLoss()
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [35]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    #Training
    for batch_num, (batch_inputs, batch_targets, input_masks, target_masks, input_token_type_ids, target_token_type_ids) in enumerate(train_dataloader):
        optimizer.zero_grad()

        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
        input_masks = input_masks.to(device)

        outputs = model(batch_inputs, attention_mask=input_masks)
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), batch_targets.view(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        if batch_num != 0 and batch_num % 30 == 0:
            print(f"Epoch {epoch}, Batch {batch_num}, Loss: {loss.item()}")

    # compute the average loss for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    scheduler.step(avg_train_loss)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets, _, val_input_masks, _, _ in val_dataloader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
            val_input_masks = val_input_masks.to(device)

            outputs = model(batch_inputs, attention_mask=val_input_masks)
            loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), batch_targets.view(-1))
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve = 0
    else:
        no_improve += 1
    if no_improve == patience:
        print("Early stopping!")
        break

Epoch 0, Training Loss: 2.000652964909871, Validation Loss: 1.7533817887306213
Epoch 1, Training Loss: 1.8127119898796082, Validation Loss: 2.0494414567947388
Epoch 2, Training Loss: 1.8477030952771505, Validation Loss: 1.681943342089653
Epoch 3, Training Loss: 1.7290360450744628, Validation Loss: 1.616935595870018
Epoch 4, Training Loss: 1.7052722016970316, Validation Loss: 1.7933958619832993
Epoch 5, Training Loss: 2.146157467365265, Validation Loss: 2.0902405083179474
Epoch 00007: reducing learning rate of group 0 to 5.0000e-06.
Epoch 6, Training Loss: 1.780916945139567, Validation Loss: 1.7805142849683762
Epoch 7, Training Loss: 1.6827802220980326, Validation Loss: 1.7615855485200882
Early stopping!


In [36]:
def post_process(response_text):
    response_text = re.sub(' +', ' ', response_text)
    response_text = response_text.strip()

    # Truncate if the response is too long
    if len(response_text.split()) > 100:
        response_text = ' '.join(response_text.split()[:100]) + "..."

    # Punctuation
    if not response_text[-1] in ['.', '!', '?']:
        response_text += '.'

    return response_text

In [37]:
def get_response(input_text, model, tokenizer, num_beams=7, max_length=80, temperature=0.5, top_k=50, no_repeat_ngram_size=2):
    model.eval()

    encoded_input = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
    input_length = encoded_input.shape[1]

    if input_length + 10 > max_length:
        max_length = input_length + 10

    input_tensor = encoded_input.to(device)
    attention_mask = torch.ones(input_tensor.shape).to(device)

    # Beam search
    output_ids = model.generate(
        input_tensor,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id
        temperature=temperature,
        top_k=top_k,
        no_repeat_ngram_size=no_repeat_ngram_size
    )


    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    response = post_process(response)

    return response

In [38]:
input_query = "Hello"
response = get_response(input_query, model, tokenizer)
print(response)

Hello in from the other side.


In [55]:
import nltk
from rouge_score import rouge_scorer

# Ensure you have downloaded the NLTK data
nltk.download("punkt")

# Reference responses (ground truth)
reference_responses = [
    "women like that have a way of turning professionals into amateurs.",
    "wade, that was not smarch. going out right after the man s father in law shot himself. let it go. call it a favor to me.",
    "it is scurvy s. his date got convicted. where did you get the dress?",
    "not bad.",
    "yes, it is."
]

# Clean the reference responses using your TextPreprocessor
preprocessor = TextPreprocessor(max_len=30)  # Use the same max_len value as in your model preprocessing
reference_responses = [preprocessor.clean_text(response) for response in reference_responses]

# Model-generated responses
input_queries = [
    "hey. i m a professional.",
    "he say anything about the summons i tried to give him? sonofabitch would not accept it.",
    "how did you get a tux at the last minute?",
    "how are you feeling, fernando?",
    "too bad about frank, is not it?"
]

model_responses = []

for query in input_queries:
    response = get_response(query, model, tokenizer)  # Replace 'model' and 'tokenizer' with your actual model and tokenizer
    model_responses.append(response)

# Clean the model-generated responses using your TextPreprocessor
model_responses = [preprocessor.clean_text(response) for response in model_responses]

# BLEU Calculation
from nltk.translate.bleu_score import sentence_bleu

bleu_scores = [sentence_bleu([ref.split()], model.split()) for ref, model in zip(reference_responses, model_responses)]
average_bleu = sum(bleu_scores) / len(bleu_scores)

# ROUGE Calculation
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for ref, model in zip(reference_responses, model_responses):
    scores = scorer.score(ref, model)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print("Average BLEU Score:", average_bleu)
print("Average ROUGE-1 Score:", average_rouge1)
print("Average ROUGE-2 Score:", average_rouge2)
print("Average ROUGE-L Score:", average_rougeL)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average BLEU Score: 5.3160331857435324e-79
Average ROUGE-1 Score: 0.22476190476190477
Average ROUGE-2 Score: 0.03636363636363637
Average ROUGE-L Score: 0.1657142857142857
