# Load the data files from the Cornell Movie Dialog Corpus

 https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Specify the path to the dataset files (adjust if needed)
data_dir = r'/content/drive/MyDrive/Colab Notebooks/archive'

# Files we need from the corpus
lines_file = os.path.join(data_dir, 'movie_lines.txt')
conversations_file = os.path.join(data_dir, 'movie_conversations.txt')

# Check if the files exist
if os.path.exists(lines_file) and os.path.exists(conversations_file):
    print("Dataset files loaded successfully.")
else:
    print("Dataset files are missing. Please download and provide the correct paths.")

Dataset files loaded successfully.


# Read the lines from the movies

From README.txt

movie_lines.txt
	- contains the actual text of each utterance
	- fields:
		- lineID
		- characterID (who uttered this phrase)
		- movieID
		- character name
		- text of the utterance

Example:
L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".

In [3]:
# Load the movie lines from the movie_lines file and create a dictionary
def load_lines(file_path):
    # Dictionary of Lines with Line ID and it's corresponding text
    lines = {}

    # with open(file_path, 'r', encoding='iso-8859-1') as f:
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Movie lines are in this format: LineID + Speaker + MovieID + Character + Text
            parts = line.strip().split(" +++$+++ ")
            # Perform an explicit check to ensure that the length of parts is exactly 5 before mapping the movie lines.
            # Avoid issues if any lines are malformed.
            if len(parts) == 5:
                line_id = parts[0]     #Line Id
                text = parts[4]        # Movie Line
                lines[line_id] = text  #Populate dictionary.
    return lines

# Load movie lines
lines = load_lines(lines_file)
print(f"Loaded {len(lines)} lines from the dataset.")



Loaded 304446 lines from the dataset.


In [4]:
# Access the first element using list(lines.keys())[0] or lines.get(list(lines.keys())[0])
# list(lines.keys())[0] will retrieve the first key.

first_line_id = list(lines.keys())[0]
print(f"The first line in the dataset is: {lines[first_line_id]}")
print(f"The first line in the dataset is: {lines.get(first_line_id)}")

# Iterate over the first 15 items
for i, (line_id, text) in enumerate(lines.items()):
    if i < 15:
        print(f"Line ID: {line_id}, Text: {text}")
    else:
        break  # Stop after 15 iterations

The first line in the dataset is: They do not!
The first line in the dataset is: They do not!
Line ID: L1045, Text: They do not!
Line ID: L1044, Text: They do to!
Line ID: L985, Text: I hope so.
Line ID: L984, Text: She okay?
Line ID: L925, Text: Let's go.
Line ID: L924, Text: Wow
Line ID: L872, Text: Okay -- you're gonna need to learn how to lie.
Line ID: L871, Text: No
Line ID: L870, Text: I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
Line ID: L869, Text: Like my fear of wearing pastels?
Line ID: L868, Text: The "real you".
Line ID: L867, Text: What good stuff?
Line ID: L866, Text: I figured you'd get to the good stuff eventually.
Line ID: L865, Text: Thank God!  If I had to hear one more story about your coiffure...
Line ID: L864, Text: Me.  This endless ...blonde babble. I'm like, boring myself.


# Read the conversations from the movies

    Check to see if these conversations' lines are read within movie_lines.
    If so, keep it.
  
- movie_conversations.txt
	- the structure of the conversations
	- fields
		- characterID of the first character involved in the conversation
		- characterID of the second character involved in the conversation
		- movieID of the movie in which the conversation occurred
		- list of the utterances that make the conversation, in chronological
			order: ['lineID1','lineID2',É,'lineIDN']
			has to be matched with movie_lines.txt to reconstruct the actual content

Example:
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L404', 'L405', 'L406', 'L407']

In [5]:
#Load the movie conversations
def load_conversations(file_path, lines):
    conversations = []

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Movie conversations are in this format: Character1 + Character2 + MovieID + List of LineIDs
            parts = line.strip().split(" +++$+++ ")
            #retrieves line id's
            if len(parts) == 4:
                # Extract the list of line IDs and convert to text
                line_ids = eval(parts[3])  # eval to convert the string list to a list object
                conv = [lines[line_id] for line_id in line_ids if line_id in lines] # if line id matches entry in lines dictionary, keep it
                conversations.append(conv)
    return conversations

# Load conversations
conversations = load_conversations(conversations_file, lines)
print(f"Loaded {len(conversations)} conversations.")

#print(conversations)

Loaded 83097 conversations.


In [6]:
# Iterate over the first 15 items
for i, (text) in enumerate(conversations):
    if i < 15:
        print(f" Text: {text}")
    else:
        break  # Stop after 15 iterations



 Text: ['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
 Text: ["You're asking me out.  That's so cute. What's your name again?", 'Forget it.']
 Text: ["No, no, it's my fault -- we didn't have a proper introduction ---", 'Cameron.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Seems like she could get a date easy enough...']
 Text: ['Why?', 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.', "That's a shame."]
 Text: ['Gosh, if only we could find Kat a boyfriend...', 'Let me see what I can do.']
 Text: ["C'esc ma tete. This 

# Exploratory Data analysis
# Text Cleaning

In [7]:
!pip install nltk contractions emoji transformers datasets Counter evaluate Rouge  imbalanced-learn torch

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting Rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metada

In [8]:
import re
import nltk
import contractions # Removing contractions
import emoji # Convert Emoticons to Text if you want to perform sentiment analysis.
import unicodedata

nltk.download('words')
words = set(nltk.corpus.words.words())

# Normalize string by converting unicode characters to ASCII and removing non-letters
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

# Function to convert emojis to words using emoji library mapping
def convert_emojis_to_words(text):
    converted_text = emoji.demojize(text)
    return converted_text

def remove_email(text):
    return re.sub(r'([a-z0-9+._-]+@[a-z0-9+_-]+)', "", text)

def remove_emojis(text):
    # Regular expression pattern to match emojis
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Clitics
def clean_clitics(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    #text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

def remove_digitsInText(text):
    textWithoutDigits = list(filter(lambda x: x.isalpha(), text))
    return textWithoutDigits

def clean_text(text):
    # Step 1: Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    #Step 2: Remove emails
    text = remove_email(text)

    # Step 3: Remove Hashtags
    text = re.sub(r'#\w+', '', text)

    # Step 4: Remove Usernames (assuming they start with '@')
    text = re.sub(r'@\w+', '', text)

    # Step 5: Convert emojis to words
    text = convert_emojis_to_words(text)

    #Step 6: Remove any emoticons or emojis
    text = remove_emojis(text)

    # Step 7: Normalize Unicode characters to ASCII
    text = unicode_to_ascii(text.lower().strip())

    # Step 10: Convert to lowercase
    text = text.lower()

    # Step 11: Handle contractions / clitics
    text = contractions.fix(text)
    text = clean_clitics(text)

    # Step 8: Remove punctuation, numbers, and extra spaces
    # Step 9: Remove any special characters
    # Remove special characters but retain basic punctuation
    text = re.sub(r"[^A-Za-z0-9\s\.,!?]", "", text)  # Retain . , ! ?
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    #text = re.sub(r"[^\w\s]", '', text) #Remove special characters
    #text = remove_digitsInText(text) # Remove digits in text , Remove any words with digits like 5pm

    # Step 12: Remove non-English words
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

    return text

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [9]:
# Load and clean the data
def clean_and_prepare_data(lines_file, conversations_file):

    questions = []
    answers = []
    for conversation in conversations:
        for i in range(len(conversation) - 1):
            questions.append(conversation[i])
            answers.append(conversation[i + 1])

    clean_questions = [clean_text(q) for q in questions]
    clean_answers = [clean_text(a) for a in answers]

    # Filtering out short or long questions/answers
    # Keep only those sentences that have between 2 and 25 words.
    filtered_questions, filtered_answers = [], []

    for q, a in zip(clean_questions, clean_answers):
        # Filter out sentences that are too short or too long
        # Append the <EOS> (End of String) token to the end of each answer, indicating the end of the response for the model.
        if 2 <= len(q.split()) <= 25 and 2 <= len(a.split()) <= 25:
            filtered_questions.append(q)
            filtered_answers.append(a + ' <EOS>')  # Append end token to answers

    # Print the first 10 filtered questions and answers
    print("\nFirst 10 Filtered Questions and Answers:")
    for i in range(min(10, len(filtered_questions))):  # Ensure we don't exceed the length of the lists
        print(f"Q{i+1}: {filtered_questions[i]}")
        print(f"A{i+1}: {filtered_answers[i]}")
        print()


    return filtered_questions, filtered_answers

In [10]:
# Load and clean the data
clean_questions, clean_answers = clean_and_prepare_data(lines_file, conversations_file)

# Print out the number of cleaned questions and answers
print(f"Number of cleaned questions: {len(clean_questions)}")
print(f"Number of cleaned answers: {len(clean_answers)}")

# Print the first 10 filtered questions and answers
print("\nFirst 10 Filtered Questions and Answers:")
for i in range(min(10, len(clean_questions))):  # Ensure we don't exceed the length of the lists
    print(f"Q{i+1}: {clean_questions[i]}")
    print(f"A{i+1}: {clean_answers[i]}")
    print()


First 10 Filtered Questions and Answers:
Q1: can we make this quick ? and are an incredibly horrendous public break up on the quad . again .
A1: well , i thought we would start with pronunciation , if that is with you . <EOS>

Q2: well , i thought we would start with pronunciation , if that is with you .
A2: not the hacking and and spitting part . please . <EOS>

Q3: not the hacking and and spitting part . please .
A3: ... then how bout we try out some cuisine . ? night ? <EOS>

Q4: you are me out . that is so cute . what is your name again ?
A4: forget it . <EOS>

Q5: gosh , if only we could find kat a ...
A5: let me see what i can do . <EOS>

Q6: ma tete . this is my head
A6: right . see ? you are ready for the quiz . <EOS>

Q7: that is because it is such a nice one .
A7: forget . <EOS>

Q8: how is our little find the wench a date plan ?
A8: well , there is someone i think might be <EOS>

Q9: there .
A9: where ? <EOS>

Q10: you got something on your mind ?
A10: i on you to help my b

In [11]:
# Validate the questions and answers
def validate_questions_answers(questions, answers, min_len=2, max_len=25):
    valid_questions = []
    valid_answers = []
    invalid_count = 0

    for question, answer in zip(questions, answers):
        # Check for non-empty strings and length constraints
        if isinstance(question, str) and isinstance(answer, str) and min_len <= len(question.split()) <= max_len and min_len <= len(answer.split()) <= max_len:
            valid_questions.append(question)
            valid_answers.append(answer)
        else:
            invalid_count += 1  # Count invalid pairs

    print(f"Total valid questions: {len(valid_questions)}")
    print(f"Total valid answers: {len(valid_answers)}")
    print(f"Total invalid pairs: {invalid_count}")

    return valid_questions, valid_answers

# Run validation
validated_questions, validated_answers = validate_questions_answers(clean_questions, clean_answers)

Total valid questions: 168457
Total valid answers: 168457
Total invalid pairs: 1757


In [12]:
# Print the first 10 filtered questions and answers
print("\nFirst 10 Filtered Questions and Answers:")
for i in range(min(10, len(validated_questions))):  # Ensure we don't exceed the length of the lists
    print(f"Q{i+1}: {validated_questions[i]}")
    print(f"A{i+1}: {validated_answers[i]}")
    print()


First 10 Filtered Questions and Answers:
Q1: can we make this quick ? and are an incredibly horrendous public break up on the quad . again .
A1: well , i thought we would start with pronunciation , if that is with you . <EOS>

Q2: well , i thought we would start with pronunciation , if that is with you .
A2: not the hacking and and spitting part . please . <EOS>

Q3: not the hacking and and spitting part . please .
A3: ... then how bout we try out some cuisine . ? night ? <EOS>

Q4: you are me out . that is so cute . what is your name again ?
A4: forget it . <EOS>

Q5: gosh , if only we could find kat a ...
A5: let me see what i can do . <EOS>

Q6: ma tete . this is my head
A6: right . see ? you are ready for the quiz . <EOS>

Q7: that is because it is such a nice one .
A7: forget . <EOS>

Q8: how is our little find the wench a date plan ?
A8: well , there is someone i think might be <EOS>

Q9: there .
A9: where ? <EOS>

Q10: you have my word . as a gentleman
A10: you are sweet . <EOS

# Train, Test Split before training using Cornell Movie corpus data

Train Set: Use this to train the model.

Validation Set: Use this to fine tune the models hyperparameter and evaluate the model during training. Monitor overfitting during training.

Test Set: Use this post training to test how well the model generalizes or to see how the model performs on unseen data.


In [13]:
from sklearn.model_selection import train_test_split

# Split data into train, validation, and test sets
def split_data(clean_questions, clean_answers, test_size=0.2, val_size=0.1):
    # First split into train and remaining (which will be split further)
    questions_train, questions_rem, answers_train, answers_rem = train_test_split(
        clean_questions, clean_answers, test_size=(test_size + val_size), random_state=42)

    # Then split the remaining into validation and test
    val_size_adjusted = val_size / (test_size + val_size)
    questions_val, questions_test, answers_val, answers_test = train_test_split(
        questions_rem, answers_rem, test_size=val_size_adjusted, random_state=42)

    return (questions_train, answers_train), (questions_val, answers_val), (questions_test, answers_test)

(train_questions, train_answers), (val_questions, val_answers), (test_questions, test_answers) = split_data(validated_questions, validated_answers)

# Output the sizes of each set
print(f"Training set size: {len(train_questions)}")
print(f"Validation set size: {len(val_questions)}")
print(f"Test set size: {len(test_questions)}")


Training set size: 117919
Validation set size: 33692
Test set size: 16846


# Tokenization and Data Preparation

Use DialoGPT as the pre-trained model

Tokenization: The tokenizer from the Hugging Face Transformers library is used to convert the conversations into tokenized input for the model.

Managing Context: Need to ensure that previous conversation turns are included when generating a response.

Padding and Truncation: We need to ensure the inputs are padded or truncated to a fixed length for batch processing.

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch
from torch.utils.data import DataLoader, TensorDataset

# Load DialoGPT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token

# Load DialoGPT model
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

# Tokenize cleaned conversations
def tokenize_conversation(questions, answers, max_length=512):
    input_ids_list = []
    attention_masks_list = []
    labels_list = []
    skipped_pairs = 0  # Keep track of skipped pairs


    for question, answer in zip(questions, answers):

        # Tokenize question and answer separately, then concatenate token IDs
        # The question and answer are now tokenized separately, each with a maximum length of max_length // 2.
        # This avoids sequences that are too long when concatenating the question and answer.
        encoded_question = tokenizer(question, return_tensors='pt', max_length=max_length // 2, padding='max_length', truncation=True)
        encoded_answer = tokenizer(answer, return_tensors='pt', max_length=max_length // 2, padding='max_length', truncation=True)

        input_ids = torch.cat([encoded_question['input_ids'], encoded_answer['input_ids']], dim=1)  # Concatenate question and answer
        attention_mask = torch.cat([encoded_question['attention_mask'], encoded_answer['attention_mask']], dim=1)

        # Skip empty inputs (if any)
        if input_ids.size(1) == 0:  # Check if input has a valid sequence length
            skipped_pairs += 1
            continue  # Skip this pair if it's invalid

        # Set labels as input_ids with padding token ignored (-100)
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in labels

        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)
        labels_list.append(labels)

     # Check if any pairs were processed
    if not input_ids_list:
        raise ValueError(f"No valid question-answer pairs were processed. {skipped_pairs} pairs were skipped.")

    # Stack input IDs, attention masks, and labels to create tensors
    return torch.cat(input_ids_list, dim=0), torch.cat(attention_masks_list, dim=0), torch.cat(labels_list, dim=0)

# Example usage
questions = ["You're asking me out. That's so cute. What's your name again?", "Forget it."]
answers = ["No, no, it's my fault -- we didn't have a proper introduction ---", "Cameron."]

input_ids, attention_masks, labels = tokenize_conversation(questions, answers)

print(f"Input IDs: {input_ids}")
print(f"Attention Masks: {attention_masks}")
print(f"Labels: {labels}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Input IDs: tensor([[ 1639,   821,  4737,  ..., 50256, 50256, 50256],
        [ 1890,  1136,   340,  ..., 50256, 50256, 50256]])
Attention Masks: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Labels: tensor([[1639,  821, 4737,  ..., -100, -100, -100],
        [1890, 1136,  340,  ..., -100, -100, -100]])


In [15]:
# Tokenization for the training data
train_input_ids, train_attention_masks, train_labels = tokenize_conversation(train_questions, train_answers)
val_input_ids, val_attention_masks, val_labels = tokenize_conversation(val_questions, val_answers)
test_input_ids, test_attention_masks, test_labels = tokenize_conversation(test_questions, test_answers)

print(f"Tokenized Training {len(train_input_ids)} input id's conversations.")
print(f"Tokenized Training {len(train_attention_masks)} attention masks conversations.\n")

print(f"Tokenized Validation {len(val_input_ids)} input id's conversations.")
print(f"Tokenized Validation {len(val_attention_masks)} attention masks conversations.\n")

print(f"Tokenized Test {len(test_input_ids)} input id's conversations.")
print(f"Tokenized Test {len(test_attention_masks)} attention masks conversations.\n")

Tokenized Training 117919 input id's conversations.
Tokenized Training 117919 attention masks conversations.

Tokenized Validation 33692 input id's conversations.
Tokenized Validation 33692 attention masks conversations.

Tokenized Test 16846 input id's conversations.
Tokenized Test 16846 attention masks conversations.



# Create Data Loaders for the Train, Validation and Test Data

In [16]:
from torch.utils.data import DataLoader, TensorDataset

# Create dataset and DataLoader for batching
def create_dataloader(input_ids, attention_mask, labels, batch_size=16):
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Running using CPU initially, so choosing batch size of 16
# Smaller batch size can lead to noisier gradient updates but sometimes result in better generalization.
# Larger batch sizes have more stable gradient updates but need more memory , learning rate needs tuning.
batch_size = 32

# Tokenized conversations
# Create DataLoader for each dataset
train_dataloader = create_dataloader(train_input_ids, train_attention_masks, train_labels, batch_size=batch_size)
val_dataloader = create_dataloader(val_input_ids, val_attention_masks, val_labels, batch_size=batch_size)
test_dataloader = create_dataloader(test_input_ids, test_attention_masks, test_labels, batch_size=batch_size)

print(f"Training loader size: {len(train_dataloader.dataset)}")
print(f"Validation loader size: {len(val_dataloader.dataset)}")
print(f"Test loader size: {len(test_dataloader.dataset)}")

print(f"Number of steps in each epoch: {len(train_dataloader.dataset)/batch_size}")

Training loader size: 117919
Validation loader size: 33692
Test loader size: 16846
Number of steps in each epoch: 3684.96875


# Validate Data After Tokenization:

Print out some tokenized examples to ensure that the tokenization process is working as expected, and there are no empty sequences or overly long sequences:

In [17]:
for i, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
    print(f"Batch {i} Tokenized Input Lengths: {[len(seq) for seq in input_ids]}")
    break  # Only print the first batch

Batch 0 Tokenized Input Lengths: [512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512]


# Training loop

Using manual training loop for better control over the training process instead of using HuggingFace's Trainer and TrainingArguments.


# Evaluation:

Evaluation metrics like BLEU score or ROUGE Score to measure the quality of the generated conversations.

BLEU Score: This metric is commonly used for evaluating machine translation and is calculated by comparing n-grams of the generated response against the reference responses.

ROUGE Score: This metric is often used for evaluating text summarization and compares the overlap of n-grams between the generated response and reference texts.


In [24]:
import torch
from torch.optim import AdamW  # Import AdamW from PyTorch to avoid warning when imported using transformers
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time  # Import the time module

In [19]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Function to calculate BLEU score
def calculate_bleu(references, candidates):
    reference_tokens = [[ref.split()] for ref in references]  # Tokenize the references
    candidate_tokens = [cand.split() for cand in candidates]

    # Calculate BLEU score for all predictions
    bleu_scores = [sentence_bleu(ref, cand) for ref, cand in zip(reference_tokens, candidate_tokens)]
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    return avg_bleu_score

# Function to calculate ROUGE score
def calculate_rouge(references, candidates):
    rouge = Rouge()
    scores = rouge.get_scores(candidates, references, avg=True)
    return scores



In [None]:
# Example usage
reference_responses = ["I am going to the store.", "I went to the store."]
generated_response = "I am going to the shop."

# Calculate BLEU score
bleu_score = calculate_bleu(reference_responses, generated_response)
print(f"BLEU Score: {bleu_score:.4f}")

# Calculate ROUGE score
rouge_scores = calculate_rouge(reference_responses[0], generated_response)  # Just using the first reference
print(f"ROUGE Scores: {rouge_scores}")

BLEU Score: 0.0000
ROUGE Scores: {'rouge-1': {'r': 0.8333333333333334, 'p': 0.8333333333333334, 'f': 0.8333333283333335}, 'rouge-2': {'r': 0.8, 'p': 0.8, 'f': 0.7999999950000002}, 'rouge-l': {'r': 0.8333333333333334, 'p': 0.8333333333333334, 'f': 0.8333333283333335}}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import numpy as np

In [37]:
def validate_model(model, val_loader, tokenizer):
    model.eval()  # Set model to evaluation mode
    total_val_loss = 0

    all_numerical_preds = []  # For accuracy, precision, recall, F1
    all_numerical_labels = []
    all_preds = [] # For BLEU/ROUGE
    all_labels = []
    all_references = []  # For BLEU and ROUGE evaluation
    all_candidates = []

    with torch.no_grad():
        for batch in val_loader:
            # Unpack the batch and move to device
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Get predictions (logits) and decode generated sequences
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            # # Add logging to inspect predictions and labels
            # print(f"Step {step}:")
            # print(f"Predictions shape: {predictions.shape}, Labels shape: {labels.shape}")
            # print(f"First 10 predictions: {predictions[:10]}")
            # print(f"First 10 labels: {labels[:10]}")

            # Check for NaN values or large values in predictions and labels
            if torch.isnan(predictions).any() or torch.isnan(labels).any():
                print(f"NaN values detected at Step {step}, skipping this batch.")
                continue

            if predictions.max() >= 1e5 or labels.max() >= 1e5:
                print(f"Unexpectedly large values detected at Step {step}, skipping this batch.")
                continue

            # Decode predictions and labels to text for BLEU/ROUGE evaluation
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(torch.clamp(labels, 0, tokenizer.vocab_size - 1), skip_special_tokens=True)

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)
            all_references.extend(decoded_labels)
            all_candidates.extend(decoded_preds)
            # **Keep numerical predictions and labels for accuracy/precision/recall/F1**
            all_numerical_preds.extend(predictions.flatten().cpu().numpy())
            all_numerical_labels.extend(labels.flatten().cpu().numpy())


    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Log shapes before computing metrics
    print(f"All labels shape: {len(all_labels)}, All predictions shape: {len(all_preds)}")

    # **Calculate accuracy, precision, recall, and F1 using numerical predictions and labels**
    accuracy = accuracy_score(all_numerical_labels, all_numerical_preds)
    precision = precision_score(all_numerical_labels, all_numerical_preds, average='weighted')
    recall = recall_score(all_numerical_labels, all_numerical_preds, average='weighted')
    f1 = f1_score(all_numerical_labels, all_numerical_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted', zero_division=1)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Calculate BLEU and ROUGE scores for generated responses
    avg_bleu_score = calculate_bleu(all_labels, all_preds)
    print(f"BLEU Score: {avg_bleu_score:.4f}")

    rouge_scores = calculate_rouge(all_labels, all_preds)
    print(f"ROUGE Scores: {rouge_scores}")



In [None]:
def check_for_nan(tensor, name="tensor"):
    if torch.isnan(tensor).any():
        print(f"Found NaN values in {name}.")
        return True
    return False

https://www.baeldung.com/cs/ml-training-nan-errors-fix#:~:text=These%20sources%20include%20data%20errors,aren't%20susceptible%20to%20NaNs.

https://neptune.ai/blog/understanding-gradient-clipping-and-how-it-can-fix-exploding-gradients-problem

Backpropagation calculates the gradients of the cost function w.r.t. the weights and biases in the network.

It tells you about all the changes you need to make to your weights to minimize the cost function (it’s actually -1*∇ to see the steepest decrease, and +∇ would give you the steepest increase in the cost function).

**Vanishing Gradients:**
The translation of the effect of a change in cost function (C) to the weight in an initial layer, or the norm of the gradient, becomes so small due to increased model complexity with more hidden units that it becomes zero after a certain point. This is what we call vanishing gradients.

This hampers the learning of the model. The weights can no longer contribute to the reduction in cost function (C)  and go unchanged, affecting the network in the forward pass and eventually stalling the model.

**Exploding gradients**
On the other hand, the exploding gradient problem refers to a large increase in the norm of the gradient during training.

Such events are caused by an explosion of long-term components, which can grow exponentially more than short-term ones. This results in an unstable network that, at best, cannot learn from the training data, making the gradient descent step impossible to execute.


**Gradient clipping** is a technique used to stabilize the training of neural networks by rescaling the error derivative to a threshold.
This prevents the gradients from becoming too large, which can cause the model to diverge and fail to converge to a good solution.

Here's how gradient clipping works:
Set a threshold: Define a minimum and maximum threshold.
Calculate the norm: Calculate the norm of the gradients.
Scale the gradients: If the norm exceeds the threshold, scale down the gradients proportionally to meet the norm threshold.
Update the weights: Use the clipped gradients to update the weights.

Gradient clipping can be performed in two ways:
**Clipping by value**: Define a minimum and maximum threshold.
**Clipping by norm**: Set a maximum threshold for the norm of the gradients

In [21]:
import gc
from transformers import AdamW, get_linear_schedule_with_warmup
import torch

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to the CPU
model.to(device)

# Apply gradient checkpointing to the model
model.gradient_checkpointing_enable()

# Set up the optimizer (AdamW is a common choice for transformer models)
# Disable weight decay
optimizer = AdamW(model.parameters(), lr=5e-6,weight_decay=0.0) # Reduced the learning rate from 5e-5 due to nan loss

total_epochs = 3

save_every_n_steps = 100

# Define the number of warm-up steps and total training steps
total_training_steps = len(train_dataloader) * total_epochs  # Total steps = batches per epoch * number of epochs
warmup_steps = int(0.1 * total_training_steps)  # 10% of total steps as warm-up

# Define learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_training_steps)
# Define training function
def train_model(model, train_dataloader, val_dataloader, optimizer, epochs=3,  save_every_n_steps=100, start_epoch=0, start_step=0):

    # Define the path where the models will be saved
    model_save_path = "/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/"
    print(epochs)
    print(start_epoch)

    for epoch in range(start_epoch, epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}:")
        model.train()  # Set model to training mode

        total_train_loss = 0
        start_time = time.time()  # Start time for the epoch
        gradient_accumulation_steps = 2  # Accumulate gradients over 2 batches

        # Training loop on  Train Data
        for step, batch in enumerate(train_dataloader, start=start_step):
            optimizer.zero_grad() # Clear previous gradients

            # Unpack the batch and move to the device
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Check for NaN values in input IDs
            # if torch.isnan(input_ids).any():
            if check_for_nan(input_ids, "input_ids") or check_for_nan(attention_mask, "attention_mask"):
                print("Input IDs contain NaN values. Stopping training.")
                print("NaN detected in inputs, stopping training.")
                return

            # Forward pass and compute loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # NaN check for loss
            if torch.isnan(loss):
                print(f"NaN loss encountered at Step {step} in Epoch {epoch+1}. Stopping training.")
                print(f"Input IDs: {input_ids}")
                print(f"Attention Mask: {attention_mask}")
                print(f"Labels: {labels}")
                return  # Stop training to prevent further NaN propagation

            total_train_loss += loss.item()
            loss.backward() # Backward pass to calculate gradients

            # Check if gradients are NaN
            for param in model.parameters():
                if param.grad is not None and torch.isnan(param.grad).any():
                    print("Encountered NaN gradients. Stopping training.")
                    return

            # Gradient clipping to prevent gradient explosion
            if (step + 1) % gradient_accumulation_steps == 0:
              # To prevent Gradient explosion due to large batch sizes, implement gradient clipping
              torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
              # Update parameters
              optimizer.step()
              optimizer.zero_grad()
              # Step the scheduler to adjust the learning rate
              scheduler.step()

            # Print every 10th step
            if step % 10 == 0 and step > 0:
                print(f"Epoch {epoch+1},  Step {step}: Loss = {loss.item():.4f}")

            # Save model every 100 steps (or whatever value save_every_n_steps is set to)
            if (step + 1) % save_every_n_steps == 0:
                model_save_name = f'fine_tuned_dialoGPT_epoch{epoch+1}_step{step+1}.pt'
                model_save_full_path = os.path.join(model_save_path, model_save_name)
                # Save model state dict, optimizer state, and scheduler state
                torch.save({
                    'epoch': epoch + 1,
                    'step': step + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, model_save_full_path)

                print(f"Checkpoint saved: {model_save_name}")

            del outputs, loss  # Delete variables after use
            gc.collect()
            torch.cuda.empty_cache()

        # Epoch-level reporting
        avg_train_loss = total_train_loss / len(train_dataloader)
        # Calculate the time taken for the epoch
        elapsed_time = time.time() - start_time
        print(f"Epoch {epoch+1} completed. Training Loss: {avg_train_loss:.4f}. Time for epoch: {elapsed_time:.2f} seconds")

        # Validation after each epoch using validation data
        validate_model(model, val_dataloader, tokenizer)

        model_save_name = 'fine_tuned_dialoGPT{epoch+1}_final.pt'
        model_save_full_path = os.path.join(model_save_path, model_save_name)


        #Save model state dict, optimizer state, and scheduler state
        torch.save({
                    'epoch': epoch + 1,
                    'step': step + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, model_save_full_path)
        print(f"Checkpoint saved: {model_save_name}")

        model_save_name = 'fine_tuned_dialoGPT_final.pt'
        path = F"/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/{model_save_name}"
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)
        print(f"Model {model_save_name} and tokenizer saved to {path}\n")




In [None]:
# Define the path where the models will be saved
model_save_path = "/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/"
model_save_name = 'LastGoodModel.pt'
model_save_full_path = os.path.join(model_save_path, model_save_name)
#path = F"/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/{model_save_name}"
#Save model state dict, optimizer state, and scheduler state
torch.save({'epoch': 3,
            'step': 3601,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            }, model_save_full_path)
print(f"Checkpoint saved: {model_save_name}")

Checkpoint saved: LastGoodModel.pt


# Train the model

In [None]:
# Train the model with 3 epochs
train_model(model, train_dataloader, val_dataloader, optimizer, epochs=total_epochs, save_every_n_steps=100)


Epoch 1/3:


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch 1,  Step 10: Loss = 10.6841
Epoch 1,  Step 20: Loss = 10.3804
Epoch 1,  Step 30: Loss = 10.9327
Epoch 1,  Step 40: Loss = 10.9661
Epoch 1,  Step 50: Loss = 10.4565
Epoch 1,  Step 60: Loss = 10.5978
Epoch 1,  Step 70: Loss = 10.1374
Epoch 1,  Step 80: Loss = 10.8555
Epoch 1,  Step 90: Loss = 9.9942
Checkpoint saved: fine_tuned_dialoGPT_epoch1_step100.pt
Epoch 1,  Step 100: Loss = 11.5294
Epoch 1,  Step 110: Loss = 9.9718
Epoch 1,  Step 120: Loss = 10.3927
Epoch 1,  Step 130: Loss = 10.4790
Epoch 1,  Step 140: Loss = 10.4229
Epoch 1,  Step 150: Loss = 10.7004
Epoch 1,  Step 160: Loss = 10.5868
Epoch 1,  Step 170: Loss = 10.8074
Epoch 1,  Step 180: Loss = 9.8277
Epoch 1,  Step 190: Loss = 10.0279
Checkpoint saved: fine_tuned_dialoGPT_epoch1_step200.pt
Epoch 1,  Step 200: Loss = 10.4360
Epoch 1,  Step 210: Loss = 9.7197
Epoch 1,  Step 220: Loss = 10.2452
Epoch 1,  Step 230: Loss = 9.8166
Epoch 1,  Step 240: Loss = 10.1381
Epoch 1,  Step 250: Loss = 10.4864
Epoch 1,  Step 260: Loss = 

OverflowError: out of range integral type conversion attempted

# To Resume Training from saved checkpoint

In [None]:
# Load checkpoint to resume training
checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch2_step3600.pt', map_location=torch.device(device))
#checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/LastGoodModel.pt', map_location=torch.device(device))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch']  # Start from the next epoch
start_step = checkpoint['step']
print(f"Loaded checkpoint from epoch {start_epoch} and step {start_step}")

  checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch1_step3600.pt', map_location=torch.device(device))


Loaded checkpoint from epoch 1 and step 3600


In [None]:
# Now continue training from this checkpoint
remaining_epochs = total_epochs - start_epoch
# Train the model for remaining epochs
train_model(model, train_dataloader, val_dataloader, optimizer, epochs=remaining_epochs, save_every_n_steps=100, start_epoch=start_epoch, start_step=0)

In [None]:
# Train the model for remaining epochs
train_model(model, train_dataloader, val_dataloader, optimizer, epochs=3, save_every_n_steps=100, start_epoch=2, start_step=0)

3
2

Epoch 3/3:


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [2093,  262, 1808,  ..., -100, -100, -100],
        [1996,  683,  287,  ..., -100, -100, -100],
        ...,
        [3003,  318,  339,  ..., -100, -100, -100],
        [ 732,  460, 5671,  ..., -100, -100, -100],
        [  72,  550,  284,  ..., -100, -100, -100]], device='cuda:0')
Step 3600:
Predictions shape: torch.Size([32, 512]), Labels shape: torch.Size([32, 512])
First 10 predictions: tensor([[ 318,  257,  922,  ...,   72,   72,   72],
        [ 318,  407,  612,  ...,   72,   72,   72],
        [ 318, 1312,  716,  ...,   72,   72,   72],
        ...,
        [ 318, 8161, 1312,  ...,   72,   72,   72],
        [ 318,  837,  318,  ...,   72,   72,   72],
        [ 318, 3729, 4236,  ...,   72,   72,   72]], device='cuda:0')
First 10 labels: tensor([[ 7091,   318,   257,  ...,  -100,  -100,  -100],
        [   72,   481,   307,  ...,  -100,  -100,  -100],
        [ 3919,   764,  1312,  ...,  -100,  -100,  -100],

ValueError: too many dimensions 'str'

# Invoke all the Validate Model Method

There were errors after the training was complete but when validate_model() was invoked.

So, Loading the trained models checkpoints and invoking validate_model() method.

# Load the Final Check point model stored after Epoch 1

In [35]:
checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch1_step3600.pt', map_location=torch.device(device))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch']  # Start from the next epoch
start_step = checkpoint['step']
print(f"Loaded checkpoint from epoch {start_epoch} and step {start_step}")

  checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch1_step3600.pt', map_location=torch.device(device))


Loaded checkpoint from epoch 1 and step 3600


In [36]:
# Validation after each epoch using validation data
validate_model(model, val_dataloader, tokenizer)

Validation Loss: 3.1828
All labels shape: 33692, All predictions shape: 33692


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.0003
Precision: 0.0004
Recall: 0.0003
F1-Score: 0.0003


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.0022
ROUGE Scores: {'rouge-1': {'r': 0.21568273777184954, 'p': 0.23684904593101888, 'f': 0.22416692011604597}, 'rouge-2': {'r': 0.035984681811857544, 'p': 0.03612941855556099, 'f': 0.03588286511975884}, 'rouge-l': {'r': 0.2057208032520116, 'p': 0.2259841885755017, 'f': 0.21384349199347596}}


# Load the Final Check point model stored after Epoch 2

In [29]:
checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch2_step3600.pt', map_location=torch.device(device))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch']  # Start from the next epoch
start_step = checkpoint['step']
print(f"Loaded checkpoint from epoch {start_epoch} and step {start_step}")

  checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch2_step3600.pt', map_location=torch.device(device))


Loaded checkpoint from epoch 2 and step 3600


In [30]:
# Validation after each epoch using validation data
validate_model(model, val_dataloader, tokenizer)

Validation Loss: 3.0357
All labels shape: 33692, All predictions shape: 33692


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.0001
Precision: 0.0002
Recall: 0.0001
F1-Score: 0.0001


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.0025
ROUGE Scores: {'rouge-1': {'r': 0.22056681247279245, 'p': 0.24217274520621157, 'f': 0.22916424137472002}, 'rouge-2': {'r': 0.03876578533257116, 'p': 0.03906942417641923, 'f': 0.03872543260017414}, 'rouge-l': {'r': 0.2105528441593231, 'p': 0.23122352765555207, 'f': 0.21877424695655226}}


# Load the Final Check point model stored after Epoch 3

In [38]:
checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch3_step3600.pt', map_location=torch.device(device))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch']  # Start from the next epoch
start_step = checkpoint['step']
print(f"Loaded checkpoint from epoch {start_epoch} and step {start_step}")

  checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/ChatFlixModels/fine_tuned_dialoGPT_epoch3_step3600.pt', map_location=torch.device(device))


Loaded checkpoint from epoch 3 and step 3600


In [39]:
# Validation after each epoch using validation data
validate_model(model, val_dataloader, tokenizer)

Validation Loss: 2.9888
All labels shape: 33692, All predictions shape: 33692


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.0001
Precision: 0.0001
Recall: 0.0001
F1-Score: 0.0001
Precision: 1.0000, Recall: 0.0000, F1-Score: 0.0000


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.0026
ROUGE Scores: {'rouge-1': {'r': 0.22213744575248304, 'p': 0.24232078037700508, 'f': 0.23012233936889143}, 'rouge-2': {'r': 0.03991810325956266, 'p': 0.040058987839428545, 'f': 0.03979253888396658}, 'rouge-l': {'r': 0.21220979668777634, 'p': 0.231532560480215, 'f': 0.21984847815055616}}


# Inference:

Build a chatbot interface where the model generates responses based on user inputs and conversation history.

In [None]:
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the pre-trained DialoGPT model and tokenizer
# model_name = "microsoft/DialoGPT-small"
model_name = r'./fine_tuned_dialoGPT'  # or path to your trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to generate a response using the trained DialoGPT model
def generate_response(message, history):pri
    # Tokenize the input message and convert it to input_ids
    new_input_ids = tokenizer.encode(message + tokenizer.eos_token, return_tensors='pt').to(device)

    # Concatenate the new input with the history of the conversation
    bot_input_ids = new_input_ids
    if history:
        history_ids = tokenizer.encode(history, return_tensors='pt').to(device)
        bot_input_ids = torch.cat([history_ids, new_input_ids], dim=-1)

    # Generate a response
    response_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated response and add it to the history
    response = tokenizer.decode(response_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    # Update the history
    history = history + " " + message + " " + response

    return response, history  # Return both response and updated history

# Initialize Gradio ChatInterface
demo = gr.ChatInterface(
    fn=generate_response,
    examples=[{"text": "Hello", "files": []}],
    title="ChatBot Powered by DialoGPT",
    description="A chatbot based on the DialoGPT model, fine-tuned for multi-turn conversations.",
    multimodal=False
)

# Launch the Gradio demo
demo.launch()


In [30]:
import torch
def test():
    #tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    
    #model_name = r'/fine_tuned_dialoGPT/fine_tuned_dialoGPT_epoch3_step3500'  # or path to your trained model
    #model = TFGPT2LMHeadModel.from_pretrained("chatbot_model")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    

    while True:
        input_text = input("User: ")
         # Tokenize the input message and convert it to input_ids
        input_tokenized = tokenizer.encode(input_text + tokenizer.eos_token, return_tensors='pt').to(device)
        output_ids = model.generate(input_tokenized, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        # Decode the generated response and add it to the history
        output_text = tokenizer.decode(output_ids[:, input_tokenized.shape[-1]:][0], skip_special_tokens=True)
        # input_tokenized = tokenizer.encode(input_text, add_special_tokens=False)
        # input_ids = tf.keras.preprocessing.sequence.pad_sequences([input_tokenized], maxlen=max_length, padding="post")
        # output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
        # output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print("Bot:", output_text)

In [31]:
test()


KeyboardInterrupt: Interrupted by user

In [29]:
print(model_name)

microsoft/DialoGPT-small
