In [2]:
import os

print("Contents of /kaggle/input:")
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Contents of /kaggle/input:
/kaggle/input/cornell-moviedialog-corpus/movie_conversations.txt
/kaggle/input/cornell-moviedialog-corpus/README.txt
/kaggle/input/cornell-moviedialog-corpus/chameleons.pdf
/kaggle/input/cornell-moviedialog-corpus/movie_titles_metadata.txt
/kaggle/input/cornell-moviedialog-corpus/movie_characters_metadata.txt
/kaggle/input/cornell-moviedialog-corpus/movie_lines.txt
/kaggle/input/cornell-moviedialog-corpus/.DS_Store
/kaggle/input/cornell-moviedialog-corpus/raw_script_urls.txt


### 50K Samples Data Load from Cornell

In [15]:
import re
from pathlib import Path
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from datasets import Dataset
from tqdm import tqdm
import time
import numpy as np

def load_cornell_data(num_samples=50000):
    base_path = Path("/kaggle/input/cornell-moviedialog-corpus")
    
    # Load movie lines
    lines = {}
    with open(base_path / 'movie_lines.txt', 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 5:
                lines[parts[0]] = parts[4]
    
    # Load conversations
    conversations = []
    with open(base_path / 'movie_conversations.txt', 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 4:
                conv = eval(parts[3])
                conversations.append(conv)
    
    # Load movie metadata
    movie_metadata = {}
    with open(base_path / 'movie_titles_metadata.txt', 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 6:
                movie_metadata[parts[0]] = parts[1]  # Movie name
    
    # Create input-output pairs with context
    pairs = []
    for conversation in conversations:
        for i in range(len(conversation) - 1):
            if conversation[i] in lines and conversation[i+1] in lines:
                input_text = lines[conversation[i]]
                target_text = lines[conversation[i+1]]
                movie_id = conversation[i].split('_')[0]  # Extract movie ID from the line ID
                movie_name = movie_metadata.get(movie_id, "Unknown Movie")
                context = f"Movie: {movie_name}\nLine: "
                pairs.append((context + input_text, target_text))
    
    # Shuffle and select subset
    random.shuffle(pairs)
    pairs = pairs[:num_samples]
    
    input_texts, target_texts = zip(*pairs)
    
    return list(input_texts), list(target_texts)

# Load data
inputs, targets = load_cornell_data(num_samples=50000)
print(f"Loaded {len(inputs)} conversation pairs")
print("Example pair:")
print(f"Input: {inputs[0]}")
print(f"Target: {targets[0]}")





Loaded 50000 conversation pairs
Example pair:
Input: Movie: Unknown Movie
Line: You can't go in there. They know you're with Ruiz.
Target: You got that right.


In [17]:
# Set up model and tokenizer
model_name = "microsoft/DialoGPT-medium"  # Consider trying "facebook/blenderbot-400M-distill" as an alternative
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")


Using device: cuda
GPU Available: True
GPU Device Name: Tesla P100-PCIE-16GB


### Dialogpt Medium

In [18]:
# Prepare dataset
dataset = Dataset.from_dict({"input": inputs, "target": targets})

def tokenize_function(examples):
    inputs = [inp + tokenizer.eos_token + tgt + tokenizer.eos_token for inp, tgt in zip(examples["input"], examples["target"])]
    return tokenizer(inputs, truncation=True, padding=False, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Use dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False,
    pad_to_multiple_of=8  # Optimize for tensor cores
)

train_dataloader = DataLoader(
    tokenized_dataset, 
    shuffle=True, 
    batch_size=8, 
    collate_fn=data_collator
)

# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

# Training loop
model.train()
total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    total_loss = 0
    
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch+1}")
    
    for i, batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / len(train_dataloader)
    epoch_time = time.time() - epoch_start_time
    
    print(f"Epoch {epoch+1}/{num_epochs} completed in {epoch_time:.2f} seconds. Average Loss: {avg_loss:.4f}")

total_time = time.time() - total_start_time
print(f"Training completed in {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

# Save the model
model.save_pretrained("./fine_tuned_dialogpt_medium")
tokenizer.save_pretrained("./fine_tuned_dialogpt_medium")


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1: 100%|██████████| 6250/6250 [30:43<00:00,  3.39it/s, loss=3.0547]


Epoch 1/5 completed in 1843.16 seconds. Average Loss: 2.9671


Epoch 2: 100%|██████████| 6250/6250 [30:40<00:00,  3.40it/s, loss=3.2201]


Epoch 2/5 completed in 1840.63 seconds. Average Loss: 2.7111


Epoch 3: 100%|██████████| 6250/6250 [30:45<00:00,  3.39it/s, loss=2.3707]


Epoch 3/5 completed in 1845.73 seconds. Average Loss: 2.5907


Epoch 4: 100%|██████████| 6250/6250 [30:48<00:00,  3.38it/s, loss=2.2364]


Epoch 4/5 completed in 1848.17 seconds. Average Loss: 2.5063


Epoch 5: 100%|██████████| 6250/6250 [30:46<00:00,  3.38it/s, loss=2.5911]


Epoch 5/5 completed in 1846.56 seconds. Average Loss: 2.4535
Training completed in 9224.25 seconds (153.74 minutes)


('./fine_tuned_dialogpt_medium/tokenizer_config.json',
 './fine_tuned_dialogpt_medium/special_tokens_map.json',
 './fine_tuned_dialogpt_medium/vocab.json',
 './fine_tuned_dialogpt_medium/merges.txt',
 './fine_tuned_dialogpt_medium/added_tokens.json',
 './fine_tuned_dialogpt_medium/tokenizer.json')

In [21]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_dialogpt_medium")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_dialogpt_medium")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def generate_response(prompt, max_length=100):
    input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
    
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response[len(prompt):].strip()

# Update your prompts to be more specific
movie_test_prompts = [
    "Question: Why does Cameron suggest that Bianca needs to learn how to lie?",
    "Statement: Describe how Bianca feels about becoming a persona she can't quit.",
    "Question: What specific comment did Guillermo make about Bianca's hair color?",
    "Statement: Explain Kat's bad experience with a guy who broke up with her.",
    "Question: How does Bianca react to Joey's aspirations in his modeling career?",
]

# Generate responses and evaluate
for prompt in movie_test_prompts:
    response = generate_response(prompt)
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")
    print()

# Calculate perplexity if needed
def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()

# Calculate average perplexity
total_perplexity = 0
for prompt in movie_test_prompts:
    full_text = prompt + " " + generate_response(prompt)
    perplexity = calculate_perplexity(full_text)
    total_perplexity += perplexity
    print(f"Perplexity for prompt: {perplexity:.2f}")

avg_perplexity = total_perplexity / len(movie_test_prompts)
print(f"\nAverage Perplexity: {avg_perplexity:.2f}")

Prompt: Question: Why does Cameron suggest that Bianca needs to learn how to lie?
Generated response: Because she was telling the truth when she told you she was pregnant. She told you it was because she was tired. And that's what happened. But you've got to think that there's something else going on here. I mean, she's got to be lying. She's got that look in her eyes that's just weird. And then she's like, oh yeah... she's lying. And I

Prompt: Statement: Describe how Bianca feels about becoming a persona she can't quit.
Generated response: I want to be a big show like Bianca. I want to make some big plays. I have to show off and do what I can. I'm just not good enough to be Bianca without being a big play. I've tried that in every sport I've played, in every competition. But I can't do it in this one. I can barely do it as myself. I don't

Prompt: Question: What specific comment did Guillermo make about Bianca's hair color?
Generated response: He said it was too light brown for her. 

In [28]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_dialogpt_medium")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_dialogpt_medium")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def generate_response(prompt, max_length=150):
    input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
    
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response[len(prompt):].strip()

def calculate_bleu(reference, candidate):
    reference_tokens = word_tokenize(reference.lower())
    candidate_tokens = word_tokenize(candidate.lower())
    return sentence_bleu([reference_tokens], candidate_tokens)

def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()

# Updated test prompts with more context
movie_test_prompts = [
    "In the movie '10 Things I Hate About You', why does Cameron suggest that Bianca needs to learn how to lie?",
    "In '10 Things I Hate About You', how does Kat feel about Bianca's attitude towards dating, considering their father's rule about Kat dating first?",
    "During a conversation about Bianca's appearance in '10 Things I Hate About You', what specific comment did the character Guillermo make about Bianca's hair color?",
    "In a scene from '10 Things I Hate About You', how does Patrick react when Kat questions his motives for being with her?",
    "In '10 Things I Hate About You', how does Joey respond when Kat confronts him and tells him to leave her sister Bianca alone?"
]

reference_answers = [
    "Cameron suggests Bianca needs to learn how to lie because she's too honest and straightforward, which might not always work in her favor in social situations.",
    "Kat feels frustrated that Bianca's desire to date affects her own freedom and choices, as their father won't allow Bianca to date until Kat does.",
    "Guillermo said that if Bianca's hair gets any lighter, she'll look like an extra on 90210.",
    "Patrick gets defensive and asks Kat if he needs to have a motive to be with her, implying that he genuinely likes her.",
    "Joey dismisses Kat's request and asks why he would leave Bianca alone, showing his lack of respect for Kat's wishes."
]

# Evaluate responses
total_bleu = 0
total_perplexity = 0

for prompt, reference in zip(movie_test_prompts, reference_answers):
    generated_response = generate_response(prompt)
    bleu_score = calculate_bleu(reference, generated_response)
    perplexity = calculate_perplexity(prompt + " " + generated_response)
    
    total_bleu += bleu_score
    total_perplexity += perplexity
    
    print(f"Prompt: {prompt}")
    print(f"Generated response: {generated_response}")
    print(f"Reference: {reference}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
    print()

# Calculate average scores
avg_bleu = total_bleu / len(movie_test_prompts)
avg_perplexity = total_perplexity / len(movie_test_prompts)
print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Average Perplexity: {avg_perplexity:.4f}")

Prompt: In the movie '10 Things I Hate About You', why does Cameron suggest that Bianca needs to learn how to lie?
Generated response: Because she was lying when she said she couldn't get married. She can't lie now. She was lying before she even told Bianca she was leaving. That's the whole point of the film. She's lying now because she's lying before Bianca even told her she's leaving. She lied before she could even say goodbye. That is the whole basis of the whole film. That means nothing to her, so she's not going to say goodbye to anyone. She lies now because it's the other side of the room. That proves that Biancas lying is still real and she's still there. It
Reference: Cameron suggests Bianca needs to learn how to lie because she's too honest and straightforward, which might not always work in her favor in social situations.
BLEU Score: 0.0610
Perplexity: 9.5903

Prompt: In '10 Things I Hate About You', how does Kat feel about Bianca's attitude towards dating, considering their 

In [34]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_dialogpt_medium")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_dialogpt_medium")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Constants
MAX_HISTORY_TURNS = 5
MAX_HISTORY_TOKENS = 512

def generate_response(prompt, conversation_history, max_length=50):
    # Construct the full prompt with conversation history
    full_prompt = construct_prompt(conversation_history, prompt)
    
    input_ids = tokenizer.encode(full_prompt + tokenizer.eos_token, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
    
    # Truncate if the input is too long
    if input_ids.shape[1] > MAX_HISTORY_TOKENS:
        input_ids = input_ids[:, -MAX_HISTORY_TOKENS:]
        attention_mask = attention_mask[:, -MAX_HISTORY_TOKENS:]
    
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[1] + max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response[len(full_prompt):].strip()

def construct_prompt(conversation_history, current_prompt):
    prompt_parts = [
        "The following is a conversation about movies, particularly '10 Things I Hate About You'. Respond in the style of the movie's characters:",
        *conversation_history[-MAX_HISTORY_TURNS:],
        f"Human: {current_prompt}",
        "AI:"
    ]
    return "\n".join(prompt_parts)

def chat():
    conversation_history = []
    print("Chatbot: Hello! Let's talk about '10 Things I Hate About You'. What would you like to know?")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye! It was nice chatting with you about '10 Things I Hate About You'.")
            break
        
        response = generate_response(user_input, conversation_history)
        print(f"Chatbot: {response}")
        
        # Update conversation history
        conversation_history.append(f"Human: {user_input}")
        conversation_history.append(f"AI: {response}")
        
        # Keep only the last MAX_HISTORY_TURNS turns
        if len(conversation_history) > MAX_HISTORY_TURNS * 2:
            conversation_history = conversation_history[-MAX_HISTORY_TURNS * 2:]

if __name__ == "__main__":
    chat()

Chatbot: Hello! Let's talk about '10 Things I Hate About You'. What would you like to know?


You:  what is the capital of united states?


Chatbot: Washington D.C.  "I hate all movies with a passion."  And you must be a very religious man. You must be very familiar with the phrase "The more you preach the more you hate."  Now what is that supposed to


You:  You are correct. 


Chatbot: "The more I preach the better I get."  I must have been preaching for years.  I'm not sure I even know what that phrase means.  Now, what is it?  Do you know what it means?  What do


KeyboardInterrupt: Interrupted by user