#############################################################################################
## **AAI-520**                                                                                 #
## **Final Project - Group 6**                                                                 #
## **Chatbot for Movie Info utilizing the Cornell Movie Dialogs Corpus**        

This Jupyter Notebook is used to generate the training dataset that will be used to the train the LLM ChatBot

#############################################################################################

In [51]:
#@title 1: Load the related Libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import codecs
import csv
import os
import pandas as pd

##2: Load Movie Data Corpus

In [52]:
is_on_colab = False

google_drive = "/content/drive/MyDrive/AAI-520/Final/Data"
local_dir = "./Dataset/Cornell_Movie_Dialog_Corpus/"

dataset_dir = local_dir
if (is_on_colab):
    dataset_dir = google_drive

train_dir = dataset_dir + "../Train/"
if not os.path.exists(train_dir):
    os.makedirs(train_dir)

In [53]:
#@title 2.1: Load line_data
#Load the Line file

def loadLines(filePath, fields):
    """
    Args:
        filePath (str): full path to the file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each line
    """
    lines = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

# Usage example
fields_to_extract = ['lineID', 'characterID', 'movieID', 'character', 'text']
file_path = dataset_dir + "/movie_lines.txt"
lines_data = loadLines(file_path, fields_to_extract)

In [54]:
#@title 2.2: Load the charachter_data
def loadCharacterMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the character metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each character
    """
    characters = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            characterObj = {}
            for i, field in enumerate(fields):
                characterObj[field] = values[i]

            characters[characterObj['characterID']] = characterObj

    return characters

# Usage example
character_fields_to_extract = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position']
character_file_path = dataset_dir + "/movie_characters_metadata.txt"
character_data = loadCharacterMetadata(character_file_path, character_fields_to_extract)

In [55]:
#@title 2.3: Load conversation_data
def loadConversations(filePath, fields):
    """
    Args:
        filePath (str): full path to the conversations file to load
        fields (set<str>): fields to extract
    Return:
        list<dict<str>>: a list of dictionaries representing conversations
    """

    conversations = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            conversationObj = {}
            for i, field in enumerate(fields):
                conversationObj[field] = values[i]


            conversations[conversationObj['movieID']] = conversationObj

    return conversations

# Usage example
conversation_fields_to_extract = ['characterID1', 'characterID2', 'movieID', 'utteranceIDs']
conversation_file_path = dataset_dir + "/movie_conversations.txt"
conversation_data = loadConversations(conversation_file_path, conversation_fields_to_extract)

In [56]:
#@title 2.4: Load the title_data
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                movieTitleObj[field] = values[i]

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes']
movie_title_file_path = dataset_dir + "/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)

In [57]:
#@title 2.5: Load the url_data
def loadRawScriptUrls(filePath, fields):
    """
    Args:
        filePath (str): full path to the raw script URLs file to load
        fields (list<str>): fields to extract
    Return:
        dict<str, dict<str>>: a dictionary with movieID as keys and dictionaries with field values as values
    """
    urls = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            loadRawScriptUrls = {}
            for i, field in enumerate(fields):
                loadRawScriptUrls[field] = values[i]

            urls[loadRawScriptUrls['movieID']] = loadRawScriptUrls

    return urls

# Usage example
raw_script_urls_fields_to_extract = ['movieID', 'scriptURL', 'url']
raw_script_urls_file_path = dataset_dir + "/raw_script_urls.txt"
script_urls_data = loadRawScriptUrls(raw_script_urls_file_path, raw_script_urls_fields_to_extract)


In [58]:
#@title 2.6: Verify the loading of the data (replace array name and fields as needed)
# Print some sample movie title data to check if it was loaded
#for movie_id, movie_info in movie_title_data.items():
    #print(f"Movie ID: {movie_info['movieID']}")
    #print(f"Movie Title: {movie_info['movieTitle']}")
    #print(f"Release Year: {movie_info['releaseYear']}")
    #print(f"IMDB Rating: {movie_info['imdbRating']}")
    #print(f"Number of Votes: {movie_info['numVotes']}")
    #print("\n")

In [59]:
#@title 2.7: Verify the loading of the url data
#for movie_id, url_info in script_urls_data.items():
    #print(f"Movie ID: {url_info['movieID']}")
    #print(f"Movie Title: {url_info['scriptURL']}")
    #print(f"url: {url_info['url']}")
    #print("\n")

In [60]:
#@title 3: Convert dictionaries/arrays to DataFrames
df_lines = pd.DataFrame.from_dict(lines_data, orient='index')
df_characters = pd.DataFrame.from_dict(character_data, orient='index')
df_conversations = pd.DataFrame.from_dict(conversation_data, orient='index')
df_movie_titles = pd.DataFrame.from_dict(movie_title_data, orient='index')
df_script_urls =  pd.DataFrame.from_dict(script_urls_data, orient='index')


In [61]:
#@title 3.1: Verify df (change names as needed)
#df_script_urls.head()

In [62]:
#@title 4: Merge DataFrames based on Movie ID
merged_data = df_lines.merge(df_characters, on='movieID', how='inner')
merged_data = merged_data.merge(df_conversations, on='movieID', how='inner')
merged_data = merged_data.merge(df_movie_titles, on='movieID', how='inner')
merged_data = merged_data.merge(df_script_urls, on='movieID', how='inner')
#drop duplicate fields
merged_data = merged_data.drop("movieTitle_y", axis=1)
merged_data = merged_data.drop("scriptURL", axis=1)

In [63]:
#@title 4.1: Verify merged_data df
merged_data.head(20)

Unnamed: 0,lineID,characterID_x,movieID,character,text,characterID_y,characterName,movieTitle_x,gender,position,characterID1,characterID2,utteranceIDs,releaseYear,imdbRating,numVotes,url
0,L1045,u0,m0,BIANCA,They do not!\n,u0,BIANCA,10 things i hate about you,f,4\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
1,L1045,u0,m0,BIANCA,They do not!\n,u1,BRUCE,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
2,L1045,u0,m0,BIANCA,They do not!\n,u2,CAMERON,10 things i hate about you,m,3\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
3,L1045,u0,m0,BIANCA,They do not!\n,u3,CHASTITY,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
4,L1045,u0,m0,BIANCA,They do not!\n,u4,JOEY,10 things i hate about you,m,6\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
5,L1045,u0,m0,BIANCA,They do not!\n,u5,KAT,10 things i hate about you,f,2\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
6,L1045,u0,m0,BIANCA,They do not!\n,u6,MANDELLA,10 things i hate about you,f,7\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
7,L1045,u0,m0,BIANCA,They do not!\n,u7,MICHAEL,10 things i hate about you,m,5\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
8,L1045,u0,m0,BIANCA,They do not!\n,u8,MISS PERKY,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...
9,L1045,u0,m0,BIANCA,They do not!\n,u9,PATRICK,10 things i hate about you,m,1\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,http://www.dailyscript.com/scripts/10Things.ht...


In [64]:
#@title 4.2: Verify the number of rows in the DataFrame
num_rows = merged_data.shape[0]

# Print the number of rows
print("Number of Rows:", num_rows)

Number of Rows: 4874548


In [65]:
#Building the Chatbot class: Include the building of recognizing the input words, this is done with NLP services. Then a query to match to the dataset and produce a response
import random
import json
import re
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class Chatbot:
    def __init__(self, data_frame, pre_trained_model_name):
        self.corpus = data_frame.to_dict(orient="records") #Convert data to Dictionaries

        #Load the tokenizer service
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)
        #Load the NLP model
        self.model = AutoModelForSeq2SeqLM.from_pretrained(pre_trained_model_name)
        #self.model = AutoModelForCausalLM.from_pretrained(pre_trained_model_name)

        #Can use to view the entire loaded data
        print("Loaded movie data:")
        for movie in self.corpus[:5]:
            print(movie)

        print(f"Total movies loaded: {len(self.corpus)}")

    def generate_response(self, query, conversation, subject): # How to respond
        #response = None

        if subject == "releaseYear":
            response = f"The movie {conversation['movieTitle_x']} was released in {conversation['releaseYear']}."
        elif subject == "imdbRating":
            response = f"The IMDb rating of {conversation['movieTitle_x']} is {conversation['imdbRating']}."
        elif subject == "url":
            response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."

        else:
            response = "I can only answer questions about release year or IMDB rating."

        return response

    def identify_subject(self, query):# Identify the subject in the input text
        movie_keywords = ["hi","hello", "movie", "film", "imdb", "rated", "top_rated", "top_rating", "top", "lowest_rated", "lowest_rating", "lowest", "rating", "ratings", "release", "year", "votes", "actors","actor", "actress", "genres", "gender", "info", "information", "url", "website"]
        subject = None
        movie = None
        tokens = query.lower().split()

        for keyword in movie_keywords:
            if keyword in tokens:
                if keyword in ["movie", "film", "movies"]:
                    subject = "movieTitle_x"
                    movie_idx = tokens.index(keyword)
                    if movie_idx < len(tokens) - 1:
                        movie = tokens[movie_idx + 1].rstrip('?')
                elif keyword in ["imdb", "rating", "ratings"]:
                    subject = "imdbRating"
                elif keyword in ["top_rated", "top_rating", "top"]:
                    subject = "timdbRating"
                    movie = None
                elif keyword in ["lowest_rated", "lowest_rating", "lowest"]:
                    subject = "limdbRating"
                    movie = None
                elif keyword in ["release", "year"] and subject is None:
                    subject = "releaseYear"
                elif keyword in ["release", "year"] and subject is 'movieTitle_x':
                    subject = "releaseYear"
                elif keyword in ["actors","actor", "actress"]:
                    subject = "url"
                elif keyword in ["hi","hello"]:
                    subject = "hi"


        print(f"Identified subject: {subject}, movie: {movie}")
        return subject, movie


    def answer_question(self, query): #The section where the system matches the query
        subject, movie = self.identify_subject(query) #taken from the def identify_subject

        if subject and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                response = self.generate_response(query, conversation, subject)
                return response
            else:
                return "I couldn't find any information about that movie."

        elif subject == "releaseYear":
            year = re.search(r'\b\d{4}\b', query)
            if year:
                year = year.group(0)
                matching_movies = set()  # Use a set to store unique movie titles
                for conv in self.corpus:
                    if conv["releaseYear"] == year:
                        matching_movies.add(conv["movieTitle_x"])

                if matching_movies:
                    response = f"The following movies were released in {year}: {', '.join(matching_movies)}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "timdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    highest_rated_movie = max(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    highest_rating = highest_rated_movie.get("imdbRating", 0)
                    movie_title = highest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The highest rated movie in {year} was '{movie_title}' with an IMDb rating of {highest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "limdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    lowest_rated_movie = min(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    lowest_rating = lowest_rated_movie.get("imdbRating", 0)
                    movie_title = lowest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The lowest rated movie in {year} was '{movie_title}' with an IMDb rating of {lowest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "url" and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                if "url" in conversation:
                    response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."
                else:
                    response = f"Sorry, I don't have that info for {conversation['movieTitle_x']}."
            else:
                response = "I couldn't find any information about that movie."


        elif subject == "hi":
            response = f"Hello, how can I answer a movie question for you?"
            return response

        else:
            return "I can only answer movie-related questions."
            return "Please make sure to use the words like 'movie', 'year', 'released', 'rating' etc. I'm not great at guessing :("



  elif keyword in ["release", "year"] and subject is 'movieTitle_x':


In [66]:
#Main Chatbot Function-This will show 5 rows of the dataset and then the input and output chatbot
#Note: The words: movie, year, release, imdb must be present in the input for the system to understand
def main():
    pre_trained_model_name = "facebook/bart-large-cnn"
    chatbot = Chatbot(merged_data, pre_trained_model_name)

    while True:
        query = input("You: ")
        response = chatbot.answer_question(query)
        print("Chatbot:", response)

# if __name__ == "__main__":
#     main()

# Generate the Train dataset

### Example text data: 

```Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Give three tips for staying healthy. ### Response: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 2. Exercise regularly to keep your body active and strong. 3. Get enough sleep and maintain a consistent sleep schedule.```

In [67]:
# Initialize an empty train list
questions_list = []

### When was movie released?

In [68]:
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
When was {} released?

### Response:
{} was released in {}
"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

In [None]:
## Todo: Add additional training questions

In [69]:
# Store the csv for training
df_from_list = pd.DataFrame({'text': questions_list})
df_from_list.to_csv(train_dir + "train.csv", index=True)