#############################################################################################
## **AAI-520**                                                                                 #
## **Final Project - Group 6**                                                                 #
## **Retrieval-Based Chatbot for Movie Info utilizing the Cornell Movie Dialogs Corpus**        
#############################################################################################

In [1]:
#@title 1: Load the related Libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import codecs
import csv
import os
import pandas as pd

##2: Load Movie Data Corpus

In [2]:
#@title 2.1: Load line_data
#Load the Line file

def loadLines(filePath, fields):
    """
    Args:
        filePath (str): full path to the file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each line
    """
    lines = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

# Usage example
fields_to_extract = ['lineID', 'characterID', 'movieID', 'character', 'text']
file_path = "/content/drive/MyDrive/AAI-520/Final/Data/movie_lines.txt"
lines_data = loadLines(file_path, fields_to_extract)

In [3]:
#@title 2.2: Load the charachter_data
def loadCharacterMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the character metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each character
    """
    characters = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            characterObj = {}
            for i, field in enumerate(fields):
                characterObj[field] = values[i]
                value = values[i]
                # Remove commas from the movieTitle field
                if field == 'movieTitle':
                    value = value.replace(',', '')  # Remove commas
                characterObj[field] = value

            characters[characterObj['characterID']] = characterObj

    return characters

# Usage example
character_fields_to_extract = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position']
character_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/movie_characters_metadata.txt"
character_data = loadCharacterMetadata(character_file_path, character_fields_to_extract)

In [4]:
#@title 2.3: Load conversation_data
def loadConversations(filePath, fields):
    """
    Args:
        filePath (str): full path to the conversations file to load
        fields (set<str>): fields to extract
    Return:
        list<dict<str>>: a list of dictionaries representing conversations
    """

    conversations = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            conversationObj = {}
            for i, field in enumerate(fields):
                conversationObj[field] = values[i]


            conversations[conversationObj['movieID']] = conversationObj

    return conversations

# Usage example
conversation_fields_to_extract = ['characterID1', 'characterID2', 'movieID', 'utteranceIDs']
conversation_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/movie_conversations.txt"
conversation_data = loadConversations(conversation_file_path, conversation_fields_to_extract)

In [5]:
#@title 2.4: Load the title_data
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                movieTitleObj[field] = values[i]

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes', 'genres']
movie_title_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)

In [None]:
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                value = values[i]
                # Remove commas from the movieTitle field
                if field == 'movieTitle':
                    value = value.replace(',', '')  # Remove commas
                movieTitleObj[field] = value

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes', 'genres']
movie_title_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)


In [6]:
#@title 2.5: Load the url_data
def loadRawScriptUrls(filePath, fields):
    """
    Args:
        filePath (str): full path to the raw script URLs file to load
        fields (list<str>): fields to extract
    Return:
        dict<str, dict<str>>: a dictionary with movieID as keys and dictionaries with field values as values
    """
    urls = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            loadRawScriptUrls = {}
            for i, field in enumerate(fields):
                loadRawScriptUrls[field] = values[i]
                value = values[i]
                # Remove commas from the movieTitle field
                if field == 'url':
                    value = value.replace(',', '')  # Remove commas
                loadRawScriptUrls[field] = value
                # Remove commas from the movieTitle field
                if field == 'scriptURL':
                    value = value.replace(',', '')  # Remove commas
                loadRawScriptUrls[field] = value

            urls[loadRawScriptUrls['movieID']] = loadRawScriptUrls

    return urls

# Usage example
raw_script_urls_fields_to_extract = ['movieID', 'scriptURL', 'url']
raw_script_urls_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/raw_script_urls.txt"
script_urls_data = loadRawScriptUrls(raw_script_urls_file_path, raw_script_urls_fields_to_extract)


In [7]:
#@title 3: Convert dictionaries/arrays to DataFrames
df_lines = pd.DataFrame.from_dict(lines_data, orient='index')
df_characters = pd.DataFrame.from_dict(character_data, orient='index')
df_conversations = pd.DataFrame.from_dict(conversation_data, orient='index')
df_movie_titles = pd.DataFrame.from_dict(movie_title_data, orient='index')
df_script_urls =  pd.DataFrame.from_dict(script_urls_data, orient='index')


In [9]:
#@title 3.1: Split the genres and create separate columns
# Remove square brackets and single quotes and split the genres
df_movie_titles['genres'] = df_movie_titles['genres'].str.replace(r"\[|\]|'", '').str.split(', ')


max_genres = df_movie_titles['genres'].apply(len).max()  # Find the maximum number of genres in any row
for i in range(1, max_genres + 1):
    df_movie_titles[f'genre_{i}'] = df_movie_titles['genres'].apply(lambda x: x[i - 1] if len(x) >= i else None)

# Drop the original "genres" column
df_movie_titles = df_movie_titles.drop(columns=['genres'])


  df_movie_titles['genres'] = df_movie_titles['genres'].str.replace(r"\[|\]|'", '').str.split(', ')


In [10]:
#@title 3.2: Verify the new fields
df_movie_titles.head()

Unnamed: 0,movieID,movieTitle,releaseYear,imdbRating,numVotes,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11
m0,m0,10 things i hate about you,1999,6.9,62847,comedy,romance\n,,,,,,,,,
m1,m1,1492: conquest of paradise,1992,6.2,10421,adventure,biography,drama,history\n,,,,,,,
m2,m2,15 minutes,2001,6.1,25854,action,crime,drama,thriller\n,,,,,,,
m3,m3,2001: a space odyssey,1968,8.4,163227,adventure,mystery,sci-fi\n,,,,,,,,
m4,m4,48 hrs.,1982,6.9,22289,action,comedy,crime,drama,thriller\n,,,,,,


In [11]:
#@title 4: Merge DataFrames based on Movie ID
merged_data = df_lines.merge(df_characters, on='movieID', how='inner')
merged_data = merged_data.merge(df_conversations, on='movieID', how='inner')
merged_data = merged_data.merge(df_movie_titles, on='movieID', how='inner')
merged_data = merged_data.merge(df_script_urls, on='movieID', how='inner')
#drop duplicate fields
merged_data = merged_data.drop("movieTitle_y", axis=1)
merged_data = merged_data.drop("scriptURL", axis=1)

In [21]:
#@title 4.1: Verify merged_data df
merged_data.head()

Unnamed: 0,lineID,characterID_x,movieID,character,text,characterID_y,characterName,movieTitle_x,gender,position,...,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,url
0,L1045,u0,m0,BIANCA,They do not!\n,u0,BIANCA,10 things i hate about you,f,4\n,...,,,,,,,,,,http://www.dailyscript.com/scripts/10Things.ht...
1,L1045,u0,m0,BIANCA,They do not!\n,u1,BRUCE,10 things i hate about you,?,?\n,...,,,,,,,,,,http://www.dailyscript.com/scripts/10Things.ht...
2,L1045,u0,m0,BIANCA,They do not!\n,u2,CAMERON,10 things i hate about you,m,3\n,...,,,,,,,,,,http://www.dailyscript.com/scripts/10Things.ht...
3,L1045,u0,m0,BIANCA,They do not!\n,u3,CHASTITY,10 things i hate about you,?,?\n,...,,,,,,,,,,http://www.dailyscript.com/scripts/10Things.ht...
4,L1045,u0,m0,BIANCA,They do not!\n,u4,JOEY,10 things i hate about you,m,6\n,...,,,,,,,,,,http://www.dailyscript.com/scripts/10Things.ht...


In [13]:
#@title 4.2: Verify the number of rows in the DataFrame
num_rows = merged_data.shape[0]

# Print the number of rows
print("Number of Rows:", num_rows)

Number of Rows: 4874548


In [14]:
#@title 5: Prepare the work for the Chatbot
#Load transformer for chatbot operations
!pip3 install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
Col

In [19]:
#@title 5.1: Building the Chatbot class: Include the building of recognizing the input words, this is done with NLP services. Then a query to match to the dataset and produce a response
#Load related libraries
import random
import json
import re
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Build the Retrieval-Based chatbot. Define the If and Else for query of data
class Chatbot:
    def __init__(self, data_frame, pre_trained_model_name):
        self.corpus = data_frame.to_dict(orient="records") #Convert data to Dictionaries

        #Load the tokenizer service
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)
        #Load the NLP model
        self.model = AutoModelForSeq2SeqLM.from_pretrained(pre_trained_model_name)
        #self.model = AutoModelForCausalLM.from_pretrained(pre_trained_model_name)

        #Can use to view the entire loaded data
        print("Loaded movie data:")
        for movie in self.corpus[:5]:
            print(movie)

        print(f"Total movies loaded: {len(self.corpus)}")

    def generate_response(self, query, conversation, subject): # How to respond
        #response = None

        if subject == "releaseYear":
            response = f"The movie {conversation['movieTitle_x']} was released in {conversation['releaseYear']}."
        elif subject == "imdbRating":
            response = f"The IMDb rating of {conversation['movieTitle_x']} is {conversation['imdbRating']}."
        elif subject == "genre_1":
            response = f"The genres of {conversation['movieTitle_x']} is {conversation['genre_1']}."
        elif subject == "url":
            response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."

        else:
            response = "I can only answer questions about release year or IMDB rating."

        return response

    def identify_subject(self, query):# Identify the subject in the input text
        movie_keywords = ["hi","hello", "help", "genre", "genres", "movie", "film", "imdb", "rated", "top_rated", "top_rating", "top", "lowest_rated", "lowest_rating", "lowest", "rating", "ratings", "release", "year", "votes", "actors","actor", "actress", "genres", "gender", "info", "information", "url", "website"]
        subject = None
        movie = None
        tokens = query.lower().split()

        for keyword in movie_keywords:
            if keyword in tokens:
                if keyword in ["genres", "genre"]:
                    subject = "genre_1"
                elif keyword in ["movie", "film", "movies"]:
                    subject = "movieTitle_x"
                    movie_idx = tokens.index(keyword)
                    if movie_idx < len(tokens) - 1:
                        movie = tokens[movie_idx + 1].rstrip('?')
                elif keyword in ["imdb", "rating", "ratings"]:
                    subject = "imdbRating"
                elif keyword in ["top_rated", "top_rating", "top"]:
                    subject = "timdbRating"
                    movie = None
                elif keyword in ["lowest_rated", "lowest_rating", "lowest"]:
                    subject = "limdbRating"
                    movie = None
                elif keyword in ["release", "year"] and subject is None:
                    subject = "releaseYear"
                elif keyword in ["release", "year"] and subject is 'movieTitle_x':
                    subject = "releaseYear"
                elif keyword in ["actors","actor", "actress", "script", "information"]:
                    subject = "url"
                elif keyword in ["hi","hello"]:
                    subject = "hi"
                elif keyword in ["help"]:
                    subject = "help"

        #print(f"Identified subject: {subject}, movie: {movie}")
        print('Please make sure to use the key words to your question, such as, movie, year, release, script, etc. ' )
        return subject, movie


    def answer_question(self, query): #The section where the system matches the query
        subject, movie = self.identify_subject(query) #taken from the def identify_subject

        if subject and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                response = self.generate_response(query, conversation, subject)
                return response
            else:
                return "I couldn't find any information about that movie."

        elif subject == "releaseYear":
            year = re.search(r'\b\d{4}\b', query)
            if year:
                year = year.group(0)
                matching_movies = set()  # Use a set to store unique movie titles
                for conv in self.corpus:
                    if conv["releaseYear"] == year:
                        matching_movies.add(conv["movieTitle_x"])

                if matching_movies:
                    response = f"The following movies were released in {year}: {', '.join(matching_movies)}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "timdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    highest_rated_movie = max(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    highest_rating = highest_rated_movie.get("imdbRating", 0)
                    movie_title = highest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The highest rated movie in {year} was '{movie_title}' with an IMDb rating of {highest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "limdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    lowest_rated_movie = min(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    lowest_rating = lowest_rated_movie.get("imdbRating", 0)
                    movie_title = lowest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The lowest rated movie in {year} was '{movie_title}' with an IMDb rating of {lowest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "url" and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

        elif subject == "genre_1" and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                if "url" in conversation:
                    response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."
                else:
                    response = f"Sorry, I don't have that info for {conversation['movieTitle_x']}."
            else:
                response = "I couldn't find any information about that movie."

        elif subject == "hi":
            response = f"Hello, how can I answer a movie question for you?"
            return response

        elif subject == "help":
            response = f"I can answer question regarding a movie name, the year it was released, the rating score, the character names, and some of their lines. I do not have the actor names, but I can provide a website with more info on the movie."
            return response

        else:
            return "I can only answer movie-related questions."
            return "Please make sure to use the words like 'movie', 'year', 'released', 'rating' etc. I'm not great at guessing :("




  elif keyword in ["release", "year"] and subject is 'movieTitle_x':


In [20]:
#@title 6: Main Chatbot Function-This will show 5 rows of the dataset and then the input and output chatbot
#Note: The words: movie, year, release, imdb must be present in the input for the system to understand
def main():
    pre_trained_model_name = "facebook/bart-large-cnn"
    chatbot = Chatbot(merged_data, pre_trained_model_name)

    while True:
        query = input("You: ")
        response = chatbot.answer_question(query)
        print("Chatbot:", response)

if __name__ == "__main__":
    main()

Loaded movie data:
{'lineID': 'L1045', 'characterID_x': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': 'They do not!\n', 'characterID_y': 'u0', 'characterName': 'BIANCA', 'movieTitle_x': '10 things i hate about you', 'gender': 'f', 'position': '4\n', 'characterID1': 'u10', 'characterID2': 'u11', 'utteranceIDs': "['L929', 'L930', 'L931', 'L932', 'L933']\n", 'releaseYear': '1999', 'imdbRating': '6.90', 'numVotes': '62847', 'genre_1': 'comedy', 'genre_2': 'romance\n', 'genre_3': None, 'genre_4': None, 'genre_5': None, 'genre_6': None, 'genre_7': None, 'genre_8': None, 'genre_9': None, 'genre_10': None, 'genre_11': None, 'url': 'http://www.dailyscript.com/scripts/10Things.html\n'}
{'lineID': 'L1045', 'characterID_x': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': 'They do not!\n', 'characterID_y': 'u1', 'characterName': 'BRUCE', 'movieTitle_x': '10 things i hate about you', 'gender': '?', 'position': '?\n', 'characterID1': 'u10', 'characterID2': 'u11', 'utteranceIDs': "['L929

KeyboardInterrupt: ignored

#Testing Section

In [None]:
import json
import re

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

class Chatbot:
    def __init__(self, data_frame, pre_trained_model_name):
        self.corpus = data_frame.to_dict(orient="records")  # Convert data to Dictionaries

        # Load the tokenizer service
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)
        # Load the NLP model
        self.model = AutoModelForSeq2SeqLM.from_pretrained(pre_trained_model_name)

        # Can use to view the entire loaded data
        print("Loaded movie data:")
        for movie in self.corpus[:5]:
            print(movie)

        print(f"Total movies loaded: {len(self.corpus)}")

    # Your other methods...
    def generate_response(self, query, conversation, subject): # How to respond
        #response = None

        if subject == "releaseYear":
            response = f"The movie {conversation['movieTitle_x']} was released in {conversation['releaseYear']}."
        elif subject == "imdbRating":
            response = f"The IMDb rating of {conversation['movieTitle_x']} is {conversation['imdbRating']}."
        elif subject == "genre_1":
            response = f"The genres of {conversation['movieTitle_x']} is {conversation['genre_1']}."
        elif subject == "url":
            response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."

        else:
            response = "I can only answer questions about release year or IMDB rating."

        return response

    def identify_subject(self, query):# Identify the subject in the input text
        movie_keywords = ["hi","hello", "help", "genre", "genres", "movie", "film", "imdb", "rated", "top_rated", "top_rating", "top", "lowest_rated", "lowest_rating", "lowest", "rating", "ratings", "release", "year", "votes", "actors","actor", "actress", "genres", "gender", "info", "information", "url", "website"]
        subject = None
        movie = None
        tokens = query.lower().split()

        for keyword in movie_keywords:
            if keyword in tokens:
                if keyword in ["genres", "genre"]:
                    subject = "genre_1"
                elif keyword in ["movie", "film", "movies"]:
                    subject = "movieTitle_x"
                    movie_idx = tokens.index(keyword)
                    if movie_idx < len(tokens) - 1:
                        movie = tokens[movie_idx + 1].rstrip('?')
                elif keyword in ["imdb", "rating", "ratings"]:
                    subject = "imdbRating"
                elif keyword in ["top_rated", "top_rating", "top"]:
                    subject = "timdbRating"
                    movie = None
                elif keyword in ["lowest_rated", "lowest_rating", "lowest"]:
                    subject = "limdbRating"
                    movie = None
                elif keyword in ["release", "year"] and subject is None:
                    subject = "releaseYear"
                elif keyword in ["release", "year"] and subject is 'movieTitle_x':
                    subject = "releaseYear"
                elif keyword in ["actors","actor", "actress"]:
                    subject = "url"
                elif keyword in ["hi","hello"]:
                    subject = "hi"
                elif keyword in ["help"]:
                    subject = "help"

        print(f"Identified subject: {subject}, movie: {movie}")
        return subject, movie


    def answer_question(self, query): #The section where the system matches the query
        subject, movie = self.identify_subject(query) #taken from the def identify_subject

        if subject and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                response = self.generate_response(query, conversation, subject)
                return response
            else:
                return "I couldn't find any information about that movie."

        elif subject == "releaseYear":
            year = re.search(r'\b\d{4}\b', query)
            if year:
                year = year.group(0)
                matching_movies = set()  # Use a set to store unique movie titles
                for conv in self.corpus:
                    if conv["releaseYear"] == year:
                        matching_movies.add(conv["movieTitle_x"])

                if matching_movies:
                    response = f"The following movies were released in {year}: {', '.join(matching_movies)}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "timdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    highest_rated_movie = max(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    highest_rating = highest_rated_movie.get("imdbRating", 0)
                    movie_title = highest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The highest rated movie in {year} was '{movie_title}' with an IMDb rating of {highest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "limdbRating":
            year_match = re.search(r'\b\d{4}\b', query)
            if year_match:
                year = year_match.group(0)
                movies_for_year = [conv for conv in self.corpus if isinstance(conv, dict) and conv.get("releaseYear") == year]
                if movies_for_year:
                    lowest_rated_movie = min(movies_for_year, key=lambda x: float(x.get("imdbRating", 0)))
                    lowest_rating = lowest_rated_movie.get("imdbRating", 0)
                    movie_title = lowest_rated_movie.get("movieTitle_x", "Unknown Movie")
                    response = f"The lowest rated movie in {year} was '{movie_title}' with an IMDb rating of {lowest_rating}."
                else:
                    response = f"No movies were released in {year}."
                return response

        elif subject == "url" and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

        elif subject == "genre_1" and movie:
            matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and movie.lower() in conv.get("movieTitle_x", "").lower()]

            if matching_conversations:
                conversation = matching_conversations[0]
                if "url" in conversation:
                    response = f"I don't have actor info but here is the URL for {conversation['movieTitle_x']}: {conversation['url']} for more info."
                else:
                    response = f"Sorry, I don't have that info for {conversation['movieTitle_x']}."
            else:
                response = "I couldn't find any information about that movie."

        elif subject == "hi":
            response = f"Hello, how can I answer a movie question for you?"
            return response

        elif subject == "help":
            response = f"I can answer question regarding a movie name, the year it was released, the rating score, the character names, and some of their lines. I do not have the actor names, but I can provide a website with more info on the movie."
            return response

        else:
            return "I can only answer movie-related questions."
            return "Please make sure to use the words like 'movie', 'year', 'released', 'rating' etc. I'm not great at guessing :("

    def fill_placeholders(self, text, conversation):
        # Replace placeholders with values from the conversation
        for key, value in conversation.items():
            text = text.replace(f"<{key}>", str(value))
        return text

    def generate_dataset(self, input_file, output_file):
        # Open the input and output files
        with open(input_file, "r") as input_file, open(output_file, "w") as output_file:
            for line in input_file:
                query_template = line.strip()
                conversation = {}  # Initialize conversation as an empty dictionary

                # Identify the subject and movie from the query
                subject, movie = self.identify_subject(query_template)

                if subject and movie:
                    matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and
                                              movie.lower() in conv.get("movieTitle_x", "").lower()]

                    if matching_conversations:
                        conversation = matching_conversations[0]
                        response = self.generate_response(query_template, conversation, subject)
                        print(response[:5])  # Print the first 5 characters of the response
                    else:
                        response = "I couldn't find any information about that movie."
                else:
                    response = "I couldn't find any information about that movie."

                # Replace placeholders with actual values
                input_text = self.fill_placeholders(query_template, conversation)
                target_text = self.fill_placeholders(response, conversation)

                # Write the formatted entry to the output file
                output_file.write(f"Input Text: {input_text}\n")
                output_file.write(f"Target Text: {target_text}\n\n")






def main():
    pre_trained_model_name = "facebook/bart-large-cnn"
    chatbot = Chatbot(merged_data, pre_trained_model_name)

    # Specify your input questions file and output dataset file
    input_file = "/content/drive/MyDrive/AAI-520/Final/Data/questions.txt"
    output_file = "/content/drive/MyDrive/AAI-520/Final/Data/output_dataset.json"

    # Generate the dataset
    chatbot.generate_dataset(input_file, output_file)

if __name__ == "__main__":
    main()



  elif keyword in ["release", "year"] and subject is 'movieTitle_x':


Loaded movie data:
{'lineID': 'L1045', 'characterID_x': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': 'They do not!\n', 'characterID_y': 'u0', 'characterName': 'BIANCA', 'movieTitle_x': '10 things i hate about you', 'gender': 'f', 'position': '4\n', 'characterID1': 'u10', 'characterID2': 'u11', 'utteranceIDs': "['L929', 'L930', 'L931', 'L932', 'L933']\n", 'releaseYear': '1999', 'imdbRating': '6.90', 'numVotes': '62847', 'genre_1': 'comedy', 'genre_2': 'romance\n', 'genre_3': None, 'genre_4': None, 'genre_5': None, 'genre_6': None, 'genre_7': None, 'genre_8': None, 'genre_9': None, 'genre_10': None, 'genre_11': None, 'url': 'http://www.dailyscript.com/scripts/10Things.html\n'}
{'lineID': 'L1045', 'characterID_x': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': 'They do not!\n', 'characterID_y': 'u1', 'characterName': 'BRUCE', 'movieTitle_x': '10 things i hate about you', 'gender': '?', 'position': '?\n', 'characterID1': 'u10', 'characterID2': 'u11', 'utteranceIDs': "['L929

In [None]:
import json

# Specify the path to the output file
output_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/output_dataset.json"

# Read and print the contents of the output file
with open(output_file_path, "r") as output_file:
    contents = output_file.read()
    print(contents)


Input Text: what year was the movie <movieTitle_x> released
Target Text: I couldn't find any information about that movie.

Input Text: what was the rating on the movie <movieTitle_x>
Target Text: I couldn't find any information about that movie.

Input Text: what genres is the movie <movieTitle_x>
Target Text: I couldn't find any information about that movie.

Input Text: what movies were released in the year <releaseYear>
Target Text: I couldn't find any information about that movie.

Input Text: what were the top rated movies for the year <releaseYear>
Target Text: I couldn't find any information about that movie.

Input Text: what were the lowest rated movies for the year <releaseYear>
Target Text: I couldn't find any information about that movie.




In [None]:
def generate_dataset(self, input_file, output_file):
        # Open the input and output files
        with open(input_file, "r") as input_file, open(output_file, "w") as output_file:
            for line in input_file:
                query_template = line.strip()
                conversation = {}  # Initialize conversation as an empty dictionary

                # Identify the subject and movie from the query
                subject, movie = self.identify_subject(query_template)

                if subject and movie:
                    matching_conversations = [conv for conv in self.corpus if isinstance(conv, dict) and
                                              movie.lower() in conv.get("movieTitle_x", "").lower()]

                    if matching_conversations:
                        conversation = matching_conversations[0]
                        response = self.generate_response(query_template, conversation, subject)
                        print(response[:5])  # Print the first 5 characters of the response
                    else:
                        response = "I couldn't find any information about that movie."
                else:
                    response = "I couldn't find any information about that movie."

                # Replace placeholders with actual values
                input_text = self.fill_placeholders(query_template, conversation)
                target_text = self.fill_placeholders(response, conversation)

                # Write the formatted entry to the output file
                output_file.write(f"Input Text: {input_text}\n")
                output_file.write(f"Target Text: {target_text}\n\n")

        def fill_placeholders(self, text, conversation):
            # Replace placeholders with values from the conversation
            for key, value in conversation.items():
                text = text.replace(f"<{key}>", str(value))
            return text

In [None]:
generate_dataset

<function __main__.generate_dataset(self, input_file, output_file)>

In [None]:
#New test

In [None]:

# Specify the full path to the input text file
input_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/ImdbRt.txt"

# Load the input text file
with open(input_file_path, "r") as input_file:
    input_lines = input_file.read().splitlines()

# Initialize an empty list to store the output data
output_data = []

# Loop through the rows of the DataFrame
for index, row in df_movie_titles.iterrows():
    # Replace placeholders with actual values and create separate JSON objects
    input_text = {"input_text": input_lines[0].replace("<movieTitle>", row['movieTitle'].capitalize())}
    target_text = {"target_text": input_lines[1].replace("<movieTitle>", row['movieTitle'].capitalize()).replace("<imdbRating>", str(row['imdbRating']))}

    #input_text = {"input_text": input_lines[0].replace("<movieTitle>", row['movieTitle'].capitalize()).replace("<characterName>", str(row['characterName']))}
    #target_text = {"target_text": input_lines[1].replace("<characterName>", row['characterName'].capitalize()).replace("<gender>", str(row['gender']))}

    # Append the input_text and target_text to the output data list
    output_data.append(input_text)
    output_data.append(target_text)

# Specify the full path to the output JSON file
output_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/ImdbRt.json"

# Write the output data to the JSON file
with open(output_file_path, "w") as output_file:
    json.dump(output_data, output_file, indent=4)

print("Output JSON file has been created.")


Output JSON file has been created.


In [None]:
#Paul section

In [None]:
# Initialize an empty train list
questions_list = []

In [None]:
# This is for: When was the movie {} released?
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
When was the movie {} released?

### Response:
The movie {} was released in {}
"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

In [None]:
# What year was the movie {} released in?
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
What year was the movie {} released in?

### Response:
The movie {} was released in the year {}
"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

In [None]:
# what is the rating on the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
what is the rating on the movie {}?

### Response:
The rating on {} is {}
"""

for title, imdb in zip(df_movie_titles["movieTitle"], df_movie_titles["imdbRating"]):
    formatted_text = template.format(title, title, imdb)
    questions_list.append(formatted_text)

In [None]:
# what is the genres on the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
what is the genres on the movie {}?

### Response:
The genres on {} is {}
"""

for title, genres in zip(df_movie_titles["movieTitle"], df_movie_titles["genre_1"]):
    formatted_text = template.format(title, title, genres)
    questions_list.append(formatted_text)

# Second one if exist
for title, genres in zip(df_movie_titles["movieTitle"], df_movie_titles["genre_2"]):
    formatted_text = template.format(title, title, genres)
    questions_list.append(formatted_text)

In [None]:
# how many votes did the movie {} receive?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
how many votes did the movie {} receive?

### Response:
{} received {} votes
"""

for title, votes in zip(df_movie_titles["movieTitle"], df_movie_titles["numVotes"]):
    formatted_text = template.format(title, title, votes)
    questions_list.append(formatted_text)

In [None]:
# who are the characters in the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
who are the characters in the movie {}?

### Response:
The character in {} is {}
"""

for title, character in zip(df_characters["movieTitle"], df_characters["characterName"]):
    formatted_text = template.format(title, title, character)
    questions_list.append(formatted_text)

In [None]:
# what gender is the character {} in the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
what gender is the character {} in the movie {}?

### Response:
{}'s gender is {}
"""

for character, title, gender in zip(df_characters["characterName"], df_characters["movieTitle"], df_characters["gender"]):
    formatted_text = template.format(character, title, character, gender)
    questions_list.append(formatted_text)

In [None]:
# do you have the full script for the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
do you have the full script for the movie {}?

### Response:
sure you can find it here {}
"""

for title, url in zip(df_script_urls["scriptURL"], df_script_urls["url"]):
    formatted_text = template.format(title, url)
    questions_list.append(formatted_text)

In [None]:
# what are some line that {} said in the movie {}?

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.
### Instruction:
what are some line that {} said in the movie {}?

### Response:
{} said {}
"""

for character, title, text in zip(df_lines["character"], df_movie_titles["movieTitle"], df_lines["text"]):
    formatted_text = template.format(character, title, character, text)
    questions_list.append(formatted_text)

In [None]:
# Store the csv for training
output_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/train_dir.json"
df_from_list = pd.DataFrame({'text': questions_list})
df_from_list.to_csv(output_file_path + "train.csv", index=True)

#Pre Trained test - Adam

In [None]:
import torch

print(torch.__version__)


2.0.1+cu118


In [None]:
!pip install datasets




In [None]:
#Creates a training file with only a header text, response

# Initialize an empty train list
questions_list = []

# Add headers to the list
questions_list.append("text;response")

# This is for: When was the movie {} released?
template = """When was the movie {} released?; The movie {} was released in {}"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

# What year was the movie {} released in?
template = """What year was the movie {} released in?; {} was released in the year {}"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

# what is the rating on the movie {}?

template = """
what is the rating on the movie {}?; The rating on {} is {}
"""

for title, imdb in zip(df_movie_titles["movieTitle"], df_movie_titles["imdbRating"]):
    formatted_text = template.format(title, title, imdb)
    questions_list.append(formatted_text)

# what is the genres on the movie {}?

template = """
what is the genres on the movie {}?; The genres on {} is {}
"""

for title, genres in zip(df_movie_titles["movieTitle"], df_movie_titles["genre_1"]):
    formatted_text = template.format(title, title, genres)
    questions_list.append(formatted_text)

# Second one if exist
for title, genres in zip(df_movie_titles["movieTitle"], df_movie_titles["genre_2"]):
    formatted_text = template.format(title, title, genres)
    questions_list.append(formatted_text)

# how many votes did the movie {} receive?

template = """
how many votes did the movie {} receive?; {} received {} votes
"""

for title, votes in zip(df_movie_titles["movieTitle"], df_movie_titles["numVotes"]):
    formatted_text = template.format(title, title, votes)
    questions_list.append(formatted_text)

# who are the characters in the movie {}?

template = """
who are the characters in the movie {}?; The character in {} is {}
"""

for title, character in zip(df_characters["movieTitle"], df_characters["characterName"]):
    formatted_text = template.format(title, title, character)
    questions_list.append(formatted_text)

# what gender is the character {} in the movie {}?

template = """
what gender is the character {} in the movie {}?; {}'s gender is {}
"""

for character, title, gender in zip(df_characters["characterName"], df_characters["movieTitle"], df_characters["gender"]):
    formatted_text = template.format(character, title, character, gender)
    questions_list.append(formatted_text)

# do you have the full script for the movie {}?

template = """
do you have the full script for the movie {}?; sure you can find it here {}
"""

for title, url in zip(df_script_urls["scriptURL"], df_script_urls["url"]):
    formatted_text = template.format(title, url)
    questions_list.append(formatted_text)

# what are some line that {} said in the movie {}?

template = """
what are some line that {} said in the movie {}?; {} said {}
"""

for character, title, text in zip(df_lines["character"], df_movie_titles["movieTitle"], df_lines["text"]):
    formatted_text = template.format(character, title, character, text)
    questions_list.append(formatted_text)

# Store the csv for training
output_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/train_dir2.jsontrain.csv"
with open(output_file_path, 'w') as file:
    file.write('\n'.join(questions_list))

In [None]:
!pip install accelerate>=0.20.1


In [None]:
def preprocess_data(example):
    text = example["text"]
    response = example["response"]

    # Ensure that both text and response are non-empty strings
    if isinstance(text, str) and isinstance(response, str) and text.strip() and response.strip():
        return {
            "input_ids": tokenizer(text, return_tensors="pt").input_ids,
            "attention_mask": tokenizer(text, return_tensors="pt").attention_mask,
            "labels": tokenizer(response, return_tensors="pt").input_ids,
        }
    else:
        # Return None for entries with missing or empty text/response
        return None

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from datasets import Dataset

# Specify the path to your CSV file with semicolon delimiter
input_file_path = "/content/drive/MyDrive/AAI-520/Final/Data/train_dir2.jsontrain.csv"

# Load your training data from the CSV file using Pandas
train_data = pd.read_csv(input_file_path, delimiter=';')

# Initialize the BART model and tokenizer
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Define a function to preprocess and tokenize the data
def preprocess_data(example):
    text = example["text"]
    response = example["response"]

    # Ensure that both text and response are non-empty strings
    if isinstance(text, str) and isinstance(response, str) and text.strip() and response.strip():
        return tokenizer(
            text,
            response,
            padding="max_length",
            max_length=256,
            return_tensors="pt",
            truncation=True,
        )
    else:
        # Return None for entries with missing or empty text/response
        return None

# Apply preprocessing and filter out None values
tokenized_data = [preprocess_data(example) for _, example in train_data.iterrows()]
tokenized_data = [data for data in tokenized_data if data is not None]

# Convert the list of tokenized data into a dictionary
train_dataset_dict = {
    "input_ids": [data["input_ids"] for data in tokenized_data],
    "attention_mask": [data["attention_mask"] for data in tokenized_data],
    "labels": [data["input_ids"] for data in tokenized_data],  # Use input_ids as labels for Seq2Seq
}

# Create Dataset
train_dataset = Dataset.from_dict(train_dataset_dict)

# Create DataLoader
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
train_data_loader = DataLoader(train_dataset, collate_fn=data_collator, batch_size=8)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./fine-tuned_model',
    per_device_train_batch_size=8,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3,
)

# Create a trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Custom training loop
for epoch in range(training_args.num_train_epochs):
    for batch in train_data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target_ids = batch['labels']

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=target_ids, return_dict=True)

        # Check if 'loss' and 'logits' keys are present in outputs
        if 'loss' in outputs:
            loss = outputs.loss  # Get the loss value
            # Use the loss for backpropagation and optimization
            loss.backward()

            # Backpropagation and optimization
            trainer.optimizer.step()
            trainer.lr_scheduler.step()
            trainer.optimizer.zero_grad()
        else:
            loss = None  # Handle the case where loss is not present in outputs

        # Get logits if available
        logits = outputs.logits if 'logits' in outputs else None

# Save the fine-tuned model
trainer.save_model()
