<a href="https://colab.research.google.com/github/agraves13/AAI520/blob/main/AAI520_Final_Group_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#############################################################################################
## **AAI-520**                                                                                 #
## **Final Project - Group 6**                                                                 #
## **Chatbot for Movie Info utilizing the Cornell Movie Dialogs Corpus**        
#############################################################################################

In [3]:
#@title 1: Load the related Libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import codecs
import csv
import os
import pandas as pd

##2: Load Movie Data Corpus

In [4]:
#@title 2.1: Load line_data
#Load the Line file

def loadLines(filePath, fields):
    """
    Args:
        filePath (str): full path to the file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each line
    """
    lines = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

# Usage example
fields_to_extract = ['lineID', 'characterID', 'movieID', 'character', 'text']
file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_lines.txt"
lines_data = loadLines(file_path, fields_to_extract)

In [5]:
#@title 2.2: Load the charachter_data
def loadCharacterMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the character metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each character
    """
    characters = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            characterObj = {}
            for i, field in enumerate(fields):
                characterObj[field] = values[i]

            characters[characterObj['characterID']] = characterObj

    return characters

# Usage example
character_fields_to_extract = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position']
character_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_characters_metadata.txt"
character_data = loadCharacterMetadata(character_file_path, character_fields_to_extract)

In [6]:
#@title 2.3: Load conversation_data
def loadConversations(filePath, fields):
    """
    Args:
        filePath (str): full path to the conversations file to load
        fields (set<str>): fields to extract
    Return:
        list<dict<str>>: a list of dictionaries representing conversations
    """

    conversations = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            conversationObj = {}
            for i, field in enumerate(fields):
                conversationObj[field] = values[i]


            conversations[conversationObj['movieID']] = conversationObj

    return conversations

# Usage example
conversation_fields_to_extract = ['characterID1', 'characterID2', 'movieID', 'utteranceIDs']
conversation_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_conversations.txt"
conversation_data = loadConversations(conversation_file_path, conversation_fields_to_extract)

In [7]:
#@title 2.4: Load the title_data
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                movieTitleObj[field] = values[i]

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes']
movie_title_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)

In [8]:
#@title 2.5: Load the url_data
def loadRawScriptUrls(filePath, fields):
    """
    Args:
        filePath (str): full path to the raw script URLs file to load
        fields (list<str>): fields to extract
    Return:
        dict<str, dict<str>>: a dictionary with movieID as keys and dictionaries with field values as values
    """
    urls = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            loadRawScriptUrls = {}
            for i, field in enumerate(fields):
                loadRawScriptUrls[field] = values[i]

            urls[loadRawScriptUrls['movieID']] = loadRawScriptUrls

    return urls

# Usage example
raw_script_urls_fields_to_extract = ['movieID', 'scriptURL', 'url']
raw_script_urls_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/raw_script_urls.txt"
script_urls_data = loadRawScriptUrls(raw_script_urls_file_path, raw_script_urls_fields_to_extract)


In [9]:
#@title 2.6: Verify the loading of the data (replace array name and fields as needed)
# Print some sample movie title data to check if it was loaded
for movie_id, movie_info in movie_title_data.items():
    print(f"Movie ID: {movie_info['movieID']}")
    print(f"Movie Title: {movie_info['movieTitle']}")
    print(f"Release Year: {movie_info['releaseYear']}")
    print(f"IMDB Rating: {movie_info['imdbRating']}")
    print(f"Number of Votes: {movie_info['numVotes']}")
    print("\n")

Movie ID: m0
Movie Title: 10 things i hate about you
Release Year: 1999
IMDB Rating: 6.90
Number of Votes: 62847


Movie ID: m1
Movie Title: 1492: conquest of paradise
Release Year: 1992
IMDB Rating: 6.20
Number of Votes: 10421


Movie ID: m2
Movie Title: 15 minutes
Release Year: 2001
IMDB Rating: 6.10
Number of Votes: 25854


Movie ID: m3
Movie Title: 2001: a space odyssey
Release Year: 1968
IMDB Rating: 8.40
Number of Votes: 163227


Movie ID: m4
Movie Title: 48 hrs.
Release Year: 1982
IMDB Rating: 6.90
Number of Votes: 22289


Movie ID: m5
Movie Title: the fifth element
Release Year: 1997
IMDB Rating: 7.50
Number of Votes: 133756


Movie ID: m6
Movie Title: 8mm
Release Year: 1999
IMDB Rating: 6.30
Number of Votes: 48212


Movie ID: m7
Movie Title: a nightmare on elm street 4: the dream master
Release Year: 1988
IMDB Rating: 5.20
Number of Votes: 13590


Movie ID: m8
Movie Title: a nightmare on elm street: the dream child
Release Year: 1989
IMDB Rating: 4.70
Number of Votes: 11092




In [10]:
#@title 2.7: Verify the loading of the url data
for movie_id, url_info in script_urls_data.items():
    print(f"Movie ID: {url_info['movieID']}")
    print(f"Movie Title: {url_info['scriptURL']}")
    print(f"url: {url_info['url']}")
    print("\n")

Movie ID: m0
Movie Title: 10 things i hate about you
url: http://www.dailyscript.com/scripts/10Things.html



Movie ID: m1
Movie Title: 1492: conquest of paradise
url: http://www.hundland.org/scripts/1492-ConquestOfParadise.txt



Movie ID: m2
Movie Title: 15 minutes
url: http://www.dailyscript.com/scripts/15minutes.html



Movie ID: m3
Movie Title: 2001: a space odyssey
url: http://www.scifiscripts.com/scripts/2001.txt



Movie ID: m4
Movie Title: 48 hrs.
url: http://www.awesomefilm.com/script/48hours.txt



Movie ID: m5
Movie Title: the fifth element
url: http://www.scifiscripts.com/scripts/5thelement.txt



Movie ID: m6
Movie Title: 8mm
url: http://www.dailyscript.com/scripts/eight-millimeter.html



Movie ID: m7
Movie Title: a nightmare on elm street 4: the dream master
url: http://www.hundland.org/scripts/A-Nightmare-on-Elm-Street-4.txt



Movie ID: m8
Movie Title: a nightmare on elm street: the dream child
url: http://www.hundland.org/scripts/A-Nightmare-on-Elm-Street-5.txt



Mo

In [11]:
#@title 3: Convert dictionaries/arrays to DataFrames
df_lines = pd.DataFrame.from_dict(lines_data, orient='index')
df_characters = pd.DataFrame.from_dict(character_data, orient='index')
df_conversations = pd.DataFrame.from_dict(conversation_data, orient='index')
df_movie_titles = pd.DataFrame.from_dict(movie_title_data, orient='index')
df_script_urls =  pd.DataFrame.from_dict(script_urls_data, orient='index')


In [12]:
#@title 3.1: Verify df (change names as needed)
df_script_urls.head()

Unnamed: 0,movieID,scriptURL,url
m0,m0,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
m1,m1,1492: conquest of paradise,http://www.hundland.org/scripts/1492-ConquestO...
m2,m2,15 minutes,http://www.dailyscript.com/scripts/15minutes.h...
m3,m3,2001: a space odyssey,http://www.scifiscripts.com/scripts/2001.txt\n
m4,m4,48 hrs.,http://www.awesomefilm.com/script/48hours.txt\n


In [13]:
#@title 4: Merge DataFrames based on Movie ID
merged_data = df_lines.merge(df_characters, on='movieID', how='inner')
merged_data = merged_data.merge(df_conversations, on='movieID', how='inner')
merged_data = merged_data.merge(df_movie_titles, on='movieID', how='inner')
merged_data = merged_data.merge(df_script_urls, on='movieID', how='inner') #right_index=True,

In [14]:
#@title 4.1: Verify merged_data df
merged_data.head(200)

Unnamed: 0,lineID,characterID_x,movieID,character,text,characterID_y,characterName,movieTitle_x,gender,position,characterID1,characterID2,utteranceIDs,movieTitle_y,releaseYear,imdbRating,numVotes,scriptURL,url
0,L1045,u0,m0,BIANCA,They do not!\n,u0,BIANCA,10 things i hate about you,f,4\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
1,L1045,u0,m0,BIANCA,They do not!\n,u1,BRUCE,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
2,L1045,u0,m0,BIANCA,They do not!\n,u2,CAMERON,10 things i hate about you,m,3\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
3,L1045,u0,m0,BIANCA,They do not!\n,u3,CHASTITY,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
4,L1045,u0,m0,BIANCA,They do not!\n,u4,JOEY,10 things i hate about you,m,6\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,L862,u0,m0,BIANCA,do you listen to this crap?\n,u3,CHASTITY,10 things i hate about you,?,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
196,L862,u0,m0,BIANCA,do you listen to this crap?\n,u4,JOEY,10 things i hate about you,m,6\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
197,L862,u0,m0,BIANCA,do you listen to this crap?\n,u5,KAT,10 things i hate about you,f,2\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...
198,L862,u0,m0,BIANCA,do you listen to this crap?\n,u6,MANDELLA,10 things i hate about you,f,7\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",10 things i hate about you,1999,6.90,62847,10 things i hate about you,http://www.dailyscript.com/scripts/10Things.ht...


In [15]:
#@title 4.2: Verify the number of rows in the DataFrame
num_rows = merged_data.shape[0]

# Print the number of rows
print("Number of Rows:", num_rows)

Number of Rows: 4874548
