<a href="https://colab.research.google.com/github/agraves13/AAI520/blob/main/AAI520_Final_Group_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##########################################################################################
## **AAI-520**                                                                                 #
## **Final Project - Group 6**                                                                 #
## **Chatbot for Movie Info utilizing the Cornell Movie Dialogs Corpus**        
##########################################################################################

In [22]:
#@title 1: Load the related Libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import codecs
import csv
import os

In [23]:
#@title 2: Load Movie Data Corpus
#Load the Line file

def loadLines(filePath, fields):
    """
    Args:
        filePath (str): full path to the file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each line
    """
    lines = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

# Usage example
fields_to_extract = ['lineID', 'characterID', 'movieID', 'character', 'text']
file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_lines.txt"
lines_data = loadLines(file_path, fields_to_extract)

In [24]:
#Load the Charachter file
def loadCharacterMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the character metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each character
    """
    characters = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            characterObj = {}
            for i, field in enumerate(fields):
                characterObj[field] = values[i]

            characters[characterObj['characterID']] = characterObj

    return characters

# Usage example
character_fields_to_extract = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position']
character_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_characters_metadata.txt"
character_data = loadCharacterMetadata(character_file_path, character_fields_to_extract)

In [25]:
# Load Conversation file
def loadConversations(filePath, fields):
    """
    Args:
        filePath (str): full path to the conversations file to load
        fields (set<str>): fields to extract
    Return:
        list<dict<str>>: a list of dictionaries representing conversations
    """
    conversations = []

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            conversationObj = {}
            for i, field in enumerate(fields):
                conversationObj[field] = values[i]

            # Parse the list of line IDs into a list of integers
            conversationObj['lineIDs'] = [line_id.strip()[1:-1] for line_id in conversationObj['utteranceIDs'][1:-1].split(",")]

            conversations.append(conversationObj)

    return conversations

# Usage example
conversation_fields_to_extract = ['characterID', 'characterID', 'movieID', 'utteranceIDs']
conversation_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_conversations.txt"
conversation_data = loadConversations(conversation_file_path, conversation_fields_to_extract)

In [26]:
# Load the title file
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                movieTitleObj[field] = values[i]

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes']
movie_title_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)

In [27]:
# Load the URL file
def loadRawScriptUrls(filePath):
    """
    Args:
        filePath (str): full path to the raw script URLs file to load
    Return:
        list<str>: a list of URLs
    """
    urls = []

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            url = line.strip()  # Remove leading/trailing whitespace
            urls.append(url)

    return urls

# Usage example
raw_script_urls_file_path = "/content/drive/MyDrive/Colab Notebooks/AAI520-Data/raw_script_urls.txt"
script_urls = loadRawScriptUrls(raw_script_urls_file_path)

In [28]:
# Verify one of the files to see if loaded properly:
# Print some sample movie title data to check if it was loaded
for movie_id, movie_info in movie_title_data.items():
    print(f"Movie ID: {movie_info['movieID']}")
    print(f"Movie Title: {movie_info['movieTitle']}")
    print(f"Release Year: {movie_info['releaseYear']}")
    print(f"IMDB Rating: {movie_info['imdbRating']}")
    print(f"Number of Votes: {movie_info['numVotes']}")
    print("\n")

Movie ID: m0
Movie Title: 10 things i hate about you
Release Year: 1999
IMDB Rating: 6.90
Number of Votes: 62847


Movie ID: m1
Movie Title: 1492: conquest of paradise
Release Year: 1992
IMDB Rating: 6.20
Number of Votes: 10421


Movie ID: m2
Movie Title: 15 minutes
Release Year: 2001
IMDB Rating: 6.10
Number of Votes: 25854


Movie ID: m3
Movie Title: 2001: a space odyssey
Release Year: 1968
IMDB Rating: 8.40
Number of Votes: 163227


Movie ID: m4
Movie Title: 48 hrs.
Release Year: 1982
IMDB Rating: 6.90
Number of Votes: 22289


Movie ID: m5
Movie Title: the fifth element
Release Year: 1997
IMDB Rating: 7.50
Number of Votes: 133756


Movie ID: m6
Movie Title: 8mm
Release Year: 1999
IMDB Rating: 6.30
Number of Votes: 48212


Movie ID: m7
Movie Title: a nightmare on elm street 4: the dream master
Release Year: 1988
IMDB Rating: 5.20
Number of Votes: 13590


Movie ID: m8
Movie Title: a nightmare on elm street: the dream child
Release Year: 1989
IMDB Rating: 4.70
Number of Votes: 11092


