#############################################################################################
## **AAI-520**                                                                                 #
## **Final Project - Group 6**                                                                 #
## **Chatbot for Movie Info utilizing the Cornell Movie Dialogs Corpus**        

This Jupyter Notebook is used to generate the training dataset that will be used to the train the LLM ChatBot

#############################################################################################

In [73]:
#@title 1: Load the related Libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import codecs
import csv
import os
import pandas as pd
import ast

##2: Load Movie Data Corpus

In [74]:
is_on_colab = False

google_drive = "/content/drive/MyDrive/AAI-520/Final/Data"
local_dir = "./Dataset/Cornell_Movie_Dialog_Corpus/"

dataset_dir = local_dir
if (is_on_colab):
    dataset_dir = google_drive

train_dir = dataset_dir + "../Train/"
if not os.path.exists(train_dir):
    os.makedirs(train_dir)

In [75]:
#@title 2.1: Load line_data
#Load the Line file

def loadLines(filePath, fields):
    """
    Args:
        filePath (str): full path to the file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each line
    """
    lines = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

# Usage example
fields_to_extract = ['lineID', 'characterID', 'movieID', 'character', 'text']
file_path = dataset_dir + "/movie_lines.txt"
lines_data = loadLines(file_path, fields_to_extract)

In [76]:
#@title 2.2: Load the charachter_data
def loadCharacterMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the character metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each character
    """
    characters = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            characterObj = {}
            for i, field in enumerate(fields):
                characterObj[field] = values[i]

            characters[characterObj['characterID']] = characterObj

    return characters

# Usage example
character_fields_to_extract = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position']
character_file_path = dataset_dir + "/movie_characters_metadata.txt"
character_data = loadCharacterMetadata(character_file_path, character_fields_to_extract)

In [77]:
#@title 2.3: Load conversation_data
def loadConversations(filePath, fields):
    """
    Args:
        filePath (str): full path to the conversations file to load
        fields (set<str>): fields to extract
    Return:
        list<dict<str>>: a list of dictionaries representing conversations
    """

    conversations = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            conversationObj = {}
            for i, field in enumerate(fields):
                conversationObj[field] = values[i]


            conversations[conversationObj['movieID']] = conversationObj

    return conversations

# Usage example
conversation_fields_to_extract = ['characterID1', 'characterID2', 'movieID', 'utteranceIDs']
conversation_file_path = dataset_dir + "/movie_conversations.txt"
conversation_data = loadConversations(conversation_file_path, conversation_fields_to_extract)

In [78]:
#@title 2.4: Load the title_data
def loadMovieTitlesMetadata(filePath, fields):
    """
    Args:
        filePath (str): full path to the movie titles metadata file to load
        fields (set<str>): fields to extract
    Return:
        dict<dict<str>>: the extracted fields for each movie title
    """
    movie_titles = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            movieTitleObj = {}
            for i, field in enumerate(fields):
                movieTitleObj[field] = values[i]

            movie_titles[movieTitleObj['movieID']] = movieTitleObj

    return movie_titles

# Usage example
movie_title_fields_to_extract = ['movieID', 'movieTitle', 'releaseYear', 'imdbRating', 'numVotes', 'genres']
movie_title_file_path = dataset_dir + "/movie_titles_metadata.txt"
movie_title_data = loadMovieTitlesMetadata(movie_title_file_path, movie_title_fields_to_extract)

In [79]:
#@title 2.5: Load the url_data
def loadRawScriptUrls(filePath, fields):
    """
    Args:
        filePath (str): full path to the raw script URLs file to load
        fields (list<str>): fields to extract
    Return:
        dict<str, dict<str>>: a dictionary with movieID as keys and dictionaries with field values as values
    """
    urls = {}

    with open(filePath, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            loadRawScriptUrls = {}
            for i, field in enumerate(fields):
                loadRawScriptUrls[field] = values[i]

            urls[loadRawScriptUrls['movieID']] = loadRawScriptUrls

    return urls

# Usage example
raw_script_urls_fields_to_extract = ['movieID', 'scriptURL', 'url']
raw_script_urls_file_path = dataset_dir + "/raw_script_urls.txt"
script_urls_data = loadRawScriptUrls(raw_script_urls_file_path, raw_script_urls_fields_to_extract)


In [80]:
#@title 2.6: Verify the loading of the data (replace array name and fields as needed)
# Print some sample movie title data to check if it was loaded
#for movie_id, movie_info in movie_title_data.items():
    #print(f"Movie ID: {movie_info['movieID']}")
    #print(f"Movie Title: {movie_info['movieTitle']}")
    #print(f"Release Year: {movie_info['releaseYear']}")
    #print(f"IMDB Rating: {movie_info['imdbRating']}")
    #print(f"Number of Votes: {movie_info['numVotes']}")
    #print("\n")

In [81]:
#@title 2.7: Verify the loading of the url data
#for movie_id, url_info in script_urls_data.items():
    #print(f"Movie ID: {url_info['movieID']}")
    #print(f"Movie Title: {url_info['scriptURL']}")
    #print(f"url: {url_info['url']}")
    #print("\n")

In [82]:
#@title 3: Convert dictionaries/arrays to DataFrames
df_lines = pd.DataFrame.from_dict(lines_data, orient='index')

df_characters = pd.DataFrame.from_dict(character_data, orient='index')
gender_map = {
    'f': 'female',
    'm': 'male',
    '?': 'unknown'
}
df_characters['gender'] = df_characters['gender'].map(gender_map)

df_conversations = pd.DataFrame.from_dict(conversation_data, orient='index')

df_movie_titles = pd.DataFrame.from_dict(movie_title_data, orient='index')
df_movie_titles['genres'] = df_movie_titles['genres'].apply(ast.literal_eval)

df_script_urls =  pd.DataFrame.from_dict(script_urls_data, orient='index')


In [83]:
#@title 3.1: Verify df (change names as needed)
#df_script_urls.head()

In [84]:
#@title 4: Merge DataFrames based on Movie ID
merged_data = df_lines.merge(df_characters, on='movieID', how='inner')
merged_data = merged_data.merge(df_conversations, on='movieID', how='inner')
merged_data = merged_data.merge(df_movie_titles, on='movieID', how='inner')
merged_data = merged_data.merge(df_script_urls, on='movieID', how='inner')
#drop duplicate fields
merged_data = merged_data.drop("movieTitle_y", axis=1)
merged_data = merged_data.drop("scriptURL", axis=1)

In [85]:
#@title 4.1: Verify merged_data df
merged_data.head(5)

Unnamed: 0,lineID,characterID_x,movieID,character,text,characterID_y,characterName,movieTitle_x,gender,position,characterID1,characterID2,utteranceIDs,releaseYear,imdbRating,numVotes,genres,url
0,L1045,u0,m0,BIANCA,They do not!\n,u0,BIANCA,10 things i hate about you,female,4\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,"[comedy, romance]",http://www.dailyscript.com/scripts/10Things.ht...
1,L1045,u0,m0,BIANCA,They do not!\n,u1,BRUCE,10 things i hate about you,unknown,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,"[comedy, romance]",http://www.dailyscript.com/scripts/10Things.ht...
2,L1045,u0,m0,BIANCA,They do not!\n,u2,CAMERON,10 things i hate about you,male,3\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,"[comedy, romance]",http://www.dailyscript.com/scripts/10Things.ht...
3,L1045,u0,m0,BIANCA,They do not!\n,u3,CHASTITY,10 things i hate about you,unknown,?\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,"[comedy, romance]",http://www.dailyscript.com/scripts/10Things.ht...
4,L1045,u0,m0,BIANCA,They do not!\n,u4,JOEY,10 things i hate about you,male,6\n,u10,u11,"['L929', 'L930', 'L931', 'L932', 'L933']\n",1999,6.9,62847,"[comedy, romance]",http://www.dailyscript.com/scripts/10Things.ht...


In [86]:
#@title 4.2: Verify the number of rows in the DataFrame
num_rows = merged_data.shape[0]

# Print the number of rows
print("Number of Rows:", num_rows)

Number of Rows: 4874548


# Generate the Train dataset

### Example text data: 

```Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Give three tips for staying healthy. ### Response: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 2. Exercise regularly to keep your body active and strong. 3. Get enough sleep and maintain a consistent sleep schedule.```

In [87]:
# Initialize an empty train list
questions_list = []

def print_last_question():
    for question in questions_list[-1:]:
        print(question)

### When was movie released?

In [88]:
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
When was {} released?

### Response:
{} was released in {}
"""

for title, year in zip(df_movie_titles["movieTitle"], df_movie_titles["releaseYear"]):
    formatted_text = template.format(title, title, year)
    questions_list.append(formatted_text)

print_last_question();


Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
When was zulu dawn released?

### Response:
zulu dawn was released in 1979



# what is the rating on the movie {}?

In [89]:
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what is the rating on the movie {}?

### Response:
The rating on {} is {}
"""

for title, imdb in zip(df_movie_titles["movieTitle"], df_movie_titles["imdbRating"]):
    formatted_text = template.format(title, title, imdb)
    questions_list.append(formatted_text)


print_last_question();


Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what is the rating on the movie zulu dawn?

### Response:
The rating on zulu dawn is 6.40



# what is the genres on the movie {}? 

In [90]:
template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what is the genre of the movie {}?

### Response:
The genres for the movie {} are {}
"""

for title, genres in zip(df_movie_titles["movieTitle"], df_movie_titles["genres"]):
    # Convert list of genres to a comma-separated string
    if len(genres) > 1:
        genres_str = ', '.join(genres[:-2]) + ', ' + genres[-2] + ', and ' + genres[-1]
    elif (len(genres) == 1):
        genres_str = genres[0]
    else:
        genres_str = "unknown"
    formatted_text = template.format(title, title, genres_str)
    questions_list.append(formatted_text)

print_last_question();



Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what is the genre of the movie zulu dawn?

### Response:
The genres for the movie zulu dawn are action, adventure, drama, history, and war



In [91]:
df_characters.head(5)

Unnamed: 0,characterID,characterName,movieID,movieTitle,gender,position
u0,u0,BIANCA,m0,10 things i hate about you,female,4\n
u1,u1,BRUCE,m0,10 things i hate about you,unknown,?\n
u2,u2,CAMERON,m0,10 things i hate about you,male,3\n
u3,u3,CHASTITY,m0,10 things i hate about you,unknown,?\n
u4,u4,JOEY,m0,10 things i hate about you,male,6\n


In [92]:
# what gender is the character {} in the movie {}? 

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what gender is the character {} in the movie {}? 

### Response:
{}'s gender is {}
"""

for character, title, gender in zip(df_characters["characterName"], df_characters["movieTitle"], df_characters["gender"]):
    formatted_text = template.format(character, title, character, gender)
    questions_list.append(formatted_text)

print_last_question();


Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
what gender is the character VEREKER in the movie zulu dawn? 

### Response:
VEREKER's gender is unknown



In [93]:
# do you have the full script for the movie {}? 

template = """
Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
do you have the full script for the movie {}?  

### Response:
sure you can find it here {}
"""

for title, url in zip(df_script_urls["scriptURL"], df_script_urls["url"]):
    formatted_text = template.format(title, url)
    questions_list.append(formatted_text)

print_last_question();


Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

### Instruction:
do you have the full script for the movie zulu dawn?  

### Response:
sure you can find it here http://www.aellea.com/script/zuludawn.txt




In [94]:
# # what are some line that {} said in the movie {}? 

# template = """
# Below is an instruction that describes a movie related question. Write a response that appropriately answers the question.

# ### Instruction:
# what are some line that {} said in the movie {}? 

# ### Response:
# {} said \"{}\"
# """

# for character, title, text in zip(df_lines["character"], df_movie_titles["movieTitle"], df_lines["text"]):
#     formatted_text = template.format(character, title, character, text.replace("\n", " "))
#     questions_list.append(formatted_text)

# print_last_question();

In [95]:
## Todo: Add additional training questions

In [96]:
# Store the csv for training
df_from_list = pd.DataFrame({'text': questions_list})
df_from_list.to_csv(train_dir + "train.csv", index=True)