In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Import Dataset

from movie.csv

In [None]:
# Column names applied to movie.csv (the first line of the file is skipped below).
r_cols = [
    'movie_id', 'ranking', 'movie_title', 'country',
    'release_year', 'film_rate', 'runtime', 'synopsis',
]

# The file is pipe-delimited; 'utf-8-sig' also consumes a leading BOM if one is present.
df1 = pd.read_csv(
    'movie.csv',
    sep='|',
    names=r_cols,
    encoding='utf-8-sig',
    skiprows=1,
)

# Independent copy, so later edits to full_data1 never touch the raw frame df1.
full_data1 = df1.copy()

from keywords.csv

In [None]:
# Column names applied to keywords.csv (the first line of the file is skipped below).
s_cols = ['keyword_id', 'keywords', 'movie_id']

df2 = pd.read_csv('keywords.csv',  sep='|', names=s_cols, encoding='utf-8-sig', skiprows=1)

# Concatenate the keywords of each movie ID into one space-separated string.
# read_csv parses empty cells and tokens such as "NA"/"null" as NaN (a float),
# and a bare ' '.join(...) over a group containing NaN raises TypeError, so
# missing values are dropped and the remaining values coerced to str first.
full_data2 = (
    df2.groupby('movie_id')
    .agg({'keywords': lambda kw: ' '.join(kw.dropna().astype(str))})
    .reset_index()
)

In [None]:
# Movies from movie.csv whose movie_id never appears in keywords.csv.
has_keywords = df1['movie_id'].isin(df2['movie_id'])
df1.loc[~has_keywords, ['movie_id', 'movie_title', 'ranking']]

from plotsummary.csv

In [None]:
# Column names applied to plotsummary.csv (the first line of the file is skipped below).
p_cols = ['plot_id', 'movie_id', 'plot_text']

df3 = pd.read_csv('plotsummary.csv',  sep='|', names=p_cols, encoding='utf-8-sig', skiprows=1)

# Concatenate all plot texts of each movie ID into one space-separated string.
# A plot cell parsed as NaN (e.g. an empty field) is a float and would make a
# bare ' '.join(...) raise TypeError, so missing values are dropped and the
# remaining values coerced to str before joining.
full_data3 = (
    df3.groupby('movie_id')
    .agg({'plot_text': lambda txt: ' '.join(txt.dropna().astype(str))})
    .reset_index()
)

Top 10 Movies

In [None]:
# The ten movies with the lowest 'ranking' values, in ascending ranking order.
top10_movies = (
    full_data1
    .loc[:, ['ranking', 'movie_id', 'movie_title']]
    .sort_values(by='ranking')
    .head(10)
)
top10_movies

# Question 1

# Q1: NLP Process

In [None]:
# Translation table that maps every ASCII punctuation character to None (i.e. deletes it).
removal = str.maketrans(dict.fromkeys(string.punctuation))

# Apply it to each movie's concatenated keyword string.
full_data2['keywords'] = full_data2['keywords'].str.translate(removal)

# Q1: Calculate TF

In [None]:
# Term-frequency (raw count) matrix: one row per movie, one column per keyword token.
# CountVectorizer's default token pattern only keeps tokens of two or more
# word characters, so single-character keywords are dropped at this step.
tf_q1 = CountVectorizer()
keyword_docs_q1 = full_data2['keywords']
matrix_q1 = tf_q1.fit_transform(keyword_docs_q1)

# Q1: Calculate cosine similarity based on TF matrix

In [None]:
# Pairwise cosine similarity between all movies' keyword count vectors
# (with Y omitted, the matrix is compared against itself).
cosine_sim_q1 = cosine_similarity(matrix_q1)

# Label both axes with movie_id so a movie's similarities can be looked up by ID.
movie_ids_q1 = full_data2['movie_id']
cosine_sim_q1_df = pd.DataFrame(cosine_sim_q1, index=movie_ids_q1, columns=movie_ids_q1)

# Q1: 10 Closest Movies Recommendation for Top 10 Movies

In [None]:
# Build the Q1 result table: for each top-10 movie, the 10 most similar movies
# according to the keyword-based cosine similarity matrix.

# movie_id -> movie_title lookup (from movie.csv), built once instead of on every iteration
title_by_id_q1 = full_data1.set_index('movie_id')['movie_title']

# one record per top-10 movie; the frame is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and copied the frame on every call)
rows_q1 = []

# recommendation loop
for j, k in zip(top10_movies['movie_title'], top10_movies['movie_id']):

    # similarities of movie k to every movie, sorted in descending order;
    # reset_index turns the movie_id index into a regular column
    recommend_df_q1 = cosine_sim_q1_df[k].sort_values(ascending=False).reset_index()
    recommend_df_q1.columns = ['movie_id', 'distance_score']

    # take the 10 nearest movies, excluding the movie itself
    recommend_df_q1 = recommend_df_q1[recommend_df_q1['movie_id'] != k].head(10)

    # put the movie title in front, looked up by movie_id
    recommend_df_q1.insert(loc=0, column='movie_title',
                           value=recommend_df_q1['movie_id'].map(title_by_id_q1))

    # list of tuples in the order of movie_title, movie_id and distance_score
    # (the score is a cosine similarity, so higher means closer)
    recommend_list_q1 = list(recommend_df_q1.itertuples(index=False, name=None))

    rows_q1.append({'movie_name': j, 'movie_ID': k, '10_closest_movies': recommend_list_q1})

# create the data frame in one go
results_q1 = pd.DataFrame(rows_q1, columns=['movie_name', 'movie_ID', '10_closest_movies'])

In [None]:
# Lift pandas' column-width limit so the full recommendation lists are shown
# instead of being truncated with an ellipsis.
pd.set_option('max_colwidth', None)
results_q1

# output to csv
# NOTE(review): the export below is disabled and points at an absolute,
# machine-specific Windows path; use a relative path if it is re-enabled.
# results_q1.to_csv (r'C:\Users\Owner\Desktop\question1.csv', index = False, header=True)

# Question 2

Merge the synopsis and plot text of each movie into a single column so they can be cleaned together later

In [None]:
# Attach each movie's concatenated plot text to its metadata row.
# how="left" keeps movies that have no entry in the plot-summary data.
synopsis_plot = pd.merge(full_data1, full_data3, on="movie_id", how="left")

# Fill missing text with a single space, but ONLY in the two text columns.
# A frame-wide fillna(' ') would also write ' ' into unrelated columns
# (e.g. 'ranking', 'country'), turning numeric columns into mixed-type ones.
text_cols_q2 = ['synopsis', 'plot_text']
synopsis_plot[text_cols_q2] = synopsis_plot[text_cols_q2].fillna(' ')

# combined terms from synopsis and plot
synopsis_plot['combined'] = synopsis_plot['synopsis'] + ' ' + synopsis_plot['plot_text']

# remove columns that are not needed for the calculations afterwards
synopsis_plot = synopsis_plot.drop(columns=['synopsis', 'plot_text', 'runtime', 'film_rate', 'release_year'])

In [None]:
# True for rows whose combined text is empty once surrounding spaces are
# stripped, i.e. movies that have neither a synopsis nor a plot.
no_text_q2 = synopsis_plot['combined'].str.strip(' ') == ''

# report how many there are, then list them
print("Number of movies without plot AND synopsis: ", end="")
print(synopsis_plot.loc[no_text_q2, 'movie_id'].count())
print('\n\n', 'List of movies without plot AND synopsis:')
print(synopsis_plot.loc[no_text_q2, ['movie_id', 'movie_title', 'ranking']])

# remove those movies from the frame (in place)
synopsis_plot.drop(index=synopsis_plot.index[no_text_q2], inplace=True)

# Q2: NLP Processes

1. Text Tokenization and Cleaning using Combined Terms from Synopsis and Plot

In [None]:
%%time
# create a list with lists of filtered keywords of each movie_id
filtered_keywords_q2 = []

# recommendation loop
for i in synopsis_plot['combined']:
    # replace " !--Line Break--! " with whitespace
    raw_text = re.sub("!--Line Break--!", " ", i)

    # split by whitespace
    tokens = raw_text.split()

    # remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # normalizing case
    words = [w.lower() for w in stripped]

    # filter out stop words ==FIRST TIME==
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w) for w in words]

    # filter out stop words ==SECOND TIME==
    words = [w for w in lemmatized if not w in stop_words]

    # stemming of words
    porter = PorterStemmer()
    stemmed = [porter.stem(w) for w in words]
    
    filtered_keywords_q2.append(stemmed)

2. Create a new dataframe with only movie ID and filtered keywords

In [None]:
# New frame holding only movie_id (same row index as synopsis_plot) ...
synopsis_plot_update = synopsis_plot[['movie_id']].copy()

# ... plus each movie's cleaned tokens joined into one space-separated string,
# which is the document format TfidfVectorizer consumes later.
synopsis_plot_update['keywords'] = [' '.join(token_list) for token_list in filtered_keywords_q2]

# Q2: Calculate TF-IDF

In [None]:
# TF-IDF matrix: one row per movie, one column per token.
# stop_words='english' removes any token matching scikit-learn's built-in
# English stop-word list, on top of the NLTK filtering done earlier.
# The default token pattern also drops tokens shorter than two characters.
tf_q2 = TfidfVectorizer(stop_words='english')
keyword_docs_q2 = synopsis_plot_update['keywords']
matrix_q2 = tf_q2.fit_transform(keyword_docs_q2)

In [None]:
# Show the stop-word list that tf_q2 is using (wrapped in an outer list, as a plain list).
sets = [tf_q2.get_stop_words()]
print(list(map(list, sets)))

# Q2: Calculate cosine similarity based on TF-IDF matrix

In [None]:
# Pairwise dot products of the TF-IDF rows (with Y omitted, the matrix is
# compared against itself). TfidfVectorizer L2-normalises rows by default,
# so these dot products are cosine similarities.
cosine_sim_q2 = linear_kernel(matrix_q2)

# Label both axes with movie_id so a movie's similarities can be looked up by ID.
movie_ids_q2 = synopsis_plot_update['movie_id']
cosine_sim_q2_df = pd.DataFrame(cosine_sim_q2, index=movie_ids_q2, columns=movie_ids_q2)

# Q2: 10 Closest Movies Recommendation for Top 10 Movies

In [None]:
# Build the Q2 result table: for each top-10 movie, the 10 most similar movies
# according to the TF-IDF (synopsis + plot) cosine similarity matrix.

# movie_id -> movie_title lookup (from movie.csv), built once instead of on every iteration
title_by_id_q2 = full_data1.set_index('movie_id')['movie_title']

# one record per top-10 movie; the frame is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and copied the frame on every call)
rows_q2 = []

# recommendation loop
for j, k in zip(top10_movies['movie_title'], top10_movies['movie_id']):

    # similarities of movie k to every movie, sorted in descending order;
    # reset_index turns the movie_id index into a regular column
    recommend_df_q2 = cosine_sim_q2_df[k].sort_values(ascending=False).reset_index()
    recommend_df_q2.columns = ['movie_id', 'distance_score']

    # take the 10 nearest movies, excluding the movie itself
    recommend_df_q2 = recommend_df_q2[recommend_df_q2['movie_id'] != k].head(10)

    # put the movie title in front, looked up by movie_id
    recommend_df_q2.insert(loc=0, column='movie_title',
                           value=recommend_df_q2['movie_id'].map(title_by_id_q2))

    # list of tuples in the order of movie_title, movie_id and distance_score
    # (the score is a cosine similarity, so higher means closer)
    recommend_list_q2 = list(recommend_df_q2.itertuples(index=False, name=None))

    rows_q2.append({'movie_name': j, 'movie_ID': k, '10_closest_movies': recommend_list_q2})

# create the data frame in one go
results_q2 = pd.DataFrame(rows_q2, columns=['movie_name', 'movie_ID', '10_closest_movies'])

In [None]:
# Lift pandas' column-width limit so the full recommendation lists are shown
# instead of being truncated with an ellipsis.
pd.set_option('max_colwidth', None)
results_q2

# output as csv file
# NOTE(review): the export below is disabled and points at an absolute,
# machine-specific Windows path; use a relative path if it is re-enabled.
# results_q2.to_csv (r'C:\Users\Owner\Desktop\question2.csv', index = False, header=True)