# Save Cleansed Datasets Locally

## The Network Based Model

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ! pip install spacy
# ! pip install Levenshtein

In [134]:
import pandas as pd
import ast
import numpy as np
import random
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle
from tqdm import tqdm

### Import Files

In [135]:
ratings_df = pd.read_csv('dataset_dont_upload/ratings.csv')
links_df = pd.read_csv('dataset_dont_upload/links.csv')
ratings_df = ratings_df.merge(links_df[['movieId', 'imdbId']], on='movieId')
#add imdbID column to ratings_df
#imdbId column will match to metadata_df below to get the movie's title

In [136]:
#columns to keep
metadata_df = pd.read_csv('dataset_dont_upload/movies_metadata.csv')
columns = ['adult', 'belongs_to_collection', 'budget', 'genres', 'id',
       'imdb_id', 'original_language', 'overview',
       'popularity', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'tagline', 'title',
       'vote_average', 'vote_count']
metadata_df = metadata_df[columns]

In [137]:
credits_df = pd.read_csv('dataset_dont_upload/credits.csv')

In [138]:
#this is in directory 'Capstone Resources/CMU' and they are the data from:
#http://www.cs.cmu.edu/~ark/personas/

#df with summaries
summary_df = pd.read_csv('dataset_dont_upload/plot_summaries.txt', sep='\t', header=None)
summary_df.rename(columns={0:'wikiId', 1:'Summary'}, inplace=True)

In [139]:
#this is in directory 'Capstone Resources/CMU' and they are the data from:
# http://www.cs.cmu.edu/~ark/personas/

#df to get title from wikiId
summarykey_df = pd.read_csv('dataset_dont_upload/movie.metadata.tsv', sep='\t', header=None)

#keeping only wikiId and Title columns
summarykey_df = summarykey_df[[0, 2]]
summarykey_df.rename(columns={0:'wikiId', 2:'title'}, inplace=True)

### Data Cleaning

In [140]:
#some additional clean up

#remove any entry where the title isn't named
metadata_df.dropna(subset=['title'], inplace=True)

#let's not recommend anything X rated
metadata_df['adult'] = metadata_df['adult'].astype(str)
metadata_df = metadata_df[metadata_df['adult'] == 'False']
metadata_df.drop(columns=['adult'], inplace=True)

#modify imdb_id column to match ratings_df
metadata_df['imdb_id'] = metadata_df['imdb_id'].str[2:]
metadata_df.rename(columns={'imdb_id':'imdbId'}, inplace=True)

#change id to an integer
metadata_df['id'] = metadata_df['id'].astype('str').astype('int')

#modify genres column into a list
metadata_df['genres'] = metadata_df['genres'].astype('str')
metadata_df['genres'] = metadata_df['genres'].apply(ast.literal_eval)
metadata_df['genres'] = [[j['name'] for j in i] for i in metadata_df['genres']]

#modify belongs_to_collection column into a string if it exists
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].astype('str')
this = []
for i in metadata_df['belongs_to_collection']:
    if i == 'nan':
        this.append('None')
    else:
        try:
            dictionary = ast.literal_eval(i)
            this.append(dictionary['name'])
        except:
            this.append('None')
metadata_df['belongs_to_collection'] = this

In [141]:
#convert cast to a list of names
credits_df['cast'] = credits_df['cast'].apply(ast.literal_eval)
credits_df['cast'] = [[j['name'] for j in i] for i in credits_df['cast']]

#create a column for a list of each of the major roles
credits_df['crew'] = credits_df['crew'].apply(ast.literal_eval)
credits_df['director'] = [[j['name'] for j in i if j['job'] == 'Director'] for i in credits_df['crew']]
credits_df['writer'] = [[j['name'] for j in i if j['job'] == 'Screenplay'] for i in credits_df['crew']]
credits_df['producer'] = [[j['name'] for j in i if j['job'] == 'Producer'] for i in credits_df['crew']]
credits_df['executive producer'] = [[j['name'] for j in i if j['job'] == 'Executive Producer'] for i in credits_df['crew']]
credits_df['score'] = [[j['name'] for j in i if j['job'] == 'Music'] for i in credits_df['crew']]

credits_df.drop(columns=['crew'], inplace=True)

credits_df.head()

Unnamed: 0,cast,id,director,writer,producer,executive producer,score
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",862,[John Lasseter],"[Joss Whedon, Andrew Stanton, Joel Cohen, Alec...","[Bonnie Arnold, Ralph Guggenheim]","[Ed Catmull, Steve Jobs]",[Randy Newman]
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",8844,[Joe Johnston],"[Jonathan Hensleigh, Greg Taylor, Jim Strain]","[Scott Kroopf, William Teitler]","[Larry J. Franco, Ted Field, Robert W. Cort]",[]
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",15602,[Howard Deutch],[],[],[],[]
3,"[Whitney Houston, Angela Bassett, Loretta Devi...",31357,[Forest Whitaker],"[Ronald Bass, Terry McMillan]","[Ronald Bass, Ezra Swerdlow, Deborah Schindler...",[Terry McMillan],[]
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...",11862,[Charles Shyer],"[Nancy Meyers, Albert Hackett]",[Nancy Meyers],[],[]


In [142]:
#merge cast and crew with metadata
metadata_df = metadata_df.merge(credits_df, on='id')

In [143]:
#add title from summarykey_df to summary_df
summary_df = summary_df.merge(summarykey_df, on='wikiId')
#summary_df.drop(columns=['wikiId'], inplace=True)
summary_df['title'] = summary_df['title'].astype(str)

summary_df.head()

Unnamed: 0,wikiId,Summary,title
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",Taxi Blues
1,31186339,The nation of Panem consists of a wealthy Capi...,The Hunger Games
2,20663735,Poovalli Induchoodan is sentenced for six yea...,Narasimham
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid
4,595909,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark


In [144]:
#merge metadata_df and summary_df and see how many movies we lost in the merge

before_summary_df = len(metadata_df)
metadata_df = metadata_df.merge(summary_df, on = 'title')
after_summary_df = len(metadata_df)
print('Loss due to merge with summary_df: ', before_summary_df - after_summary_df)
print('Remaining: ', after_summary_df)

Loss due to merge with summary_df:  20841
Remaining:  24685


In [145]:
#Add release year to title if database contains more than one title with the same name
year = []
for date in metadata_df['release_date'].astype('str'):
    year.append(date[0:4])
metadata_df['release_year'] = year

duplicates_list = list(metadata_df['title'][metadata_df['title'].duplicated(keep=False)].unique())
for i in range(len(metadata_df)):
    if metadata_df['title'].iloc[i] in duplicates_list:
        metadata_df['title'].iloc[i] = metadata_df['title'].iloc[i] + '(' + metadata_df['release_year'].iloc[i] + ')'

In [146]:
#the merge above made multiple instances of the same movie if there were duplicate titles

metadata_df = metadata_df[metadata_df['overview'].notna()]
duplicates_list = list(metadata_df['title'][metadata_df['title'].duplicated(keep=False)].unique())

#to resolve, only keep the row from the duplicates where the overview from metadata_df and the summary from summary_df
#are the most similar
for title in duplicates_list:
    
    overview = list(metadata_df[metadata_df['title'] == title]['overview'])
    summary = list(metadata_df[metadata_df['title'] == title]['Summary'])

    sim_list = []
    for i in range(len(overview)):
        vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
        tfidf = vect.fit_transform([overview[i], summary[i]])
        sim_list.append((tfidf * tfidf.T).toarray()[0][1])

    indices = metadata_df[metadata_df['title'] == title].index
    
    to_drop = [True for i in range(len(sim_list))]
    to_drop[np.argmax(sim_list)] = False  ###drop all duplicated movies except the one where overview and summary are most similar
    
    index_to_drop = indices[to_drop]
    metadata_df.drop(index = index_to_drop, inplace=True)

metadata_df.reset_index(inplace=True)

In [147]:
len(metadata_df)

20227

In [148]:
#final number of movies in df

final_metadata_len = len(metadata_df)
print('Loss due to removal of duplicate titles: ', after_summary_df - final_metadata_len)
print('Remaining: ', final_metadata_len)

Loss due to removal of duplicate titles:  4458
Remaining:  20227


In [150]:
metadata_df.columns

Index(['index', 'belongs_to_collection', 'budget', 'genres', 'id', 'imdbId',
       'original_language', 'overview', 'popularity', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'director', 'writer', 'producer',
       'executive producer', 'score', 'wikiId', 'Summary', 'release_year'],
      dtype='object')

In [175]:
non_dups = ['index', 'belongs_to_collection', 'budget', 'genres', 'id', 'imdbId',
       'original_language', 'overview', 'popularity', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'director', 'writer',
       'producer', 'executive producer', 'score','wikiId',
       'Summary', 'release_year']

In [72]:
metadata_df.columns = non_dups

In [52]:
# metadata_df.head()

In [151]:
metadata_df.to_csv('network_metadata.csv',index=False)

### Test reading network_metadata.zip

In [180]:
metadata_df = pd.read_csv('dataset/network_metadata.zip', compression='zip', header=0, sep=',')[non_dups].set_index('index')

In [194]:
# metadata_df.head()

### Save Networkx Graph

In [204]:
import ast
import pickle

In [202]:
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: ast.literal_eval(x))

In [203]:
import networkx as nx
from tqdm import tqdm
def create_network(df, column, min_num_occ = 2):
    df = df[['title', column]]
    df = df.explode(column)
    df.set_index('title', inplace=True)
    unique_vals = df[column].value_counts()[list(df[column].value_counts() > min_num_occ)]
    edge_list = set()
    for i in tqdm(unique_vals.index):
        movies = df.loc[df[column] == i].index
        for j in movies:
            for k in movies:
                edge_list.add((j,k))
    G = nx.Graph()
    G.add_edges_from(edge_list)
    return G
G = create_network(metadata_df, 'cast', 2)

100%|██████████| 24803/24803 [08:41<00:00, 47.55it/s]


In [218]:
pickle.dump(G, open('graph.txt', 'wb'))

### Test - Read Graph from file

In [223]:
G_read = pickle.load(open('dataset/graph.txt','rb'))

In [230]:
G.nodes() == G_read.nodes()

True

## Collaborative Filtering Based Model

### Read files

In [43]:
raw_meta = pd.read_csv('dataset/movies_metadata.csv')  
raw_rt_sml = pd.read_csv('dataset/ratings_small.csv') 

In [44]:
dpl_list = pd.concat(g for _, g in raw_meta.groupby("id") if len(g) > 1).id.unique().tolist()
raw_meta = raw_meta.loc[~raw_meta['id'].isin(dpl_list)]
meta_df = raw_meta[['id','overview','runtime','title','release_date', 'genres','vote_average']] #
meta_df.dropna(how='any',inplace=True)
movieid_title = meta_df[['id','title']]
movieid_title['id'] = movieid_title['id'].astype(int)
# keep only movies that can find names by id
inter_movies_sml = list(set(movieid_title.id.unique().tolist()).intersection(set(raw_rt_sml.movieId.unique().tolist())))
movieid_title = movieid_title.loc[movieid_title['id'].isin(inter_movies_sml)]
movie_lookup_dict = dict(zip(movieid_title.id.tolist(), movieid_title.title.tolist())) #movie id and movie name dict

rt_df_sml = raw_rt_sml.loc[raw_rt_sml['movieId'].isin(inter_movies_sml)]
rt_df_sml.loc[rt_df_sml['rating']>3,'like'] = 10
rt_df_sml.loc[rt_df_sml['rating']<=3,'like'] = -10
rt_df_sml.userId = rt_df_sml.userId.apply(lambda x: 'U'+str(x))

# conver to pivot table
user_movie_pivot = rt_df_sml.pivot_table(index='userId', columns='movieId', values='like').fillna(0)

# convert dataframe of movie features to scipy sparse matrix
# user_movie_features = csr_matrix(user_movie_pivot.values)
# # build and train the model
# model_nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7, n_jobs=-1)
# model_nn.fit(user_movie_features)
# # get the list of all movie id
# movie_list = user_movie_pivot.columns.tolist()

In [48]:
user_movie_pivot.to_csv('user_movie_pivot.csv')