In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# Import from the public `IPython.display` module; pulling `display` out of
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML, clear_output
# Inject CSS so elements with class `.container` take 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Load Embeddings Data

In [5]:
# Data paths in this notebook are resolved relative to the working directory.
cwd = os.getcwd()

# Unpickle the autoencoder embeddings and wrap the result in a DataFrame.
embeddings = pd.DataFrame(
    pd.read_pickle(os.path.join(cwd, "..", "data", "autoencoder_embeddings.pkl"))
)

# Row labels of the embedding table as a plain Python list.
movie_id = embeddings.index.to_numpy().tolist()

print(embeddings.shape)
embeddings.head(n=5)

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.621827,0.0,0.0,0.449949,0.62723,0.386803,0.043507,1.31747,0.0,0.477331,...,0.920025,0.0,0.0,0.0,0.0,0.776488,0.498048,0.761823,0.668633,0.0
1,0.483226,0.0,0.0,0.424218,0.790704,0.437543,0.026677,1.061941,0.118236,0.414725,...,0.834894,0.0,0.0,0.0,0.0,0.585571,0.171385,0.638835,0.904632,0.0
2,0.578031,0.0,0.0,0.399628,1.061998,0.353823,0.041634,0.940179,0.007272,0.270786,...,1.063713,0.0,0.0,0.0,0.0,0.705903,0.064461,0.805923,0.864924,0.0
3,0.760374,0.0,0.0,0.122506,1.013673,0.318113,0.01154,0.705913,0.296866,0.56964,...,0.52732,0.0,0.0,0.0,0.0,0.778261,0.261091,0.650009,0.717019,0.0
4,0.801676,0.0,0.0,0.138528,0.920463,0.338334,0.013075,0.817977,0.303473,0.640478,...,0.520126,0.0,0.0,0.0,0.0,0.834012,0.402337,0.676847,0.668968,0.0


In [23]:
# Number of embedding columns whose values sum to exactly zero.
# Vectorised reduction instead of Python's builtin `sum`, which would iterate
# the boolean Series element by element; `int(...)` keeps the output a plain int.
# NOTE(review): a zero column sum only means "all zeros" if the embeddings are
# non-negative -- confirm against the autoencoder's activation.
int((embeddings.sum() == 0).sum())

26

In [6]:
# Load the index mapping used below to translate raw `movieId` values.
# The path is built from `cwd` like the other data loaders in this notebook.
# CAUTION: unpickling can execute arbitrary code -- only load files you trust.
with open(os.path.join(cwd, "..", "data", "movie_to_idx.pkl"), 'rb') as handle:
    movie2idx = pickle.load(handle)

In [11]:
# Movie metadata (movieId, title, genres).
movies = pd.read_csv(os.path.join(cwd, "..", "data/ml-20m", "movies.csv"))
print("{} unique movies in movies.csv".format(movies.movieId.nunique()))

# Ratings are only needed here to know which movies were rated at least once.
ratings = pd.read_csv(os.path.join(cwd, "..", "data/ml-20m", "ratings.csv"))
rated_ids = ratings.movieId.unique()
print("{} unique movies in ratings.csv".format(len(rated_ids)))

# Keep only rated movies. Filtering with `isin` selects the same movies as an
# inner merge with `ratings`, without materialising one row per rating first.
movies = (
    movies.loc[movies.movieId.isin(rated_ids), ['movieId', 'title', 'genres']]
    # Translate raw ids through `movie2idx`; a missing id raises KeyError.
    .assign(movieId=lambda frame: frame.movieId.map(movie2idx.__getitem__))
    .drop_duplicates()
)
print("{} unique movies in embeddings".format(movies.movieId.nunique()))

# Index by the translated id and order rows by it.
movies = movies.set_index('movieId').sort_index(ascending=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jumanji (1995),Adventure|Children|Fantasy
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [12]:
# Look up the row (and therefore the index label) of one movie by exact title.
movies[movies["title"] == "Abbott and Costello Meet the Keystone Kops (1955)"]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
15816,Abbott and Costello Meet the Keystone Kops (1955),Comedy


In [13]:
# Work on an independent copy so the loaded `embeddings` frame stays untouched.
latent_df = embeddings.copy(deep=True)
print(latent_df.shape)
latent_df.head(n=3)

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.621827,0.0,0.0,0.449949,0.62723,0.386803,0.043507,1.31747,0.0,0.477331,...,0.920025,0.0,0.0,0.0,0.0,0.776488,0.498048,0.761823,0.668633,0.0
1,0.483226,0.0,0.0,0.424218,0.790704,0.437543,0.026677,1.061941,0.118236,0.414725,...,0.834894,0.0,0.0,0.0,0.0,0.585571,0.171385,0.638835,0.904632,0.0
2,0.578031,0.0,0.0,0.399628,1.061998,0.353823,0.041634,0.940179,0.007272,0.270786,...,1.063713,0.0,0.0,0.0,0.0,0.705903,0.064461,0.805923,0.864924,0.0


# Find Similar Movies in Latent Space

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `latent_df`.
# Rows and columns are labelled with `latent_df.index` rather than the global
# `movie_id`: that name is rebound to a single id further down, which would
# break this cell when it is re-run afterwards.
# NOTE: the result is a dense N x N frame (N = number of rows in `latent_df`).
cosine_sim = pd.DataFrame(
    cosine_similarity(X=latent_df),
    index=latent_df.index,
    columns=latent_df.index,
)
# Show a bounded preview instead of handing the whole matrix to the renderer.
cosine_sim.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26734,26735,26736,26737,26738,26739,26740,26741,26742,26743
0,1.000000,0.977712,0.959982,0.888069,0.910094,0.927844,0.934967,0.958456,0.934922,0.957404,...,0.638494,0.791567,0.927664,0.965759,0.927664,0.927664,0.887303,0.785432,0.725195,0.913231
1,0.977712,1.000000,0.976822,0.937795,0.947069,0.928929,0.908081,0.983629,0.933941,0.961940,...,0.601690,0.810738,0.951573,0.924049,0.951573,0.951573,0.921676,0.714184,0.722781,0.914430
2,0.959982,0.976822,1.000000,0.936952,0.936998,0.900726,0.907259,0.938996,0.968301,0.993100,...,0.535074,0.872129,0.915635,0.924153,0.915635,0.915635,0.826882,0.718832,0.659094,0.885410
3,0.888069,0.937795,0.936952,1.000000,0.996753,0.958980,0.915512,0.950988,0.878349,0.897602,...,0.679767,0.803102,0.968294,0.825823,0.968294,0.968294,0.878509,0.743553,0.827793,0.943748
4,0.910094,0.947069,0.936998,0.996753,1.000000,0.977728,0.937933,0.965681,0.882647,0.898650,...,0.721119,0.795520,0.982379,0.851026,0.982379,0.982379,0.900087,0.779006,0.855930,0.964071
5,0.927844,0.928929,0.900726,0.958980,0.977728,1.000000,0.958190,0.958965,0.845148,0.868366,...,0.794253,0.721592,0.978409,0.866239,0.978409,0.978409,0.919312,0.836199,0.913954,0.977838
6,0.934967,0.908081,0.907259,0.915512,0.937933,0.958190,1.000000,0.933888,0.920233,0.876591,...,0.824483,0.843572,0.958692,0.945976,0.958692,0.958692,0.824302,0.935233,0.840093,0.982116
7,0.958456,0.983629,0.938996,0.950988,0.965681,0.958965,0.933888,1.000000,0.903974,0.906971,...,0.720116,0.793071,0.986503,0.911546,0.986503,0.986503,0.953837,0.771199,0.811975,0.960500
8,0.934922,0.933941,0.968301,0.878349,0.882647,0.845148,0.920233,0.903974,1.000000,0.956694,...,0.576868,0.953133,0.887940,0.964296,0.887940,0.887940,0.743516,0.788913,0.597191,0.877827
9,0.957404,0.961940,0.993100,0.897602,0.898650,0.868366,0.876591,0.906971,0.956694,1.000000,...,0.467978,0.844753,0.872347,0.917069,0.872347,0.872347,0.795099,0.682041,0.603833,0.840341


In [16]:
def find_similar_movies(movie_id, cosine_sim, movies):
    """Rank movies by their similarity to ``movie_id``.

    Parameters
    ----------
    movie_id : hashable
        Row label of the query movie in ``cosine_sim``.
    cosine_sim : pandas.DataFrame
        Similarity matrix. The row ``cosine_sim.loc[movie_id]`` is read, and
        its labels are matched against the index of ``movies``.
    movies : pandas.DataFrame
        Movie metadata whose index uses the same labels.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus a ``sim_score`` column, restricted to
        labels present in both inputs (inner join) and sorted by ``sim_score``
        in descending order. Neither input is modified.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a row label of ``cosine_sim``.
    """
    if movie_id not in cosine_sim.index:
        raise KeyError(
            "movie_id {!r} not found in cosine_sim index".format(movie_id)
        )

    # Similarity score vector for the requested movie.
    sim_scores = cosine_sim.loc[movie_id].rename("sim_score")

    # Attach the scores to the movie metadata by matching index labels.
    sim_df = movies.merge(sim_scores.to_frame(), left_index=True, right_index=True)

    # One sort is enough; a stable algorithm keeps tied scores in the order
    # produced by the merge instead of an arbitrary one.
    return sim_df.sort_values(by="sim_score", ascending=False, kind="mergesort")

In [19]:
# Example query movies, keyed by the author's short labels.
EXAMPLE_MOVIES = {
    "primer": 3006,
    "grease": 1195,
    "LOTR": 131,
    "inception": 2087,
    "zodiac": 3995,
    "forgotten (1 rating)": 23877,
    "rated 19 times": 15816,
}
# Number of nearest neighbours to display.
TOP_N = 100

# Use a dedicated name for the query id so the global `movie_id` list of
# embedding row labels is not overwritten by a single integer.
query_movie_id = EXAMPLE_MOVIES["zodiac"]

find_similar_movies(query_movie_id, cosine_sim, movies).head(TOP_N)

Unnamed: 0,title,genres,sim_score
3995,Zodiac (2007),Crime|Drama|Thriller,1.000000
2510,Touching the Void (2003),Adventure|Documentary,0.998115
2001,"Right Stuff, The (1983)",Drama,0.998088
6176,"Informant!, The (2009)",Comedy|Crime|Drama|Thriller,0.997854
1105,October Sky (1999),Drama,0.997839
5231,Captain Phillips (2013),Adventure|Drama|Thriller|IMAX,0.997819
8107,Dogtooth (Kynodontas) (2009),Drama,0.997436
1050,Disclosure (1994),Drama|Thriller,0.997339
478,Donnie Brasco (1997),Crime|Drama,0.997079
5253,Unstoppable (2010),Action|Drama|Thriller,0.996846
