In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML, clear_output
# Inject CSS so elements with class "container" take 80% of the page width
# (presumably the classic Notebook page container - confirm for JupyterLab).
display(HTML("<style>.container { width:80% !important; }</style>"))

# Load Embeddings Data

In [2]:
# Resolve the pickled embeddings relative to the current working directory.
cwd = os.getcwd()
embeddings_path = os.path.join(cwd, "..", "data", "autoencoder_embeddings.pkl")

# Wrap whatever the pickle contains in a DataFrame.
embeddings = pd.DataFrame(pd.read_pickle(embeddings_path))

# Row labels of the embedding table as a plain Python list (used by later cells).
movie_id = embeddings.index.values.tolist()

print(embeddings.shape)
embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.013344,0.23956,1.050268,0.0,0.023284,0.396033,0.0,0.692792,0.0,0.0,...,0.002741,0.0,0.265989,0.372963,0.0,0.0,0.0,0.711748,0.0,0.379516
1,0.0,0.172851,1.251562,0.47129,0.065289,0.398064,0.0074,1.077674,0.14371,0.0,...,0.0,0.040959,0.07553,0.0,0.0,0.0,0.0,0.555058,0.397117,0.333755
2,0.00101,0.925408,0.063987,0.0,0.133182,0.030352,0.104917,0.622351,0.045847,0.0,...,0.0,0.242329,0.0,0.422629,0.0,0.0,0.0,0.494329,0.182441,0.093916
3,0.0,0.94424,0.607588,0.368766,0.166914,0.0,0.226098,0.0,0.481809,0.0,...,0.0,0.0,0.124727,0.465437,0.014911,0.0,0.0,0.605971,0.80661,0.387506
4,0.0,0.734243,0.627565,0.150901,0.304607,0.236698,0.051425,0.408785,0.367336,0.586794,...,0.0,0.0,0.08308,0.182237,0.0,0.0,0.0,0.08075,0.826122,0.54248


In [3]:
# Number of embedding columns whose values add up to exactly zero.
(embeddings.sum() == 0).sum()

0

In [4]:
# Load the index mapping.
# NOTE: unpickling executes arbitrary code - only load this file from a trusted source.
with open('../data/movie_to_idx.pkl', 'rb') as mapping_file:
    movie2idx = pickle.load(mapping_file)

In [5]:
# Load the movie metadata and the ratings from data/ml-20m.
movies = pd.read_csv(os.path.join(cwd, "..", "data/ml-20m", "movies.csv"))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

ratings = pd.read_csv(os.path.join(cwd, "..", "data/ml-20m", "ratings.csv"))
rated_movie_ids = ratings.movieId.unique()
print("{} unique movies in ratings.csv".format(len(rated_movie_ids)))

# Keep only movies that appear in ratings.csv. Filtering with isin avoids
# building a merged frame with one row per rating just to de-duplicate it.
movies = movies.loc[movies.movieId.isin(rated_movie_ids), ['movieId', 'title', 'genres']]

# Replace each raw movieId by its movie2idx entry. The explicit lookup raises
# KeyError for an id missing from the mapping instead of producing NaN.
movies = movies.assign(
    movieId=movies.movieId.apply(lambda raw_id: movie2idx[raw_id])
).drop_duplicates()
print("{} unique movies in embeddings".format(len(movies.movieId.unique())))

# Index by the mapped id and order rows by it.
movies = movies.set_index('movieId', drop=True).sort_index(ascending=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jumanji (1995),Adventure|Children|Fantasy
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
# Look up a single movie by its exact title.
title_to_find = "Abbott and Costello Meet the Keystone Kops (1955)"
movies.loc[movies["title"] == title_to_find]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
15816,Abbott and Costello Meet the Keystone Kops (1955),Comedy


In [7]:
# Work on an independent (deep) copy of the loaded embeddings.
latent_df = embeddings.copy(deep=True)
print(latent_df.shape)
latent_df.head(n=3)

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.013344,0.23956,1.050268,0.0,0.023284,0.396033,0.0,0.692792,0.0,0.0,...,0.002741,0.0,0.265989,0.372963,0.0,0.0,0.0,0.711748,0.0,0.379516
1,0.0,0.172851,1.251562,0.47129,0.065289,0.398064,0.0074,1.077674,0.14371,0.0,...,0.0,0.040959,0.07553,0.0,0.0,0.0,0.0,0.555058,0.397117,0.333755
2,0.00101,0.925408,0.063987,0.0,0.133182,0.030352,0.104917,0.622351,0.045847,0.0,...,0.0,0.242329,0.0,0.422629,0.0,0.0,0.0,0.494329,0.182441,0.093916


# Find Similar Movies in Latent Space

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of latent_df.
# Rows and columns are labelled from latent_df's own index rather than the
# notebook-level `movie_id` list: that name is rebound to a single int by a
# later cell, which would break this cell when it is re-run afterwards.
sim_labels = latent_df.index.tolist()
cosine_sim = pd.DataFrame(
    cosine_similarity(X=latent_df),
    index=sim_labels,
    columns=sim_labels,
)

# Show only the first rows; the full matrix is square in the number of movies.
cosine_sim.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26734,26735,26736,26737,26738,26739,26740,26741,26742,26743
0,1.000000,0.700087,0.586348,0.593992,0.652156,0.539630,0.612356,0.617107,0.637966,0.629171,...,0.407517,0.372937,0.025203,0.563334,0.025203,0.025203,0.630526,0.701835,0.594525,0.495798
1,0.700087,1.000000,0.597531,0.535820,0.554938,0.514646,0.505120,0.523794,0.627651,0.538821,...,0.344387,0.474425,0.039783,0.519243,0.039783,0.039783,0.626526,0.654736,0.568123,0.527450
2,0.586348,0.597531,1.000000,0.599405,0.589186,0.456458,0.511229,0.662328,0.743096,0.663474,...,0.237062,0.404815,0.094312,0.537385,0.094312,0.094312,0.570910,0.525705,0.475387,0.672953
3,0.593992,0.535820,0.599405,1.000000,0.790583,0.561083,0.588293,0.676711,0.713948,0.670496,...,0.413193,0.461029,0.047253,0.472880,0.047253,0.047253,0.731429,0.581131,0.552833,0.528951
4,0.652156,0.554938,0.589186,0.790583,1.000000,0.549374,0.552415,0.702432,0.604131,0.543392,...,0.409971,0.350977,0.061885,0.508594,0.061885,0.061885,0.670990,0.591727,0.514200,0.595905
5,0.539630,0.514646,0.456458,0.561083,0.549374,1.000000,0.515964,0.579095,0.621392,0.546978,...,0.526528,0.397352,0.047491,0.521035,0.047491,0.047491,0.606124,0.541269,0.631014,0.559000
6,0.612356,0.505120,0.511229,0.588293,0.552415,0.515964,1.000000,0.546431,0.604044,0.686374,...,0.504253,0.496210,0.028379,0.558113,0.028379,0.028379,0.676638,0.622861,0.591546,0.574315
7,0.617107,0.523794,0.662328,0.676711,0.702432,0.579095,0.546431,1.000000,0.764983,0.681061,...,0.342390,0.452272,0.092219,0.527479,0.092219,0.092219,0.639430,0.577125,0.474195,0.707591
8,0.637966,0.627651,0.743096,0.713948,0.604131,0.621392,0.604044,0.764983,1.000000,0.700858,...,0.311106,0.529578,0.051555,0.507326,0.051555,0.051555,0.627781,0.533979,0.471670,0.599772
9,0.629171,0.538821,0.663474,0.670496,0.543392,0.546978,0.686374,0.681061,0.700858,1.000000,...,0.361959,0.582110,0.047813,0.507032,0.047813,0.047813,0.661561,0.574182,0.622002,0.493544


In [9]:
def find_similar_movies(movie_id, cosine_sim, movies):
    """Rank the rows of ``movies`` by their similarity to one movie.

    Parameters
    ----------
    movie_id : label
        Row label of ``cosine_sim`` identifying the query movie.
    cosine_sim : pandas.DataFrame
        Similarity matrix. Row ``movie_id`` is selected with ``.loc``; its
        column labels are matched against the index of ``movies``.
    movies : pandas.DataFrame
        Movie details, indexed by the same labels as ``cosine_sim``'s columns.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus a ``sim_score`` column, restricted to
        labels present in both inputs (inner join) and ordered by descending
        ``sim_score``. Tied scores keep the row order produced by the join.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a row label of ``cosine_sim``.
    """
    # Similarity score vector for the requested movie, as a one-column frame.
    sim_scores = cosine_sim.loc[movie_id].rename("sim_score").to_frame()

    # Attach the scores to the movie details; the inner join on the index drops
    # movies that have no score.
    sim_df = pd.merge(movies, sim_scores, left_index=True, right_index=True)

    # Single, stable sort (mergesort) so equal scores are ordered deterministically.
    return sim_df.sort_values(by="sim_score", ascending=False, kind="mergesort")

In [11]:
# Movie to query. A dedicated name is used so the notebook-level `movie_id`
# list (the embedding row labels created when the embeddings are loaded) is
# not overwritten by a single int.
# query_movie_id = 3006   # primer
# query_movie_id = 1195   # grease
# query_movie_id = 131    # LOTR
# query_movie_id = 2087   # inception
query_movie_id = 3995  # zodiac
# query_movie_id = 23877  # forgotten (1 rating)
# query_movie_id = 15816  # rated 19 times

find_similar_movies(query_movie_id, cosine_sim, movies).head(100)

Unnamed: 0,title,genres,sim_score
3995,Zodiac (2007),Crime|Drama|Thriller,1.000000
2571,Hannibal Rising (2007),Drama|Horror|Thriller,0.803062
643,Natural Born Killers (1994),Action|Crime|Thriller,0.798921
8002,Balance (1989),Animation|Drama|Mystery|Sci-Fi|Thriller,0.793809
3438,Barefoot in the Park (1967),Comedy,0.792481
16218,Kiwi! (2006),Action|Animation,0.792138
16245,Mulberry Street (2006),Action|Horror|Thriller,0.784865
2656,Inside Llewyn Davis (2013),Drama,0.784815
25393,Evidence (1995),Documentary,0.784388
19503,"Fall of the House of Usher, The (Zánik domu Us...",Animation,0.783707
