In [71]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# display/HTML/clear_output belong to the public IPython.display API;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML, clear_output
# Inject CSS so the notebook container uses 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Load Embeddings Data

In [72]:
# Data files are resolved relative to the notebook's working directory;
# `cwd` is reused by later cells that load the CSV files.
cwd = os.getcwd()
embeddings = pd.read_pickle(
    os.path.join(cwd, "..", "data", "movie_embeddings_1.pkl")
)

# Row labels of the embeddings frame as a plain Python list.
movie_id = embeddings.index.values.tolist()

print(embeddings.shape)
embeddings.head(n=5)

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.362656,1.223114,0.158125,0.098262,-0.511146,0.751987,-0.083907,-0.655189,0.124555,0.398819,...,0.252393,-0.72638,0.095431,0.080516,-0.01616,0.231056,-0.469438,0.385305,0.046974,0.750637
1,-2.101079,1.202782,-1.978461,-0.782652,-0.233708,-2.028353,-0.797505,2.007848,-1.210389,-0.513851,...,-0.320754,0.995709,0.110728,-0.372982,-1.763203,-1.167182,-0.712131,0.787086,0.321424,-1.016185
2,-0.739998,1.639906,-0.821546,-0.810773,0.262396,-2.618929,-1.665061,1.418108,-0.288657,-0.164132,...,-0.749115,0.654003,0.201651,0.267222,-0.420872,0.687322,0.186,2.218797,-0.221609,-0.206429
3,0.23362,1.345427,0.200785,-1.054063,-0.793839,-2.866071,-0.859098,2.171351,-1.371101,-0.124353,...,-0.355255,0.351026,0.25468,-0.528827,-0.650816,-0.855221,0.356243,1.785845,-1.214038,-0.027784
4,-0.642298,2.164207,-0.219435,-0.764872,-0.822317,-3.844967,-0.121182,3.398263,-1.629255,-0.188076,...,-1.022821,0.926114,-0.006677,-1.271328,-0.895705,-0.809579,1.378056,0.476175,-0.452644,-0.829564


In [73]:
# Load the index mapping (looked up by raw movieId when the movies frame is built).
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted data.
with open('../data/movie_to_idx.pkl', 'rb') as mapping_file:
    movie2idx = pickle.load(mapping_file)

In [74]:
# Load movie metadata and the ratings table from the shared data directory.
movies = pd.read_csv(os.path.join(cwd, "..", "data", "movies.csv"))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings.csv"))
rated_movie_ids = ratings.movieId.unique()
print("{} unique movies in ratings.csv".format(len(rated_movie_ids)))

# Keep only movies that have at least one rating. Filtering with `isin`
# selects the same movies as an inner merge against `ratings`, but without
# materialising one metadata row per rating before de-duplicating.
# `movie2idx.__getitem__` is used instead of passing the dict to `.map` so
# that an id missing from the mapping still raises KeyError rather than
# silently becoming NaN.
movies = (
    movies.loc[movies.movieId.isin(rated_movie_ids), ['movieId', 'title', 'genres']]
    .assign(movieId=lambda frame: frame.movieId.map(movie2idx.__getitem__))
    .drop_duplicates()
)
print("{} unique movies in embeddings".format(len(movies.movieId.unique())))

# Index the metadata by the mapped id and order it by that id.
movies = movies.set_index('movieId', drop=True).sort_index(ascending=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jumanji (1995),Adventure|Children|Fantasy
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [75]:
# Spot-check: look up a single title in the metadata frame; the row's index
# label is the mapped id that the similarity lookup further down takes as input.
movies.query('title == "Abbott and Costello Meet the Keystone Kops (1955)"')

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
15816,Abbott and Costello Meet the Keystone Kops (1955),Comedy


In [76]:
# Work on an independent (deep) copy so the loaded embeddings stay untouched.
latent_df = embeddings.copy(deep=True)
print(latent_df.shape)
latent_df.head(n=3)

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.362656,1.223114,0.158125,0.098262,-0.511146,0.751987,-0.083907,-0.655189,0.124555,0.398819,...,0.252393,-0.72638,0.095431,0.080516,-0.01616,0.231056,-0.469438,0.385305,0.046974,0.750637
1,-2.101079,1.202782,-1.978461,-0.782652,-0.233708,-2.028353,-0.797505,2.007848,-1.210389,-0.513851,...,-0.320754,0.995709,0.110728,-0.372982,-1.763203,-1.167182,-0.712131,0.787086,0.321424,-1.016185
2,-0.739998,1.639906,-0.821546,-0.810773,0.262396,-2.618929,-1.665061,1.418108,-0.288657,-0.164132,...,-0.749115,0.654003,0.201651,0.267222,-0.420872,0.687322,0.186,2.218797,-0.221609,-0.206429


# Find Similar Movies in Latent Space

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the latent frame.
# `latent_df` is used directly as the input matrix so the cell runs on a
# fresh kernel, and the labels are taken from the frame itself rather than
# from the `movie_id` global, which a later cell rebinds to a single integer.
# NOTE: the result is a dense n x n frame (n = number of rows in latent_df),
# so memory use grows quadratically with the number of movies.
movie_labels = latent_df.index.tolist()
cosine_sim = pd.DataFrame(
    cosine_similarity(X=latent_df.values),
    index=movie_labels,
    columns=movie_labels,
)

# Show only the first rows instead of the whole matrix.
cosine_sim.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26734,26735,26736,26737,26738,26739,26740,26741,26742,26743
0,1.000000,0.082784,0.006515,0.010640,0.144085,0.506335,0.124655,0.177713,0.082790,0.295171,...,-0.415457,-0.397386,0.075512,-0.271049,-0.470222,-0.411412,-0.385708,-0.351270,-0.034305,-0.054166
1,0.082784,1.000000,0.704350,0.547471,0.606900,0.245261,0.173210,0.685775,0.340246,0.289518,...,0.329212,0.286965,0.063598,0.359374,0.261190,0.215887,0.361939,0.323160,0.448288,0.445782
2,0.006515,0.704350,1.000000,0.718857,0.615183,0.224087,0.251828,0.752440,0.426732,0.290872,...,0.238112,0.120766,-0.038856,0.204142,0.149019,0.120314,0.260325,0.218208,0.404134,0.438316
3,0.010640,0.547471,0.718857,1.000000,0.766596,0.071520,0.421927,0.710511,0.541375,0.553781,...,0.146343,0.033400,-0.002178,0.120711,0.025095,0.023466,0.135550,0.105349,0.189287,0.306506
4,0.144085,0.606900,0.615183,0.766596,1.000000,0.174800,0.455064,0.686856,0.435658,0.669934,...,0.026296,-0.074263,0.011178,0.044235,-0.069371,-0.080398,0.028583,-0.031429,0.239549,0.347995
5,0.506335,0.245261,0.224087,0.071520,0.174800,1.000000,0.109236,0.259015,0.209341,0.221882,...,-0.094718,-0.093509,0.095770,0.053266,-0.119182,-0.001423,-0.058860,0.064115,0.296319,0.188135
6,0.124655,0.173210,0.251828,0.421927,0.455064,0.109236,1.000000,0.163695,0.310211,0.424576,...,0.034941,-0.072295,-0.055030,0.049758,-0.123606,-0.122201,0.007387,-0.072654,0.237812,0.248969
7,0.177713,0.685775,0.752440,0.710511,0.686856,0.259015,0.163695,1.000000,0.482255,0.439168,...,0.110518,0.024279,-0.049496,0.111680,0.027760,-0.046322,0.169192,0.125665,0.314283,0.473284
8,0.082790,0.340246,0.426732,0.541375,0.435658,0.209341,0.310211,0.482255,1.000000,0.251705,...,0.120654,0.030841,0.061287,0.199808,0.001947,0.083210,0.227676,0.195655,0.280607,0.296788
9,0.295171,0.289518,0.290872,0.553781,0.669934,0.221882,0.424576,0.439168,0.251705,1.000000,...,-0.062742,-0.115405,0.086077,-0.024954,-0.167846,-0.141034,-0.080198,-0.122030,0.052172,0.194411


In [78]:
def find_similar_movies(movie_id, cosine_sim, movies):
    """Rank movies by their similarity to one query movie.

    Parameters
    ----------
    movie_id : hashable
        Row label of the query movie in ``cosine_sim``.
    cosine_sim : pandas.DataFrame
        Similarity matrix; the row labelled ``movie_id`` is read and its
        column labels are matched against the index of ``movies``.
    movies : pandas.DataFrame
        Movie metadata whose index uses the same labels as the columns of
        ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus a ``sim_score`` column, restricted to
        labels present in both inputs and sorted by ``sim_score`` in
        descending order. The query movie itself is included. Neither input
        is modified.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a row label of ``cosine_sim``.
    """
    # Similarity scores of the requested movie as a one-column frame.
    sim_scores = cosine_sim.loc[movie_id].rename("sim_score").to_frame()

    # Inner join on the index attaches the metadata; sorting once after the
    # join is sufficient, so no pre-sort of the scores is needed.
    sim_df = pd.merge(movies, sim_scores, left_index=True, right_index=True)
    return sim_df.sort_values(by="sim_score", ascending=False)

In [80]:
# Example query movies: short label -> index label used by `cosine_sim` / `movies`.
example_movies = {
    "primer": 3006,
    "grease": 1195,
    "LOTR": 131,
    "inception": 2087,
    "zodiac": 3995,
    "forgotten": 23877,  # has a single rating
    "abbott_costello": 15816,  # rated 19 times
}

# Pick one query movie by label instead of stacking overwritten assignments.
# NOTE(review): this rebinds `movie_id`, which the embeddings-loading cell
# set to a list of labels; consider a distinct name if no other cell relies on it.
movie_id = example_movies["LOTR"]

find_similar_movies(movie_id, cosine_sim, movies).head(100)

Unnamed: 0,title,genres,sim_score
131,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,1.000000
158,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,0.972380
142,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,0.958212
998,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,0.857283
30,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,0.843314
9,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.837398
186,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,0.813750
2086,Band of Brothers (2001),Action|Drama|War,0.784859
12,"Shawshank Redemption, The (1994)",Crime|Drama,0.776003
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0.766484
