In [19]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# Import from the public IPython.display module; the IPython.core.display
# path for these names is deprecated and emits a warning on import.
from IPython.display import display, HTML, clear_output

# Widen the classic-notebook container so wide DataFrames are easier to read.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Load Content Embeddings

In [20]:
# Read the pickled autoencoder (content) embeddings from ../data, relative to
# the notebook's working directory, and wrap them in a DataFrame.
cwd = os.getcwd()
content_embeddings = pd.DataFrame(
    pd.read_pickle(
        os.path.join(cwd, "..", "data", "autoencoder_embeddings.pkl")
    )
)
print(content_embeddings.shape)
content_embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.013344,0.23956,1.050268,0.0,0.023284,0.396033,0.0,0.692792,0.0,0.0,...,0.002741,0.0,0.265989,0.372963,0.0,0.0,0.0,0.711748,0.0,0.379516
1,0.0,0.172851,1.251562,0.47129,0.065289,0.398064,0.0074,1.077674,0.14371,0.0,...,0.0,0.040959,0.07553,0.0,0.0,0.0,0.0,0.555058,0.397117,0.333755
2,0.00101,0.925408,0.063987,0.0,0.133182,0.030352,0.104917,0.622351,0.045847,0.0,...,0.0,0.242329,0.0,0.422629,0.0,0.0,0.0,0.494329,0.182441,0.093916
3,0.0,0.94424,0.607588,0.368766,0.166914,0.0,0.226098,0.0,0.481809,0.0,...,0.0,0.0,0.124727,0.465437,0.014911,0.0,0.0,0.605971,0.80661,0.387506
4,0.0,0.734243,0.627565,0.150901,0.304607,0.236698,0.051425,0.408785,0.367336,0.586794,...,0.0,0.0,0.08308,0.182237,0.0,0.0,0.0,0.08075,0.826122,0.54248


# Load Collaborative Embeddings

In [21]:
# Read the pickled collaborative-filtering movie embeddings from ../data,
# relative to the notebook's working directory.
cwd = os.getcwd()
collaborative_embeddings = pd.read_pickle(
    os.path.join(cwd, "..", "data", "movie_embeddings_1.pkl")
)
print(collaborative_embeddings.shape)
collaborative_embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.362656,1.223114,0.158125,0.098262,-0.511146,0.751987,-0.083907,-0.655189,0.124555,0.398819,...,0.252393,-0.72638,0.095431,0.080516,-0.01616,0.231056,-0.469438,0.385305,0.046974,0.750637
1,-2.101079,1.202782,-1.978461,-0.782652,-0.233708,-2.028353,-0.797505,2.007848,-1.210389,-0.513851,...,-0.320754,0.995709,0.110728,-0.372982,-1.763203,-1.167182,-0.712131,0.787086,0.321424,-1.016185
2,-0.739998,1.639906,-0.821546,-0.810773,0.262396,-2.618929,-1.665061,1.418108,-0.288657,-0.164132,...,-0.749115,0.654003,0.201651,0.267222,-0.420872,0.687322,0.186,2.218797,-0.221609,-0.206429
3,0.23362,1.345427,0.200785,-1.054063,-0.793839,-2.866071,-0.859098,2.171351,-1.371101,-0.124353,...,-0.355255,0.351026,0.25468,-0.528827,-0.650816,-0.855221,0.356243,1.785845,-1.214038,-0.027784
4,-0.642298,2.164207,-0.219435,-0.764872,-0.822317,-3.844967,-0.121182,3.398263,-1.629255,-0.188076,...,-1.022821,0.926114,-0.006677,-1.271328,-0.895705,-0.809579,1.378056,0.476175,-0.452644,-0.829564


# Format Movie Lookup Data

In [None]:
# Load the mapping from raw movieId values to the remapped ids used as the
# index of the movie lookup table.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/movie_to_idx.pkl', 'rb') as mapping_file:
    movie2idx = pickle.load(mapping_file)

In [22]:
# Build the movie lookup table: one row per rated movie, indexed by the
# remapped movieId (via movie2idx), with title, genres, number of ratings
# and mean rating.
movies = pd.read_csv(os.path.join(cwd, "..", "data", "movies.csv"))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings.csv"))
print("{} unique movies in ratings.csv".format(len(ratings.movieId.unique())))

# Aggregate the ratings once per movie *before* joining to the metadata.
# Joining the full ratings table onto the metadata and de-duplicating
# afterwards produces the same per-movie numbers but materializes one row
# per rating several times over.
ratings_by_movie = ratings.groupby("movieId")
movie_stats = pd.DataFrame({
    "ratings_count": ratings_by_movie["userId"].count(),
    "avg_rating": ratings_by_movie["rating"].mean(),
}).reset_index()

# Inner join: keep only movies that have at least one rating.
movies = pd.merge(movies, movie_stats, on="movieId", how="inner")

# Translate raw movieId values to the remapped ids. A plain dict lookup is
# used so an unmapped id still raises KeyError instead of silently becoming NaN.
movies["movieId"] = [movie2idx[raw_id] for raw_id in movies["movieId"]]

#get popularity (most-rated movies first)
popularity = movies[['title', 'movieId', 'ratings_count']].sort_values(
    'ratings_count', ascending=False
)

#get average ratings
average_ratings = movies[['title', 'movieId', 'avg_rating']]

movies = movies[['movieId', 'title', 'genres', 'ratings_count', 'avg_rating']].drop_duplicates()
print("{} unique movies in embeddings".format(len(movies.movieId.unique())))
movies = movies.set_index('movieId', drop=True).sort_index(ascending=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 4)


Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Jumanji (1995),Adventure|Children|Fantasy,22243,3.211977
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,8520,3.95223
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,44980,3.898055
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,43249,4.053493
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,47006,4.334372


In [23]:
# Spot-check a single title in the lookup table.
movies.loc[movies["title"] == "Zodiac (2007)"]

Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454


# Concatenate Latent Tags and Embeddings

In [None]:
from sklearn import preprocessing

# Place the content and collaborative embeddings side by side (column-wise),
# then standardize every column of the combined matrix with StandardScaler.
x = np.concatenate(
    [np.asarray(content_embeddings), np.asarray(collaborative_embeddings)],
    axis=1,
)
scaler = preprocessing.StandardScaler().fit(x)
x_scaled = scaler.transform(x)
latent_df = pd.DataFrame(x_scaled)

In [None]:
import os
import sys

# Put the sibling `movie_recommender` directory on sys.path so the
# `similarity` module can be imported from the notebook.
# A dedicated temporary name is used (and removed) instead of reassigning and
# deleting `cwd`: the data-loading cells read the global `cwd`, and deleting
# it here makes re-running them fail with NameError.
recommender_dir = os.path.join(os.getcwd(), '..', 'movie_recommender')
if recommender_dir not in sys.path:
    sys.path.append(recommender_dir)
del recommender_dir

from similarity import SimilarityPredictions

In [None]:
# Similarity model over the ensemble vectors: latent_df holds the
# column-standardized concatenation of content and collaborative embeddings.
# NOTE(review): SimilarityPredictions comes from the local `similarity` module,
# which is not shown here — what the constructor precomputes is not confirmed.
sim_model_ens = SimilarityPredictions(embeddings=latent_df)

In [None]:
# Similarity model over the content (autoencoder) embeddings only, passed as
# loaded, i.e. without the StandardScaler step applied to the ensemble.
sim_model_cont = SimilarityPredictions(embeddings=content_embeddings)

In [None]:
# Similarity model over the collaborative embeddings only, passed as loaded,
# i.e. without the StandardScaler step applied to the ensemble.
sim_model_coll = SimilarityPredictions(embeddings=collaborative_embeddings)

In [None]:
# Seed movie for the recommendation cells. The value is a remapped movieId
# (the index of `movies`), not a raw movieId from movies.csv; e.g. 3995 is
# "Zodiac (2007)" in the lookup table. Uncomment one alternative to switch seed.
# movie_id = 3006   # Primer
# movie_id = 1195   # Grease
movie_id = 131 #LOTR
# movie_id = 2087   # Inception
# movie_id = 3995   # Zodiac
# movie_id = 23877  # forgotten (only 1 rating)
# movie_id = 15816  # rated 19 times

# Ensemble Recommendations

In [None]:
# Ask the ensemble model for n=20 items similar to the seed movie.
# The formatting code that consumes `output` expects records carrying
# `item_id` and `similarity_score` fields.
output = sim_model_ens.predict_similar_items(seed_item=movie_id, n=20)

In [None]:
# Attach title/genre/rating metadata to the ensemble results (joined on the
# movie id index) and list the most similar movies first.
similar_movies = pd.DataFrame(output).set_index('item_id')
sim_df = (
    movies
    .merge(similar_movies, left_index=True, right_index=True)
    .sort_values('similarity_score', ascending=False)
)
sim_df

# Content Recommendations

In [None]:
# Ask the content-only model for n=20 items similar to the seed movie.
# The formatting code that consumes `output` expects records carrying
# `item_id` and `similarity_score` fields.
output = sim_model_cont.predict_similar_items(seed_item=movie_id, n=20)

In [None]:
# Attach title/genre/rating metadata to the content-based results (joined on
# the movie id index) and list the most similar movies first.
similar_movies = pd.DataFrame(output).set_index('item_id')
sim_df = (
    movies
    .merge(similar_movies, left_index=True, right_index=True)
    .sort_values('similarity_score', ascending=False)
)
sim_df

# Collaborative Recommendations

In [None]:
# Ask the collaborative-only model for n=20 items similar to the seed movie.
# The formatting code that consumes `output` expects records carrying
# `item_id` and `similarity_score` fields.
output = sim_model_coll.predict_similar_items(seed_item=movie_id, n=20)

In [None]:
# Attach title/genre/rating metadata to the collaborative results (joined on
# the movie id index) and list the most similar movies first.
similar_movies = pd.DataFrame(output).set_index('item_id')
sim_df = (
    movies
    .merge(similar_movies, left_index=True, right_index=True)
    .sort_values('similarity_score', ascending=False)
)
sim_df