In [1]:
# Notebook setup: render plots inline and auto-reload edited local modules
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from functools import reduce
from IPython.display import HTML
from os.path import exists
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
import gc
import html
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import sys

sys.path.insert(0, '..')
from cf_utils import ALSRecommender
from cf_utils import DampedUserMovieBaselineModel
from cf_utils import KNNRecommender
from cf_utils import ndcg_from_df
from cf_utils import SGDRecommender

# Use custom matplotlib stylesheet
plt.style.use('./big-darkgrid.mplstyle')

In [3]:
# Load the ratings file: tab-separated with no header row. Only the first
# three columns are read, and they are named userId / movieId / rating.
RATINGS_COLUMNS = ['userId', 'movieId', 'rating']
ratings_df = pd.read_csv(
    'raw/ml-100k/u.data',
    sep='\t',
    header=None,
    usecols=[0, 1, 2],
    names=RATINGS_COLUMNS,
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
# Read every column as text so the ids keep their exact string form, then key
# the frame by movieId so a movie's external ids can be fetched with .loc.
# NOTE(review): the ratings are read from raw/ml-100k/u.data while this file's
# provenance is not visible here -- confirm both use the same movieId
# numbering, otherwise the posters shown will not match the rated movies.
links_raw = pd.read_csv('raw/links.csv', dtype=str)
links_df = links_raw.set_index('movieId')
links_df.head()

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862
2,113497,8844
3,113228,15602
4,114885,31357
5,113041,11862


In [5]:
# Three-fold cross-validation split, stratified on userId so that every fold
# keeps roughly the same mix of users.
#
# No random_state is passed: shuffle is left at its default (False), so the
# folds are already deterministic. Recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True (older releases
# silently ignored it), so dropping it keeps the same folds everywhere.
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits)

# Materialise the generator into a list of (train_inds, test_inds) tuples so
# that individual folds can be picked by position.
splits = list(skf.split(ratings_df, ratings_df['userId']))

In [6]:
# A single damped user+movie baseline instance is handed to both KNN models
# and to the ALS model.
baseline_algo = DampedUserMovieBaselineModel(damping_factor=10)

# One value shared by all four regularisation arguments of the SGD model.
reg = 0.0
sgd_reg_kwargs = dict.fromkeys(
    ('user_reg', 'item_reg', 'user_bias_reg', 'item_bias_reg'), reg
)

# Recommenders keyed by a short name.
models_dict = dict(
    user=KNNRecommender(mode='user', k=20, baseline_algo=baseline_algo),
    item=KNNRecommender(mode='item', k=10, baseline_algo=baseline_algo),
    als=ALSRecommender(k=5, lmbda=0.1, max_epochs=15, baseline_algo=baseline_algo, verbose=False),
    sgd=SGDRecommender(k=50, learning_rate=0.01, max_epochs=30, damping_factor=10,
                       **sgd_reg_kwargs, verbose=False),
)

In [7]:
# Fetch themoviedb.org's image configuration once; its base URL plus a size
# segment is the prefix for every poster URL built later.
#
# You'll need to go to themoviedb.org to set up an api_key if you want to run
# this. Export it as the TMDB_API_KEY environment variable so the real key is
# never saved in the notebook; the placeholder below is only used when that
# variable is not set.
api_key = os.environ.get('TMDB_API_KEY', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
headers = {'Accept': 'application/json'}
payload = {'api_key': api_key}
config_response = requests.get(
    "https://api.themoviedb.org/3/configuration",  # HTTPS: the key travels in the query string
    params=payload,
    headers=headers,
    timeout=10,  # don't let a stalled connection hang the notebook
)
# Surface an HTTP error (e.g. a rejected key) here rather than as an opaque
# KeyError on 'images' below.
config_response.raise_for_status()
# Parsed configuration, kept under the name `response` as before.
response = config_response.json()
images_config = response['images']
# Prefer the HTTPS image host when the API reports one; 'w185' is the image
# size segment appended to the base URL.
base_url = (images_config.get('secure_base_url') or images_config['base_url']) + 'w185'

In [8]:
def get_poster_url(movieId, base_url, links_df, api_key):
    """Return the URL of the first poster themoviedb.org lists for a movie.

    Parameters
    ----------
    movieId : int or float
        Movie id. It is normalised with ``str(int(movieId))`` before being
        looked up in ``links_df``, so float ids such as ``242.0`` work.
    base_url : str
        Prefix that is prepended to the poster's ``file_path``.
    links_df : pandas.DataFrame
        Frame indexed by string movie ids with a ``tmdbId`` column.
    api_key : str
        themoviedb.org API key, sent as the ``api_key`` query parameter.

    Returns
    -------
    str
        ``base_url`` followed by the ``file_path`` of the first poster.

    Raises
    ------
    KeyError
        If ``movieId`` is not present in the index of ``links_df``.
    IndexError
        If the API response lists no posters for the movie.
    requests.HTTPError
        If the API answers with an error status code.
    """
    movieId = str(int(movieId))
    # Look up the TMDB id (not the IMDB one) for this movie.
    tmdbId = links_df.loc[movieId, 'tmdbId']

    # Query themoviedb.org API for the movie's images.
    movie_url = 'https://api.themoviedb.org/3/movie/{:}/images'.format(tmdbId)
    headers = {'Accept': 'application/json'}
    payload = {'api_key': api_key}
    response = requests.get(movie_url, params=payload, headers=headers, timeout=10)
    # Fail with the HTTP status instead of a confusing KeyError on 'posters'.
    response.raise_for_status()

    posters = response.json().get('posters') or []
    if not posters:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(
            'themoviedb.org lists no posters for tmdbId {}'.format(tmdbId)
        )
    return base_url + posters[0]['file_path']

def display_posters(movieIds, base_url, links_df, api_key):
    """Show the posters of the given movies side by side in one HTML table row.

    Parameters
    ----------
    movieIds : iterable
        Movie ids; each one is resolved to a poster URL with
        ``get_poster_url``.
    base_url : str
        Poster URL prefix, forwarded to ``get_poster_url``.
    links_df : pandas.DataFrame
        Movie id lookup table, forwarded to ``get_poster_url``.
    api_key : str
        themoviedb.org API key, forwarded to ``get_poster_url``.

    Returns
    -------
    None
        The table is rendered through ``display(HTML(...))``.
    """
    poster_urls = [get_poster_url(movieId, base_url, links_df, api_key) for movieId in movieIds]
    TABLE = "<table style='width: 100%; align: center;'><tr>{}</tr></table>"
    # The src value is quoted and HTML-escaped so that a URL containing
    # characters such as '&', '=' or quotes cannot break the markup.
    CELL = "<td align='center'><img style='float: left; width: 120px' src='{}'></td>"
    cells = ''.join(CELL.format(html.escape(url, quote=True)) for url in poster_urls)
    display(HTML(TABLE.format(cells)))

In [9]:
def recommend(model, train_df, user, pretrained=False, k=3):
    """Return the ``k`` unseen movies with the highest predicted rating for a user.

    Parameters
    ----------
    model : object
        Recommender exposing ``fit(df)`` and ``predict(df)``.
    train_df : pandas.DataFrame
        Ratings; only the first three columns are used and they are treated
        as user, item and rating. The caller's frame is not modified.
    user : scalar
        Id of the user to recommend for.
    pretrained : bool, default False
        When False the model is fitted on ``train_df`` first; when True it is
        used as is.
    k : int, default 3
        Number of recommendations to return.

    Returns
    -------
    movies, preds : numpy.ndarray, numpy.ndarray
        Item ids and their predicted ratings, best first. Fewer than ``k``
        entries are returned when the user has fewer unseen movies.
    """
    # Work on a renamed copy so the caller's frame keeps its own column names.
    train_df = train_df.iloc[:, :3].copy()
    train_df.columns = ['user', 'item', 'rating']
    if not pretrained:
        model.fit(train_df)

    # Candidates are the items in the training data this user has not rated.
    seen_movies = train_df.loc[train_df['user'] == user, 'item'].unique()
    unseen_movies = list(set(train_df['item'].unique()) - set(seen_movies))
    user_movie_df = pd.DataFrame(
        {'user': [user] * len(unseen_movies), 'item': unseen_movies},
        columns=['user', 'item'],
    )
    user_movie_df['pred'] = model.predict(user_movie_df)

    # Honour `k` (previously the slice was hard-coded to 3 and `k` was ignored).
    top_k = user_movie_df.sort_values('pred', ascending=False).head(k)
    movies, preds = top_k[['item', 'pred']].values.T
    return movies, preds

## Movies this user likes

In [10]:
# Example user and the first train/test fold.
user = 100
train_inds, test_inds = splits[0]
train_df = ratings_df.iloc[train_inds]
test_df = ratings_df.iloc[test_inds]

# The user's five highest-rated rows in the training fold; column position 1
# is the movie id column of ratings_df.
user_ratings = train_df[train_df['userId'] == user]
user_ratings_sorted = user_ratings.sort_values('rating', ascending=False)
favorite_movies = user_ratings_sorted.iloc[:5, 1].values
display_posters(favorite_movies, base_url, links_df, api_key)

## Recommended movies

In [12]:
# Fit the SGD recommender on the training fold, then show the posters of the
# movies it ranks highest for `user`.
model = models_dict['sgd']
movies, preds = recommend(model, train_df, user, pretrained=False)
display_posters(movies, base_url, links_df, api_key)