In [1]:
import pandas as pd
import numpy as np
from surprise import dump
import json, io, time

In [2]:
## Retrive top 20 movies sorted by rating
# train
df_train = pd.read_csv('../Data/train.csv')
df_train = df_train[~df_train['rating'].isna()]
df_train = df_train.sort_values('rating',ascending=False).drop_duplicates(['user_id', 'movieid'])

# validation
df_val = pd.read_csv('../Data/val.csv')
df_val = df_val[~df_val['rating'].isna()]
df_val = df_val.sort_values('rating',ascending=False).drop_duplicates(['user_id', 'movieid'])

# top 20 movies required to recommend to unseen user
df_movie = pd.concat([df_train,df_val]).groupby('movieid')['rating'].agg(['sum','count'])
df_movie['sum_product'] = df_movie['sum'] * df_movie['count']
df_movie['avg_rating'] = df_movie['sum'] / df_movie['count']
top20_movie = df_movie.sort_values(by='avg_rating', ascending=False)[:20].index.tolist()

# save top 20 movies to json file
with open('../Data/top20_movie.json', 'w', encoding='utf-8') as f:
    json.dump(top20_movie, f, ensure_ascii=False)

# prediction function
def get_recommend_movie(model, user_id, n = 20):
    #if the user in the training set
    if (model.trainset.knows_user(user_id)):
        # get movie list
        movie_list = []
        for i in range (model.trainset.n_items):
            movie_list.append(model.trainset.to_raw_iid(i))

        # get predicted rating for every movie
        result = []
        for movie in movie_list:
            prediction = model.predict(user_id, movie)
            uid = prediction.uid
            movie = prediction.iid
            rating = prediction.est
            result.append((uid,movie,rating))

        # list of watched movie
        watched_movie = []
        for iid,_ in model.trainset.ur[user_id]:
            movie_name = model.trainset.to_raw_iid(iid)
            watched_movie.append(movie_name)
        
        result_df = pd.DataFrame(result, columns=['user_id','movie_id','predicted_rating']).sort_values(by = 'predicted_rating', ascending=False)
        result_df = result_df[~result_df['movie_id'].isin(watched_movie)][:n]
        return result_df.movie_id.values.tolist()
        
    # else return top 20 movies
    else:
        # load precalculated top 20 movie list
        with open('../Data/top20_movie.json', 'r') as f:
            data = f.read()    
        top20_movie = json.loads(data)

        return top20_movie
    

In [3]:
%%time
# load pre-trained model    
model = dump.load('./SVD_V1')[1]

CPU times: user 1.34 s, sys: 219 ms, total: 1.56 s
Wall time: 1.56 s


In [4]:
%%time
#seen user
get_recommend_movie(model, 23092, 20)

CPU times: user 95.5 ms, sys: 8.79 ms, total: 104 ms
Wall time: 107 ms


['the+usual+suspects+1995',
 'the+godfather+part+ii+1974',
 'rear+window+1954',
 'dr.+strangelove+or+how+i+learned+to+stop+worrying+and+love+the+bomb+1964',
 'louis+c.k.+shameless+2007',
 'schindlers+list+1993',
 'the+lives+of+others+2006',
 'aparajito+1956',
 'louis+c.k.+live+at+the+beacon+theater+2011',
 'lacombe_+lucien+1974',
 '42+up+1998',
 'amlie+2001',
 'the+wrong+trousers+1993',
 'life+is+beautiful+1997',
 'one+day+in+september+1999',
 'louis+c.k.+oh+my+god+2013',
 'the+sorrow+and+the+pity+1969',
 'the+intouchables+2011',
 'the+silence+of+the+lambs+1991',
 'some+folks+call+it+a+sling+blade+1994']

In [5]:
%%time
#unseen user
get_recommend_movie(model, 123243578924203546, 20)

CPU times: user 248 µs, sys: 320 µs, total: 568 µs
Wall time: 353 µs


['the+shawshank+redemption+1994',
 'the+usual+suspects+1995',
 'the+godfather+1972',
 'schindlers+list+1993',
 'rear+window+1954',
 'the+godfather+part+ii+1974',
 'the+lives+of+others+2006',
 'pulp+fiction+1994',
 'raiders+of+the+lost+ark+1981',
 'the+lord+of+the+rings+the+return+of+the+king+2003',
 'dr.+strangelove+or+how+i+learned+to+stop+worrying+and+love+the+bomb+1964',
 'the+lord+of+the+rings+the+fellowship+of+the+ring+2001',
 'star+wars+1977',
 'aparajito+1956',
 'the+lord+of+the+rings+the+two+towers+2002',
 'fight+club+1999',
 'the+empire+strikes+back+1980',
 'the+wrong+trousers+1993',
 'one+flew+over+the+cuckoos+nest+1975',
 'ordet+1955']

# Average prediction time computed over 100 prediction requests

In [6]:
time_list = []
for i in range(100):
    user_id = np.random.randint(low = 0, high = model.trainset.all_users()[-1])
    start_time = time.time()
    get_recommend_movie(model, user_id, 20)
    time_list.append((time.time() - start_time))
    # print("--- %s seconds ---" % (time.time() - start_time))

In [7]:
np.mean(time_list)

0.08785532474517822

In [8]:
np.median(time_list)

0.09083497524261475