In [1]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests

In [2]:
def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())

    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


def find_similar_movies(movie_id, X, k, movie_mapper, movie_inv_mapper, metric='cosine', show_distance=False):
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k += 1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [3]:
def separate_train_test(movie_db):
    freq_db = movie_db.groupby("userId").filter(lambda x: len(x) >= 5)
    freq_db['rank_latest'] = freq_db.groupby(['userId'])['rating'].rank(method='first', ascending=False)
    
    train = freq_db[freq_db['rank_latest'] != 1]
    test  = freq_db[freq_db['rank_latest'] == 1]
    
    return train, test

In [4]:
def train(train_data):
    ratings = train_data
    X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


In [5]:
rating =  pd.read_csv("rating.csv", names= ["userId", "movieId", "rating"] , header = None)
db_train, db_test = separate_train_test(rating)
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = train(db_train)

In [6]:
db_test.head()

Unnamed: 0,userId,movieId,rating,rank_latest
6,291504,pretty+woman+1990,4,1.0
13,597885,invasion+of+the+body+snatchers+1956,5,1.0
16,851114,rush+hour+1998,4,1.0
17,408433,the+bourne+identity+2002,5,1.0
18,923906,richard+iii+1995,5,1.0


In [7]:
num_rec = 20
def simple_check(X, movie_mapper, movie_inv_mapper, user_id, target_movie):
    movie_id = db_train[db_train['userId'] == user_id].iloc[-1]['movieId']
    similar_ids = find_similar_movies(movie_id, X, num_rec, movie_mapper, movie_inv_mapper)
#     rec = ",".join(similar_ids)
    return target_movie in similar_ids

In [8]:
total_smaple = 10000
small_db_test = db_test.sample(n = total_smaple)
small_db_test['hit'] = small_db_test.apply(lambda f:
                             simple_check(X, movie_mapper, movie_inv_mapper, f['userId'], f['movieId']), axis  =1)

In [13]:
hit_rate = (small_db_test[small_db_test['hit']== True].count() / total_smaple)[0]

In [17]:
print('hit rate @ {} is {}'.format(num_rec, hit_rate))


hit rate @ 20 is 0.0904


In [18]:
hit_rate

0.0904