In [13]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests


def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())

    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


def find_similar_movies(movie_id, X, k, movie_mapper, movie_inv_mapper, metric='cosine', show_distance=False):
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k += 1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

def train():
    rating =  pd.read_csv("rating.csv", names= ["userId", "movieId", "rating"] , header = None)
    ratings = rating
    X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

def cache(X, movie_mapper, movie_inv_mapper):
    print("Starting the cache")
    rating = pd.read_csv("rating.csv",
                         names=["userId", "movieId", "rating"], header=None)
    # For all users in rating

    api_dict = {
        "api_key": "",
        "user_recommendations": []
    }
    count = 0
    for user_id in rating['userId'].unique():
        try:
            movie_id = rating[rating['userId'] == user_id].iloc[-1]['movieId']
            similar_ids = find_similar_movies(movie_id, X, 50, movie_mapper, movie_inv_mapper)
            rec = ",".join(similar_ids)
            api_dict["user_recommendations"].append({"user_id": str(user_id), "recommend": rec})

        except Exception as e:
            print(f"{type(e).__name__} at line {e.__traceback__.tb_lineno} of {__file__}: {e}")

In [14]:
%%time

# CSR metric generation ~ Training time (8 seconds)
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = train()
# cache(X, movie_mapper, movie_inv_mapper)


CPU times: user 6.29 s, sys: 241 ms, total: 6.53 s
Wall time: 6.56 s


In [19]:
import gc
import sys

def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))

        # Lookup all the object referred to by the object in obj_q.
        # See: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))

        # Filter object that are already marked.
        # Using dict notation will prevent repeated objects.
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}

        # The new obj_q will be the ones that were not marked,
        # and we will update marked with their ids so we will
        # not traverse them again.
        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz

In [20]:
print("Size of the Model : " + str(get_obj_size(X)) + " bytes")

Size of the Model : 55420 bytes


In [21]:
rating = pd.read_csv("rating.csv",
                     names=["userId", "movieId", "rating"], header=None)
def simple_cache(X, movie_mapper, movie_inv_mapper, user_id):
    movie_id = rating[rating['userId'] == user_id].iloc[-1]['movieId']
    similar_ids = find_similar_movies(movie_id, X, 50, movie_mapper, movie_inv_mapper)
    rec = ",".join(similar_ids)
    return rec

In [22]:
test_df = rating
test_df = test_df.filter(['userId', 'movieId', 'rating']).query("rating >= 5")
test_df = test_df.groupby('userId').agg({'movieId':lambda x: set(x)}).reset_index()

def f(x):
    """
    lambda function
    :param x:
    """
    movieIds = x['movieId']
    if len(movieIds) >= 10:
        return 'no'
    else:
        return 'yes'

test_df['drop'] = test_df.apply(lambda x: f(x), axis = 1)

test_df = test_df.filter(['userId', 'movieId', 'drop']).query("drop == 'no'")
test_df.head()
print(len(test_df))

3027


In [23]:

%%time
test_df = rating.sample(n = 1000)
def f(x):
    """
    lambda function
    :param x:
    """
    user_id = x['userId']
    movieIds = list(x['movieId'])
    rec = simple_cache(X, movie_mapper, movie_inv_mapper, user_id)
    for m in movieIds:
        if m in rec:
            return "yes"
        else:
            return "no"


test_df['yes_no'] = test_df.apply(lambda x: f(x), axis = 1)
test_df.head()


test_df.head()



CPU times: user 3min 26s, sys: 20.3 s, total: 3min 46s
Wall time: 3min 40s


Unnamed: 0,userId,movieId,rating,yes_no
1782981,665406,austin+powers+the+spy+who+shagged+me+1999,3,yes
920129,3767,jerry+maguire+1996,2,yes
1047713,444572,wild+wild+west+1999,3,yes
1092892,836453,102+dalmatians+2000,1,yes
690828,179007,escape+from+new+york+1981,5,yes


In [24]:
yes_df = test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'yes'")
no_df = test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'no'")

In [2]:
acc = len(yes_df) / (len(yes_df) + len(no_df))

print("Accuracy = " + str(acc))

Accuracy = 0.86


In [14]:
%%time

!curl http://128.2.205.123:8082/recommend/163627

the+shawshank+redemption+1994,the+silence+of+the+lambs+1991,forrest+gump+1994,pulp+fiction+1994,the+usual+suspects+1995,dances+with+wolves+1990,apollo+13+1995,the+fugitive+1993,jurassic+park+1993,braveheart+1995,pretty+woman+1990,terminator+2+judgment+day+1991,speed+1994,the+godfather+1972,batman+1989,ghost+1990,sleepless+in+seattle+1993,four+weddings+and+a+funeral+1994,the+lion+king+1994,mrs.+doubtfire+1993CPU times: user 4.2 ms, sys: 10.2 ms, total: 14.4 ms
Wall time: 159 ms
