In [33]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import time
import gc
import sys

In [8]:
def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())

    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


def find_similar_movies(movie_id, X, k, movie_mapper, movie_inv_mapper, metric='cosine', show_distance=False):
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k += 1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [55]:
def separate_train_test(movie_db):
    train, test = train_test_split(movie_db, train_size=0.8, test_size=0.2)
    return train, test

In [56]:
def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}
        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz

In [11]:
def train(train_data):
    ratings = train_data
    X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


In [57]:
train_start_time = time.time()
rating =  pd.read_csv("data/rating.csv", names= ["userId", "movieId", "rating"] , header = None)
db_train, db_test = separate_train_test(rating)
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = train(db_train)
train_end_time = time.time()

In [58]:
db_test

Unnamed: 0,userId,movieId,rating
1020243,138940,pulp+fiction+1994,3
2003020,796543,desperado+1995,5
506509,736202,braindead+1992,3
1798342,909618,leaving+las+vegas+1995,4
1599870,381636,disclosure+1994,3
...,...,...,...
1301979,759250,toy+story+1995,5
1710192,913591,ray+2004,4
1206819,819948,romancing+the+stone+1984,5
1598297,572704,the+net+1995,5


In [67]:
test_df = db_test.filter(['userId', 'movieId', 'rating']).query("rating >= 5")
test_df = test_df.groupby('userId').agg({'movieId':lambda x: set(x)}).reset_index()

def f(x):
    """
    lambda function
    :param x:
    """
    movieIds = x['movieId']
    if len(movieIds) >= 4:
        return 'no'
    else:
        return 'yes'

test_df['drop'] = test_df.apply(lambda x: f(x), axis = 1)

test_df = test_df.filter(['userId', 'movieId', 'drop']).query("drop == 'no'")

In [68]:
def simple_cache(X, movie_mapper, movie_inv_mapper, user_id):
    movie_id = rating[rating['userId'] == user_id].iloc[-1]['movieId']
    similar_ids = find_similar_movies(movie_id, X, 50, movie_mapper, movie_inv_mapper)
    rec = ",".join(similar_ids)
    return rec

In [71]:
def f(x):
    """
    lambda function
    :param x:
    """
    try:
        user_id = x['userId']
        movieIds = list(x['movieId'])
        rec = simple_cache(X, movie_mapper, movie_inv_mapper, user_id)
        for m in movieIds:
            if m in rec:
                return "yes"
            else:
                return "no"
    except Exception:
        print("Exception")
        return "no"

test_start_time  = time.time()
test_df['yes_no'] = test_df.apply(lambda x: f(x), axis = 1)
test_df.head()


test_df.head()

Exception


Unnamed: 0,userId,movieId,drop,yes_no
126,1809,"{get+shorty+1995, rudy+1993, goldfinger+1964, ...",no,no
145,2174,"{tombstone+1993, casablanca+1942, swiss+family...",no,no
217,3156,{the+lord+of+the+rings+the+fellowship+of+the+r...,no,no
248,3800,"{blade+runner+1982, modern+times+1936, thirty+...",no,no
296,4651,"{no+country+for+old+men+2007, for+a+few+dollar...",no,no


In [80]:
misses = len(test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'yes'"))
hits = len(test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'no'"))
test_end_time = time.time()

print("Offline evaluation completed.")
print("Number of filtered testing sample : {}".format(len(test_df)))
print(
    "Ratio of train : test = {}% : {}%".format(round(len(db_train) * 100 / (len(db_train) + len(db_test)), 2),
                                               round((len(db_test) * 100 / (len(db_train) + len(db_test))), 2)))
print("Size of the Model : " + str(get_obj_size(X)) + " bytes")
print("Training Latency : {} seconds".format(train_end_time - train_start_time))
print("Testing Latency : {} seconds".format(test_end_time - test_start_time))
print("Number of hits : {}".format(hits))
print("Number of misses : {}".format(misses))
print("Hit Rate = {}%".format(round(hits * 100 / (misses + hits)), 2))
print("Miss Rate = {}%".format(round(misses * 100 / (misses + hits)), 2))

Offline evaluation completed.
Number of filtered testing sample : 1442
Ratio of train : test = 80.0% : 20.0%
Size of the Model : 53320 bytes
Training Latency : 6.419741868972778 seconds
Testing Latency : 1076.5595302581787 seconds
Number of hits : 1220
Number of misses : 222
Hit Rate = 85%
Miss Rate = 15%


In [82]:
# Offline evaluation completed.
# Number of filtered testing sample : 1442
# Ratio of train : test = 80.0% : 20.0%
# Size of the Model : 53320 bytes
# Training Latency : 6.419741868972778 seconds
# Testing Latency : 1076.5595302581787 seconds
# Number of hits : 1220
# Number of misses : 222
# Hit Rate = 85%
# Miss Rate = 15%