In [1]:
import pickle
import json
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from utils import MOVIES, MOVIES_DF, RATINGS, RATINGS_DF

In [None]:
len(RATINGS.columns.unique())

In [6]:
RATINGS

Unnamed: 0,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.5,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.5,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.5,0.0,0.0,4.0,4.5,0.0,0.0,4.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def build_model_neighbors(metric: str = "cosine", n_jobs: int = -1) -> str:
    """
    Function to build and save a recommender model using Nearest Neighbors.
    """
    # Initialize the NearestNeighbors model
    model = NearestNeighbors(metric=metric, n_jobs=n_jobs)
    print(
        "Nearest Neighbors model instantiated with following hyperparameters:\n"
        f"metric={metric}\n"
        f"n_jobs={n_jobs}\n\n"
        "Starting to fit.\n"
    )

    # Fit it to the Ratings matrix
    model.fit(RATINGS)

    # Print reconstruction error
    print("Nearest neighbor model built.")

    # Save model
    file_name = "./models/similarity_model.pkl"
    
    with open(file_name, "wb") as file:
        pickle.dump(model, file)

    return file_name


build_model_neighbors()

In [None]:
user_query = json.load(open("user_query.json"))


df_query = pd.DataFrame(
        user_query, columns=MOVIES_DF['movieId'].unique(),
        index=[0]
    )

In [None]:
df_query.fillna(0)

In [None]:
RATINGS_DF.movieId.values

In [2]:
from utils import cos_sim_model

In [25]:
# Calculate the distances to other users
user_query = json.load(open("user_query.json"))
user_query.keys()

dict_keys(['Toy Story (1995)', 'Bottle Rocket (1996)', 'From Dusk Till Dawn (1996)', 'Tombstone (1993)', 'Grumpier Old Men (1995)'])

In [4]:
user_dataframe = pd.DataFrame(user_query, columns=MOVIES, index=["new_user"])
user_dataframe = user_dataframe.fillna(0)
user_dataframe

Unnamed: 0,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,3,4,0,0,0,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
similarity_scores, neighbor_ids = cos_sim_model.kneighbors(
    user_dataframe, n_neighbors=5, return_distance=True
)

similarity_scores, neighbor_ids

(array([[0.76485443, 0.81488994, 0.81981251, 0.82115131, 0.82150101]]),
 array([[ 95, 186, 397, 127,  96]]))

In [11]:
user_dataframe.columns[neighbor_ids[0]]

Index(['Rocky (1976)', 'Dark Knight, The (2008)', 'Last Action Hero (1993)',
       'Logan's Run (1976)', 'Labyrinth (1986)'],
      dtype='object')

In [12]:
# Save ids and scores in a DataFrame and sort it
df_neighbors = pd.DataFrame(
    data={
        "neighbor_id": neighbor_ids[0],
        "neighbor_title": user_dataframe.columns[neighbor_ids[0]],
        "similarity_score": similarity_scores[0],
    }
)

df_neighbors

Unnamed: 0,neighbor_id,neighbor_title,similarity_score
0,95,Rocky (1976),0.764854
1,186,"Dark Knight, The (2008)",0.81489
2,397,Last Action Hero (1993),0.819813
3,127,Logan's Run (1976),0.821151
4,96,Labyrinth (1986),0.821501


In [13]:
df_neighbors.sort_values("similarity_score", ascending=False, inplace=True)

In [14]:
df_neighbors

Unnamed: 0,neighbor_id,neighbor_title,similarity_score
4,96,Labyrinth (1986),0.821501
3,127,Logan's Run (1976),0.821151
2,397,Last Action Hero (1993),0.819813
1,186,"Dark Knight, The (2008)",0.81489
0,95,Rocky (1976),0.764854


In [26]:
# Calculate CSR Matrix (R) and convert do Dataframe
df_r = pd.read_csv('data/user_item_matrix.csv', index_col=0)


Unnamed: 0,Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",Braveheart (1995),Rob Roy (1995),Desperado (1995),Billy Madison (1995),Clerks (1994),Dumb & Dumber (Dumb and Dumber) (1994),Ed Wood (1994),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
95,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,4.0,5.0,0.0,3.0,0.0,3.0,3.0,5.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Filter to only show similar users and filter out movies rated by the user
neighborhood_filtered = df_r.iloc[neighbor_ids[0]].drop(user_query.keys(), axis=1)

# Multiply the ratings with the similarity score of each user and
# calculate the summed up rating for each movie
df_score = neighborhood_filtered.apply(
    lambda x: df_neighbors.set_index("neighbor_id").loc[x.index][
        "similarity_score"
    ]
    * x
)
df_score_ranked = (
    df_score.sum(axis=0).reset_index().sort_values(0, ascending=False)
)
df_score_ranked.reset_index(drop=True, inplace=True)

df_score_ranked.iloc[:10, 0]

0    Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
1         Independence Day (a.k.a. ID4) (1996)
2                     Leaving Las Vegas (1995)
3                                  Heat (1995)
4                    Mr. Holland's Opus (1995)
5                   Mission: Impossible (1996)
6                               Twister (1996)
7              Star Trek: First Contact (1996)
8           Father of the Bride Part II (1995)
9                          Broken Arrow (1996)
Name: index, dtype: object