In [1]:
import sys
import os
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from preprocessing import *
from IPython.display import HTML



In [2]:
# Read the ratings CSV into a DataFrame and preview its first rows.
ratings = pd.read_csv('./datasets/ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Read the movies CSV into a DataFrame and preview its first rows.
movies = pd.read_csv('./datasets/ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
    """Pivot a long ratings table into a user x item interaction matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table holding one row per rating.
    user_col : str
        Column of ``df`` whose values become the row index of the result.
    item_col : str
        Column of ``df`` whose values become the columns of the result.
    rating_col : str
        Column of ``df`` that is summed per (user, item) pair.
    norm : bool, default False
        When True, binarise the matrix: 1 where the summed rating is strictly
        greater than ``threshold``, otherwise 0.
    threshold : number, optional
        Cut-off used when ``norm`` is True; ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``user_col`` with one column per item. Pairs that
        never occur in ``df`` are filled with 0.

    Raises
    ------
    ValueError
        If ``norm`` is True and no ``threshold`` is supplied.
    """
    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)  # user ids end up as the index again
    )
    if norm:
        if threshold is None:
            raise ValueError("threshold must be provided when norm=True")
        # Vectorised equivalent of mapping `1 if x > threshold else 0` per cell.
        interactions = (interactions > threshold).astype(int)
    return interactions

In [5]:
# Build the user x movie matrix of raw ratings. `threshold` is only consulted
# when norm=True, so it does not affect this call; it is passed as a number
# (rather than the string '3') so ratings would be compared numerically if
# norm were switched on.
interactions = create_interaction_matrix(
    df=ratings,
    user_col='userId',
    item_col='movieId',
    rating_col='rating',
    threshold=3,
)
interactions.shape

(610, 9724)

In [6]:
# Preview the first rows of the interaction matrix (rows: userId, columns: movieId).
interactions.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
def create_user_dict(interactions):
    """Map every index label of ``interactions`` to its 0-based row position.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Frame whose index labels identify users.

    Returns
    -------
    dict
        ``{index_label: row_position}``. If a label occurs more than once, the
        position of its last occurrence is kept.
    """
    return {label: position for position, label in enumerate(interactions.index)}

In [8]:
# Map each userId in the interaction matrix index to its row position.
user_dict = create_user_dict(interactions=interactions)

In [9]:
def create_item_dict(df, id_col, name_col):
    """Build a lookup from item id to item name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per item.
    id_col : str
        Column of ``df`` providing the dictionary keys.
    name_col : str
        Column of ``df`` providing the dictionary values.

    Returns
    -------
    dict
        ``{id: name}`` in row order. If an id occurs more than once, the name
        from its last row is kept.
    """
    # Pair the two columns positionally so the result does not depend on the
    # frame having a default 0..n-1 index (label lookups by row number break
    # on filtered or re-indexed frames).
    return dict(zip(df[id_col], df[name_col]))

In [10]:
# Lookup table from movieId to title.
movies_dict = create_item_dict(df=movies, id_col='movieId', name_col='title')

In [11]:
def run_mf(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=12,
           random_state=None):
    """Fit a LightFM model on a dense user x item interaction frame.

    Parameters
    ----------
    interactions : pandas.DataFrame
        User x item matrix; its values are converted to a CSR sparse matrix.
    n_components : int, default 30
        Passed to LightFM as ``no_components``.
    loss : str, default 'warp'
        Passed to LightFM as ``loss``.
    k : int, default 15
        Passed to LightFM as ``k``.
    epoch : int, default 30
        Number of epochs passed to ``fit``.
    n_jobs : int, default 12
        Passed to ``fit`` as ``num_threads``.
    random_state : int, numpy.random.RandomState or None, default None
        Passed to LightFM as ``random_state`` so a run can be seeded. ``None``
        keeps the previous unseeded behaviour.
        NOTE(review): with ``n_jobs`` > 1 results may still differ between
        runs - confirm against the LightFM documentation.

    Returns
    -------
    LightFM
        The fitted model.
    """
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components=n_components, loss=loss, k=k,
                    random_state=random_state)
    model.fit(x, epochs=epoch, num_threads=n_jobs)
    return model

In [12]:
# Fit the matrix-factorisation model on the interaction matrix.
mf_model = run_mf(
    interactions=interactions,
    n_components=30,
    loss='warp',
    k=15,
    epoch=30,
    n_jobs=12,
)

In [13]:
def sample_recomendation_user(model, interactions, user_id, user_dict,
                             item_dict, threshold=0, nrec_items=10, show=True):
    """Recommend items for one user, skipping items the user already rated highly.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_index, item_indices)``.
    interactions : pandas.DataFrame
        User x item matrix; index labels are user ids, columns are item ids.
    user_id : hashable
        Label of the user in ``interactions.index`` and key of ``user_dict``.
    user_dict : dict
        Maps user id to the positional user index passed to ``model.predict``.
    item_dict : dict
        Maps item id to a printable item name.
    threshold : number, default 0
        Items whose interaction value for this user is strictly greater than
        this are treated as known and excluded from the recommendations.
    nrec_items : int, default 10
        Maximum number of recommendations to return.
    show : bool, default True
        When truthy, print the known items and the recommendations.

    Returns
    -------
    list
        Up to ``nrec_items`` item ids ordered by descending predicted score.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from ``user_dict`` or ``interactions``, or
        an item id to be displayed is missing from ``item_dict``.
    """
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]

    # Score every item for this user and rank the item ids best-first.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)))
    scores.index = interactions.columns
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user rated above the threshold, highest item id first.
    user_row = interactions.loc[user_id, :]
    known_items = list(
        pd.Series(user_row[user_row > threshold].index).sort_values(ascending=False)
    )

    # Set membership keeps the filter linear instead of O(items * known).
    known_lookup = set(known_items)
    return_score_list = [
        item for item in ranked_items if item not in known_lookup
    ][:nrec_items]

    known_names = [item_dict[item] for item in known_items]
    recommended_names = [item_dict[item] for item in return_score_list]

    if show:
        print('Known likes:')
        for rank, name in enumerate(known_names, start=1):
            print(f'{rank}- {name}')

        print('\n Recomended items:')
        for rank, name in enumerate(recommended_names, start=1):
            print(f'{rank}- {name}')
    return return_score_list

In [14]:
# Recommend 10 movies for user 10, treating ratings above 4 as already-known likes.
rec_list = sample_recomendation_user(
    model=mf_model,
    interactions=interactions,
    user_id=10,
    user_dict=user_dict,
    item_dict=movies_dict,
    threshold=4,
    nrec_items=10,
)

Known likes:
1- The Intern (2015)
2- Spectre (2015)
3- The Hundred-Foot Journey (2014)
4- Frozen (2013)
5- Skyfall (2012)
6- Best Exotic Marigold Hotel, The (2011)
7- Intouchables (2011)
8- Dark Knight Rises, The (2012)
9- King's Speech, The (2010)
10- Despicable Me (2010)
11- Education, An (2009)
12- Proposal, The (2009)
13- Twilight (2008)
14- Dark Knight, The (2008)
15- Priceless (Hors de prix) (2006)
16- Holiday, The (2006)
17- Casino Royale (2006)
18- Batman Begins (2005)
19- Hitch (2005)
20- First Daughter (2004)
21- Notebook, The (2004)
22- Troy (2004)
23- Chasing Liberty (2004)
24- Legally Blonde (2001)
25- Shrek (2001)

 Recomended items:
1- Love Actually (2003)
2- Monsters, Inc. (2001)
3- Harry Potter and the Deathly Hallows: Part 2 (2011)
4- Tangled (2010)
5- Up (2009)
6- Pirates of the Caribbean: The Curse of the Black Pearl (2003)
7- Devil Wears Prada, The (2006)
8- Finding Nemo (2003)
9- 27 Dresses (2008)
10- Walk to Remember, A (2002)


In [15]:
def sample_recomendation_item(model, interactions, item_id, user_dict, item_dict, number_of_user):
    """Return the users with the highest predicted score for a given item.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_indices, item_indices)``.
    interactions : pandas.DataFrame
        User x item matrix; index labels are user ids, columns are item ids.
    item_id : hashable
        Label of the item in ``interactions.columns``.
    user_dict : dict
        Unused; kept so existing callers keep working.
    item_dict : dict
        Unused; kept so existing callers keep working.
    number_of_user : int
        Number of users to return.

    Returns
    -------
    list
        User ids (labels from ``interactions.index``) ordered by descending
        predicted score.

    Raises
    ------
    KeyError
        If ``item_id`` is not a column of ``interactions``.
    ValueError
        If ``item_id`` matches more than one column.
    """
    n_users, n_items = interactions.shape
    # Exact label lookup: unlike a binary search it works on unsorted columns
    # and fails loudly for an unknown item instead of scoring a different one.
    item_x = interactions.columns.get_loc(item_id)
    if not isinstance(item_x, (int, np.integer)):
        raise ValueError(f"item_id {item_id!r} matches more than one column")
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_x, n_users)))
    # `scores` has a positional index, so the top labels are row positions.
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    user_list = list(interactions.index[top_positions])
    return user_list

In [16]:
# The 15 users with the highest predicted score for movieId 1.
sample_recomendation_item(
    model=mf_model,
    interactions=interactions,
    item_id=1,
    user_dict=user_dict,
    item_dict=movies_dict,
    number_of_user=15,
)

[557, 399, 507, 423, 347, 379, 512, 8, 569, 340, 394, 565, 498, 455, 170]

In [17]:
def create_item_embedding_distance_matrix(model, interactions):
    """Compute pairwise cosine similarity between the model's item embeddings.

    Parameters
    ----------
    model : object
        Fitted model exposing an ``item_embeddings`` array.
    interactions : pandas.DataFrame
        Frame whose columns supply the item labels for the result.

    Returns
    -------
    pandas.DataFrame
        Square frame of cosine similarities, labelled on both axes with
        ``interactions.columns``. Despite the function name, the values are
        similarities, not distances.
    """
    item_labels = interactions.columns
    embeddings = sparse.csr_matrix(model.item_embeddings)
    return pd.DataFrame(
        cosine_similarity(embeddings),
        index=item_labels,
        columns=item_labels,
    )

In [18]:
# Item-by-item cosine similarity table derived from the fitted model's embeddings.
item_item_dist = create_item_embedding_distance_matrix(
    model=mf_model,
    interactions=interactions,
)

In [19]:
def item_item_recommendation(item_embedding_distance_matrix, item_id, item_dict, n_items = 10, show = True):
    """Return the items most similar to ``item_id``.

    Parameters
    ----------
    item_embedding_distance_matrix : pandas.DataFrame
        Square item x item frame of similarity scores (higher = more similar),
        labelled with item ids on both axes.
    item_id : hashable
        Label of the item of interest.
    item_dict : dict
        Maps item id to a printable item name; only read when ``show`` is truthy.
    n_items : int, default 10
        Maximum number of similar items to return.
    show : bool, default True
        When truthy, print the item of interest and its neighbours.

    Returns
    -------
    list
        Up to ``n_items`` item ids ordered by descending similarity, never
        including ``item_id`` itself.

    Raises
    ------
    KeyError
        If ``item_id`` is not a row label of the matrix, or a displayed item
        is missing from ``item_dict``.
    """
    similarities = item_embedding_distance_matrix.loc[item_id, :]
    # Remove the query item by label rather than assuming it sorts first:
    # on ties or float rounding another item can outrank it, which would
    # return the item itself and discard a genuine neighbour.
    neighbours = similarities.drop(labels=item_id, errors='ignore')
    recommended_items = list(
        neighbours.sort_values(ascending=False).head(n_items).index
    )
    if show:
        print(f'Item of interest : {item_dict[item_id]}')
        print('Item similar to the above item:')
        for rank, item in enumerate(recommended_items, start=1):
            print(f'{rank}- {item_dict[item]}')
    return recommended_items

In [20]:
# The 10 movies most similar to movieId 1 according to the similarity table.
rec_list = item_item_recommendation(
    item_embedding_distance_matrix=item_item_dist,
    item_id=1,
    item_dict=movies_dict,
    n_items=10,
)

Item of interest : Toy Story (1995)
Item similar to the above item:
1- Forrest Gump (1994)
2- Babe (1995)
3- Jurassic Park (1993)
4- Aladdin (1992)
5- Mrs. Doubtfire (1993)
6- Back to the Future (1985)
7- Star Wars: Episode IV - A New Hope (1977)
8- Liar Liar (1997)
9- Nightmare Before Christmas, The (1993)
10- Home Alone (1990)
