In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
import sys
sys.executable

'/home/ayeghiazaryan/anaconda3/envs/my_conda_env/bin/python3'

In [3]:
ratings = pd.read_csv('./ml-1m/ratings.dat', names = ['userId','movieId','rating','timestamp'], delimiter='::')
movies = pd.read_csv('./ml-1m/movies.dat', names = ['movieId','title','genres'], delimiter='::')
users = pd.read_csv('./ml-1m/users.dat', names = ['userId','gender','age','occupation','zip-code'], delimiter='::')

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.movieId.nunique(), ratings.userId.nunique()

(3706, 6040)

In [8]:
ratings.shape, movies.shape

((1000209, 4), (3883, 3))

In [92]:
from lightfm.data import Dataset

In [93]:
dataset = Dataset()

In [94]:
dataset.fit((x for x in ratings.userId),
            (x for x in ratings.movieId), user_features=(str(x) for x in users.occupation))

In [95]:
dataset.interactions_shape()occupation

(6040, 3706)

In [96]:
(interactions, weights) = dataset.build_interactions((row[0], row[1])
                                                      for row in np.array(ratings[['userId', 'movieId']]))

In [97]:
interactions

<6040x3706 sparse matrix of type '<class 'numpy.int32'>'
	with 1000209 stored elements in COOrdinate format>

In [98]:
# user_features = [(i[0], [users['gender'][i[0]-1]]) for i in ratings.values]

In [99]:
# dataset.build_user_features(user_features)

In [100]:
user_features = dataset.build_user_features((x[0], [str(x[1])])
                                              for x in np.array(users[['userId', 'occupation']]))

In [101]:
dataset.user_features_shape()

(6040, 6061)

In [102]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split

In [103]:
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(777))

In [104]:
train, test

(<6040x3706 sparse matrix of type '<class 'numpy.int32'>'
 	with 800167 stored elements in COOrdinate format>,
 <6040x3706 sparse matrix of type '<class 'numpy.int32'>'
 	with 200042 stored elements in COOrdinate format>)

In [111]:
model = LightFM(loss='bpr', learning_rate=0.03, no_components=30)
model.fit(interactions, user_features=user_features, epochs=20)

<lightfm.lightfm.LightFM at 0x7fc14fab7390>

In [112]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

In [113]:
train_precision = precision_at_k(model, train, k=10, user_features=user_features).mean()
test_precision = precision_at_k(model, test, k=10, user_features=user_features).mean()

train_auc = auc_score(model, train, user_features=user_features).mean()
test_auc = auc_score(model, test, user_features=user_features).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.50, test 0.13.
AUC: train 0.88, test 0.87.


In [114]:
model = LightFM(loss='bpr', learning_rate=0.03, no_components=30)
model.fit(interactions, epochs=20)

<lightfm.lightfm.LightFM at 0x7fc14fb55710>

In [115]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.55, test 0.14.
AUC: train 0.89, test 0.88.


In [None]:
#copied from demo notebook without user/item features

In [58]:
movie_id_to_enum_dict = dataset.mapping()[2]
movie_enum_to_id_dict = {v:k for k,v in movie_id_to_enum_dict.items()}

In [52]:
movie_factors = model.get_item_representations()[1]

In [59]:
def closest_movies_to(movie_name):
    movie_id = movies[movies.title==movie_name]
    if len(movie_id) == 0: return 'No such movie found'
    ix = movie_id.movieId.iloc[0]
    movie_enum = movie_id_to_enum_dict[ix]
    
    movie_factor = movie_factors[movie_enum]
        
    norms = np.linalg.norm(movie_factors - movie_factor, axis=1)
    similar_enums = np.argsort(norms)[1:4]
    similar_ids = [movie_enum_to_id_dict[k] for k in similar_enums]
    
    
    return movies[movies.movieId.isin(similar_ids)]

In [61]:
closest_movies_to('Jumanji (1995)')

Unnamed: 0,movieId,title,genres
179,181,Mighty Morphin Power Rangers: The Movie (1995),Action|Children's
3420,3489,Hook (1991),Adventure|Fantasy
3807,3877,Supergirl (1984),Action|Adventure|Fantasy
