In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
import sys
sys.executable

'/home/ayeghiazaryan/anaconda3/envs/my_conda_env/bin/python3'

In [3]:
ratings = pd.read_csv('./ml-1m/ratings.dat', names = ['userId','movieId','rating','timestamp'], delimiter='::')
movies = pd.read_csv('./ml-1m/movies.dat', names = ['movieId','title','genres'], delimiter='::')
users = pd.read_csv('./ml-1m/users.dat', names = ['userId','gender','age','occupation','zip-code'], delimiter='::')

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.movieId.nunique(), ratings.userId.nunique()

(3706, 6040)

In [8]:
ratings.shape, movies.shape

((1000209, 4), (3883, 3))

In [9]:
from lightfm.data import Dataset

In [12]:
#create a Dataset object, which is compatible with lightfm
dataset = Dataset()
dataset.fit((x for x in ratings.userId),
            (x for x in ratings.movieId))

In [13]:
#shape of the dataset, should be (n_users, n_movies)
dataset.interactions_shape()

(6040, 3706)

In [14]:
#build the interactions in the Dataset object.
#simply feed the (user,movie) tuple for which interactions took place
#'weights' can be ignored for the current dataset
(interactions, weights) = dataset.build_interactions(((row[0], row[1])
                                                      for row in np.array(ratings[['userId', 'movieId']])))

In [15]:
interactions

<6040x3706 sparse matrix of type '<class 'numpy.int32'>'
	with 1000209 stored elements in COOrdinate format>

In [16]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split

In [17]:
#split the data into train and test sets
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(777))

In [20]:
from lightfm import LightFM

In [21]:
#fit the model with bpr loss
model = LightFM(loss='bpr')
model.fit(interactions, epochs=10)

<lightfm.lightfm.LightFM at 0x7f6c097f19b0>

In [23]:
from lightfm.evaluation import precision_at_k

In [24]:
#calculate precision at k=10 for train and test sets
#precision at k basically is the fraction of known positives in the first k positions of the ranked list of predictions
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.54, test 0.13.
