In [39]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k
from recommenders.datasets.pandas_df_utils import filter_by, negative_feedback_sampler

# Import repo's evaluation metrics
from recommenders.evaluation.python_evaluation import (
    precision_at_k, recall_at_k)

from recommenders.utils.timer import Timer
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics, prepare_test_df, prepare_all_predictions,
    compare_metric, similar_users, similar_items)

print("System version: {}".format(sys.version))
print("LightFM version: {}".format(lightfm.__version__))

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
LightFM version: 1.16


In [40]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 8
# regularisation for both user and item features
ITEM_ALPHA=1e-6
USER_ALPHA=1e-6

# seed for pseudonumber generations
SEEDNO = 42

In [44]:
COL_USER = "userID"
COL_ITEM = "itemID"
COL_RATING = "rating"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"

root_dir = '/opt/ml/input/data/train/'
df = pd.read_csv(os.path.join(root_dir,'train_ratings.csv'), names=[COL_USER, COL_ITEM, COL_TIMESTAMP], header=0)
df.head()

Unnamed: 0,userID,itemID,timestamp
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [45]:
negative_sampling = negative_feedback_sampler(df[[COL_USER, COL_ITEM]], col_user=COL_USER, col_item=COL_ITEM)
negative_sampling.head()

Unnamed: 0,userID,itemID,feedback
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1


In [46]:
years = pd.read_csv('./years_fixed.tsv', sep='\t', names=[COL_ITEM, 'year'], header=0)
years

Unnamed: 0,itemID,year
0,1348,1922
1,44587,1922
2,4768,1922
3,8235,1923
4,8609,1923
...,...,...
6802,7065,1915
6803,7243,1916
6804,8511,1917
6805,32898,1902


In [56]:
df = negative_sampling.rename({'feedback' :'rating'}, axis=1)

In [57]:
data = pd.merge(df, years, how='left', on="itemID")
data = data[['userID', 'itemID', 'rating', 'year']]
data

Unnamed: 0,userID,itemID,rating,year
0,11,4643,1,2001
1,11,170,1,1995
2,11,531,1,1993
3,11,616,1,1970
4,11,2140,1,1982
...,...,...,...,...
10308937,138493,7114,0,1965
10308938,138493,7121,0,1949
10308939,138493,7132,0,1935
10308940,138493,7143,0,2003


## Prepare data

In [58]:
dataset = Dataset()

In [59]:
dataset.fit(users=data['userID'], 
            items=data['itemID'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 31360, num_topics: 6807.


In [60]:
(interactions, weights) = dataset.build_interactions(data.iloc[:, 0:3].values)

In [61]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEEDNO))

In [62]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (31360, 6807)
Shape of test interactions: (31360, 6807)


## LightFM model

In [63]:
interactions

<31360x6807 sparse matrix of type '<class 'numpy.int32'>'
	with 10308942 stored elements in COOrdinate format>

In [64]:
model1 = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEEDNO))

In [65]:
model1.fit(interactions=train_interactions,
          epochs=NO_EPOCHS)

<lightfm.lightfm.LightFM at 0x7f7c06ab6a00>

## Prepare model evaluation data

In [66]:
uids, iids, interaction_data = cross_validation._shuffle(
    interactions.row, interactions.col, interactions.data, 
    random_state=np.random.RandomState(SEEDNO))

cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)

In [67]:
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

In [68]:
with Timer() as test_time:
    test_df = prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights)
print(f"Took {test_time.interval:.1f} seconds for prepare and predict test data.")  
time_reco1 = test_time.interval

KeyboardInterrupt: 

In [None]:
test_df.sample(5)

In [26]:
test_df

Unnamed: 0,userID,itemID,rating
0,26766,56801,1.0
1,124661,4963,1.0
2,76281,911,1.0
3,86783,31878,1.0
4,91620,19,1.0
...,...,...,...
1288613,41499,59258,1.0
1288614,59324,2000,1.0
1288615,131961,5959,1.0
1288616,115044,597,1.0


# Prepare

In [27]:
users, items, preds = [], [], []

In [28]:
item = list(data.itemID.unique())

In [29]:
for user in data.userID.unique():
    user = [user] * len(item)
    users.extend(user)
    items.extend(item)

In [30]:
len(users)

213467520

In [32]:
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
all_predictions["uid"] = all_predictions.userID.map(uid_map)
all_predictions["iid"] = all_predictions.itemID.map(iid_map)

KeyboardInterrupt: 

# recommend_k

## Evaluation

In [69]:
with Timer() as test_time:
    eval_recall_lfm = lightfm_recall_at_k(model1, test_interactions, 
                                          train_interactions, k=K).mean()
time_lfm = test_time.interval
    
print(
    f"Recall@K:\t{eval_recall_lfm:.6f}", 
    sep='\n')

Recall@K:	0.001495
