In [1]:
%load_ext autoreload
%autoreload 2

import logging
import numpy as np
import os
import pandas as pd
import scrapbook as sb
from sklearn.preprocessing import minmax_scale

from recommenders.utils.python_utils import binarize
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.datasets.pandas_df_utils import filter_by, negative_feedback_sampler
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var
)
from recommenders.models.sar import SAR
import sys

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
Pandas version: 1.4.1


In [2]:
# top k items to recommend
TOP_K = 10

In [35]:
COL_USER = "userID"
COL_ITEM = "itemID"
COL_RATING = "rating"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"

root_dir = '/opt/ml/input/data/train/'
data = pd.read_csv(os.path.join(root_dir,'train_ratings.csv'), names=[COL_USER, COL_ITEM, COL_TIMESTAMP], header=0)
data.head()

Unnamed: 0,userID,itemID,timestamp
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [27]:
negative_sampling = negative_feedback_sampler(data[[COL_USER, COL_ITEM]], col_user=COL_USER, col_item=COL_ITEM)
negative_sampling.head()

Unnamed: 0,userID,itemID,feedback
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1


In [36]:
df = negative_sampling
# data['rating'] = 1
df.rename(columns = {"feedback": "rating"}, inplace=True)
df

Unnamed: 0,userID,itemID,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1
...,...,...,...
10308937,138493,7114,0
10308938,138493,7121,0
10308939,138493,7132,0
10308940,138493,7143,0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10308942 entries, 0 to 10308941
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   userID  int64
 1   itemID  int64
 2   rating  int64
dtypes: int64(3)
memory usage: 236.0 MB


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column     Dtype
---  ------     -----
 0   userID     int64
 1   itemID     int64
 2   timestamp  int64
dtypes: int64(3)
memory usage: 118.0 MB


In [41]:
pd.concat([df, data], axis=1, joint_axes=["itemID"])

TypeError: concat() got an unexpected keyword argument 'joint_axes'

In [44]:
pd.merge(df,data,on="itemID", how="inner")

MemoryError: Unable to allocate 203. GiB for an array with shape (27240215329,) and data type int64

In [37]:
df.join(data, on="itemID", how="left")

ValueError: columns overlap but no suffix specified: Index(['userID', 'itemID'], dtype='object')

In [48]:
data['timestamp'] = 1

In [50]:
train, test = python_stratified_split(data, ratio=0.75, col_user='userID', col_item='itemID', seed=42)

In [51]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train['userID'].unique()),
    train_items=len(train['itemID'].unique()),
    test_total=len(test),
    test_users=len(test['userID'].unique()),
    test_items=len(test['itemID'].unique()),
))


Train:
Total Ratings: 7731662
Unique Users: 31360
Unique Items: 6807

Test:
Total Ratings: 2577280
Unique Users: 31360
Unique Items: 6807



# Train SAR Model

In [52]:
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SAR(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    timedecay_formula=True,
    normalize=True
)

In [53]:
with Timer() as train_time:
    model.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

2022-04-06 09:32:21,045 INFO     Collecting user affinity matrix
2022-04-06 09:32:21,293 INFO     Calculating time-decayed affinities
2022-04-06 09:32:25,615 INFO     Creating index columns
2022-04-06 09:32:35,672 INFO     Calculating normalization factors
2022-04-06 09:32:41,455 INFO     Building user affinity sparse matrix
2022-04-06 09:32:41,951 INFO     Calculating item co-occurrence
2022-04-06 09:33:00,953 INFO     Calculating item similarity
2022-04-06 09:33:00,955 INFO     Using jaccard based similarity
2022-04-06 09:33:02,620 INFO     Done training


Took 41.69205437367782 seconds for training.


In [54]:
with Timer() as test_time:
    top_k = model.recommend_k_items(test, remove_seen=True)

print("Took {} seconds for prediction.".format(test_time.interval))

2022-04-06 09:33:14,195 INFO     Calculating recommendation scores
2022-04-06 09:35:25,757 INFO     Removing seen items
  return self._with_data(self.data * other)


Took 134.6099784760736 seconds for prediction.


In [55]:
top_k.head()

Unnamed: 0,userID,itemID,prediction


In [56]:
top_k['prediction'].nunique()

0

# Evaluation

In [18]:
eval_recall = recall_at_k(test, top_k, col_user='userID', col_item='itemID', col_rating='rating', k=TOP_K)

In [19]:
print(eval_recall)

0.0007166686212385543
