In [1]:
%load_ext autoreload
%autoreload 2

import logging
import numpy as np
import pandas as pd
import scrapbook as sb
from sklearn.preprocessing import minmax_scale

from recommenders.utils.python_utils import binarize
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var
)
from recommenders.models.sar import SAR
import sys

# Record the interpreter and pandas versions used for this run.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

System version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
Pandas version: 1.0.1


In [2]:
# Number of top items considered per user; passed as `k` to recall_at_k
# when the recommendations are evaluated.
TOP_K = 10

# NOTE(review): leftover from a MovieLens-based template. The ratings are read
# from a CSV file in the next cell, and this selector is not referenced by any
# visible cell.
# Select MovieLens data size: 100k, 1m, 10m, or 20m
#MOVIELENS_DATA_SIZE = '100k'

In [3]:
# Load the interaction log, give every row a constant rating of 1, and rename
# the columns to the userID / itemID / rating / timestamp layout used by the
# cells below.
df = (
    pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")
    .assign(rating=1)
    .loc[:, ["user", "item", "rating", "time"]]
    .rename(columns={"user": "userID", "item": "itemID", "time": "timestamp"})
)
df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,11,4643,1,1230782529
1,11,170,1,1230782534
2,11,531,1,1230782539
3,11,616,1,1230782542
4,11,2140,1,1230782563


In [6]:
# Store the rating column as float32; all other columns keep their dtype.
df = df.astype({"rating": np.float32})

df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,11,4643,1.0,1230782529
1,11,170,1.0,1230782534
2,11,531,1.0,1230782539
3,11,616,1.0,1230782542
4,11,2140,1.0,1230782563


In [8]:
# Stratified split that keeps 75% of the rows for training; the fixed seed
# makes the partition reproducible.
train, test = python_stratified_split(
    df,
    ratio=0.75,
    col_user='userID',
    col_item='itemID',
    seed=42,
)

In [9]:
# Report row, user and item counts for both partitions of the split.
split_report = """
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
"""

# Collect the template fields for each partition under a "<split>_<stat>" key.
split_counts = {}
for split_label, split_frame in (("train", train), ("test", test)):
    split_counts[split_label + "_total"] = len(split_frame)
    split_counts[split_label + "_users"] = len(split_frame["userID"].unique())
    split_counts[split_label + "_items"] = len(split_frame["itemID"].unique())

print(split_report.format(**split_counts))


Train:
Total Ratings: 3865715
Unique Users: 31360
Unique Items: 6807

Test:
Total Ratings: 1288756
Unique Users: 31360
Unique Items: 6807



In [10]:
# Configure root logging so the library's progress messages appear in the
# cell output with a timestamp and a padded level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.DEBUG,
)

# SAR settings: column mapping plus similarity, time-decay and normalization
# options.
sar_params = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    similarity_type="jaccard",
    # NOTE(review): presumably the decay half-life in days — confirm against
    # the SAR documentation.
    time_decay_coefficient=30,
    timedecay_formula=True,
    normalize=True,
)
model = SAR(**sar_params)

In [11]:
# Fit SAR on the training partition and report the elapsed wall-clock time.
with Timer() as train_time:
    model.fit(train)

print(f"Took {train_time.interval} seconds for training.")

2022-04-07 04:53:58,280 INFO     Collecting user affinity matrix
2022-04-07 04:53:58,648 INFO     Calculating time-decayed affinities
2022-04-07 04:54:01,653 INFO     Creating index columns
2022-04-07 04:54:08,470 INFO     Calculating normalization factors
2022-04-07 04:54:17,376 INFO     Building user affinity sparse matrix
2022-04-07 04:54:18,117 INFO     Calculating item co-occurrence
2022-04-07 04:54:30,445 INFO     Calculating item similarity
2022-04-07 04:54:30,459 INFO     Using jaccard based similarity
2022-04-07 04:54:38,795 INFO     Done training


Took 40.63527191497269 seconds for training.


In [12]:
# Score the users found in `test` and keep the best TOP_K items for each.
# top_k is passed explicitly so the length of the recommendation list always
# matches the k used by recall_at_k; previously it silently relied on the
# library default. remove_seen=True drops items already seen during training
# (the log reports "Removing seen items").
with Timer() as test_time:
    top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

print("Took {} seconds for prediction.".format(test_time.interval))

2022-04-07 04:54:38,853 INFO     Calculating recommendation scores
2022-04-07 04:56:20,212 INFO     Removing seen items


Took 107.15762954100501 seconds for prediction.


In [13]:
# Preview the first recommendations (userID, itemID, prediction score).
top_k.head()

Unnamed: 0,userID,itemID,prediction
0,11,48516,0.573137
1,11,58559,0.564522
2,11,48780,0.561596
3,11,49272,0.548742
4,11,44191,0.547767


In [16]:
# Sanity check: the row count is expected to be roughly
# (number of users) x (items recommended per user).
top_k.shape

(313600, 3)

In [18]:
# Export the recommendations including the prediction score.
# NOTE(review): a later cell writes top_k_result (without the prediction
# column) to this same path, so this file gets overwritten.
top_k.to_csv("/opt/ml/input/code/output/SAR.csv", index=False)

In [14]:
# Recall@K of the recommendation list measured against the held-out test rows.
eval_recall = recall_at_k(
    test,
    top_k,
    k=TOP_K,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
)

In [19]:
# Print one "label<TAB>value" line per reported quantity.
report_lines = [
    "Model:\t",
    "Top K:\t%d" % TOP_K,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(report_lines))

Model:	
Top K:	10
Recall@K:	0.098125


In [30]:
# Work on an independent copy so the scored recommendations in top_k stay intact.
top_k_result = top_k.copy(deep=True)

In [31]:
# Display the copied frame (the rendered output truncates the middle rows).
top_k_result

Unnamed: 0,userID,itemID,prediction
0,11,48516,0.573137
1,11,58559,0.564522
2,11,48780,0.561596
3,11,49272,0.548742
4,11,44191,0.547767
...,...,...,...
313595,138493,6539,0.898386
313596,138493,2762,0.895871
313597,138493,1580,0.895684
313598,138493,1,0.892493


In [32]:
# Keep only the (userID, itemID) pairs for the exported CSV.
# drop(..., errors="ignore") returns a new frame and makes this cell safe to
# re-run; the in-place `del` raised KeyError once the column was already gone.
top_k_result = top_k_result.drop(columns=["prediction"], errors="ignore")

In [33]:
# Confirm that only the userID and itemID columns remain.
top_k_result

Unnamed: 0,userID,itemID
0,11,48516
1,11,58559
2,11,48780
3,11,49272
4,11,44191
...,...,...
313595,138493,6539
313596,138493,2762
313597,138493,1580
313598,138493,1


In [34]:
# Write the final (userID, itemID) recommendations; index=False keeps the row
# index out of the CSV.
# NOTE(review): absolute path — the output directory presumably has to exist
# before this runs.
top_k_result.to_csv("/opt/ml/input/code/output/SAR.csv", index=False)

In [35]:
# Confirm the row count is unchanged and that two columns are left.
top_k_result.shape

(313600, 2)