# SAR

In [92]:
%%capture
import sys

import itertools
import logging
import os

import numpy as np
import pandas as pd
!pip install papermill
import papermill as pm

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.sar import SAR

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

In [93]:
TOP_K = 10

In [94]:
path = "https://5190-hav-recommendation-data.s3.us-east-1.amazonaws.com/ratings_Electronics.csv"
data = pd.read_csv(path, header=None)

data.columns = ['userId', 'productId', 'rating', 'timestamp']
data
#takes 10s or so

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,0321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,0439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,0439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200
...,...,...,...,...
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824478,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200


In [95]:
header = {
    "col_user": "userId",
    "col_item": "productId",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "col_prediction": "prediction",
}

In [96]:
data.sort_values(by=['timestamp'],inplace=True)

In [None]:
data

In [97]:
user_rating_count = data['userId'].value_counts().rename('user_rating_count')
augmented_amazon_data = data.merge(user_rating_count.to_frame(), left_on='userId', right_index=True)
subset_df = augmented_amazon_data[augmented_amazon_data.user_rating_count >= 50]
print(subset_df.shape)
subset_df.head()
#10s

(125871, 5)


Unnamed: 0,userId,productId,rating,timestamp,user_rating_count
12866,A335QXPTV1RIV1,B00000J4FS,5.0,939600000,55
537820,A335QXPTV1RIV1,B00063ZVJQ,4.0,1136073600,55
1032120,A335QXPTV1RIV1,B000I68BD4,2.0,1175126400,55
196941,A335QXPTV1RIV1,B00006IS50,4.0,1175126400,55
879844,A335QXPTV1RIV1,B000ET7AZK,5.0,1193702400,55


In [98]:
product_rating_counts = subset_df['productId'].value_counts().rename('product_rating_counts')
product_rating_data   = subset_df.merge(product_rating_counts.to_frame(),
                                left_on='productId',
                                right_index=True)
product_rating_data = product_rating_data[product_rating_data.product_rating_counts >= 10]
product_rating_data.head()

Unnamed: 0,userId,productId,rating,timestamp,user_rating_count,product_rating_counts
1032120,A335QXPTV1RIV1,B000I68BD4,2.0,1175126400,55,27
1035226,AT28RL25Q2OLK,B000I68BD4,5.0,1261958400,198,27
1034971,A3NCIN6TNL0MGA,B000I68BD4,4.0,1165017600,87,27
1032939,A1OMXVXXP07F05,B000I68BD4,4.0,1167955200,95,27
1032697,A10ZFE6YE0UHW8,B000I68BD4,4.0,1280707200,130,27


In [99]:
data=product_rating_data

In [100]:
data.drop(columns=['user_rating_count','product_rating_counts'],inplace=True)

In [101]:
data

Unnamed: 0,userId,productId,rating,timestamp
1032120,A335QXPTV1RIV1,B000I68BD4,2.0,1175126400
1035226,AT28RL25Q2OLK,B000I68BD4,5.0,1261958400
1034971,A3NCIN6TNL0MGA,B000I68BD4,4.0,1165017600
1032939,A1OMXVXXP07F05,B000I68BD4,4.0,1167955200
1032697,A10ZFE6YE0UHW8,B000I68BD4,4.0,1280707200
...,...,...,...,...
5903140,A3GX0FAMEXV6FB,B007SZ0E1K,5.0,1384387200
5903113,A362FM6FYA1SYS,B007SZ0E1K,5.0,1370563200
5902871,A3PDWA32BAMB2Y,B007SZ0E1K,5.0,1389225600
5903261,AEIB3UR05OP7A,B007SZ0E1K,5.0,1392163200


In [102]:
data.sort_values(by=['timestamp'],inplace=True)

In [103]:
ind=pd.DataFrame(data.userId.unique())
ind['index']=range(1, len(ind) + 1)
ind.rename(columns={0:'userId'},inplace=True)
ind
#data.join(ind,on='userId')
data=data.merge(ind,left_on='userId',right_on='userId')
data.drop(columns=['userId'],inplace=True)
data.rename(columns={'index':'userId'},inplace=True)

In [104]:
ind=pd.DataFrame(data.productId.unique())
ind['index']=range(1, len(ind) + 1)
ind.rename(columns={0:'productId'},inplace=True)


data=data.merge(ind,left_on='productId',right_on='productId')
data.drop(columns=['productId'],inplace=True)
data.rename(columns={'index':'productId'},inplace=True)
#data.join(ind,on='userId')

In [105]:
data=data[['userId','productId','rating','timestamp']]
data

Unnamed: 0,userId,productId,rating,timestamp
0,1,1,5.0,956966400
1,2,1,5.0,961200000
2,4,1,1.0,965347200
3,6,1,4.0,994204800
4,7,1,5.0,1006387200
...,...,...,...,...
42936,1216,2263,5.0,1315785600
42937,1246,2263,4.0,1341792000
42938,1301,2263,2.0,1338076800
42939,1346,2263,5.0,1339027200


In [113]:
late_df

Unnamed: 0,userId,productId,rating,timestamp
21470,447,932,4.0,1369785600
21471,508,932,5.0,1370476800
21472,625,932,4.0,1377907200
21473,666,932,5.0,1372291200
21474,848,932,4.0,1369785600
...,...,...,...,...
42936,1216,2263,5.0,1315785600
42937,1246,2263,4.0,1341792000
42938,1301,2263,2.0,1338076800
42939,1346,2263,5.0,1339027200


In [110]:
import math
early_df=data[0:int(len(data)/2)]
late_df=data[int(len(data)/2):]

In [87]:
train, test = python_stratified_split(data=data,ratio=0.70, col_user=header["col_user"], col_item=header["col_item"], seed=42)
#15s
#3s on simplest data

In [35]:
print(len(train))
len(test)

30040


12901

In [90]:
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SAR(
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [91]:
model.fit(train)

2023-04-26 20:38:55,353 INFO     Collecting user affinity matrix
2023-04-26 20:38:55,356 INFO     Calculating time-decayed affinities
2023-04-26 20:38:55,377 INFO     Creating index columns
2023-04-26 20:38:55,404 INFO     Building user affinity sparse matrix
2023-04-26 20:38:55,407 INFO     Calculating item co-occurrence
2023-04-26 20:38:55,479 INFO     Calculating item similarity
2023-04-26 20:38:55,479 INFO     Using jaccard based similarity
2023-04-26 20:38:55,719 INFO     Done training


In [38]:
top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

2023-04-26 15:30:24,496 INFO     Calculating recommendation scores
2023-04-26 15:30:24,545 INFO     Removing seen items


In [39]:
top_k

Unnamed: 0,userId,productId,prediction
0,A100UD67AHFODS,B00F0DD0I6,0.222598
1,A100UD67AHFODS,B00ENZRS76,0.181897
2,A100UD67AHFODS,B00DK2JQOQ,0.157219
3,A100UD67AHFODS,B00FSA8VQ2,0.148283
4,A100UD67AHFODS,B007OY5V68,0.148057
...,...,...,...
15285,AZOK5STV85FBJ,B003Y30334,0.000033
15286,AZOK5STV85FBJ,B005FYNSPK,0.000032
15287,AZOK5STV85FBJ,B000067RC4,0.000032
15288,AZOK5STV85FBJ,B001ID829O,0.000031


In [64]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='userId', 
              col_item='productId', 
              col_rating='rating', 
              col_prediction='prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [65]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
MAP:		 0.007229
NDCG:		 0.021981
Precision@K:	 0.016743
Recall@K:	 0.018617


In [122]:
train, test = python_stratified_split(data=late_df,ratio=0.70, col_user=header["col_user"], col_item=header["col_item"], seed=42)
#15s
#3s on simplest data

In [123]:
print(len(train))
len(test)

15030


6441

In [124]:
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SAR(
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [125]:
model.fit(train)

2023-04-26 21:12:39,674 INFO     Collecting user affinity matrix
2023-04-26 21:12:39,676 INFO     Calculating time-decayed affinities
2023-04-26 21:12:39,684 INFO     Creating index columns
2023-04-26 21:12:39,699 INFO     Building user affinity sparse matrix
2023-04-26 21:12:39,700 INFO     Calculating item co-occurrence
2023-04-26 21:12:39,717 INFO     Calculating item similarity
2023-04-26 21:12:39,718 INFO     Using jaccard based similarity
2023-04-26 21:12:39,739 INFO     Done training


In [126]:
top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

2023-04-26 21:12:40,789 INFO     Calculating recommendation scores
2023-04-26 21:12:40,805 INFO     Removing seen items


In [127]:
top_k

Unnamed: 0,userId,productId,prediction
0,37,1399,0.466198
1,37,1445,0.389908
2,37,1215,0.380470
3,37,1889,0.339529
4,37,1389,0.322925
...,...,...,...
14605,1532,1538,0.107445
14606,1532,1793,0.107262
14607,1532,1964,0.095966
14608,1532,1701,0.095761


In [128]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='userId', 
              col_item='productId', 
              col_rating='rating', 
              col_prediction='prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [129]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
MAP:		 0.010783
NDCG:		 0.023565
Precision@K:	 0.013826
Recall@K:	 0.026422
