# ezSASRec QuickStart

- Example data source: `ratings.csv` from The Movies Dataset on Kaggle (https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)

In [3]:
import pandas as pd 
import pickle
import multiprocessing
from sasrec.util import filter_k_core, SASRecDataSet, load_model
from sasrec.model import SASREC
from sasrec.sampler import WarpSampler

# Preprocessing

In [4]:
# Placeholder: replace with a real location before running.
# This value is passed to model.train(path=...) and to load_model(path, ...).
path = 'your path'

In [5]:
# Read the ratings, rename the columns to userID / itemID / time, order every
# user's interactions chronologically, then keep only the (userID, itemID) pairs.
df = (
    pd.read_csv('ratings.csv')
    .rename(columns={'userId': 'userID', 'movieId': 'itemID', 'timestamp': 'time'})
    .sort_values(by=['userID', 'time'])
    .drop(columns=['rating', 'time'])
    .reset_index(drop=True)
)

In [6]:
# preview the first (userID, itemID) pairs; rows were sorted by userID, then time
df.head()

Unnamed: 0,userID,itemID
0,1,2762
1,1,54503
2,1,112552
3,1,96821
4,1,5577


In [14]:
# filter data
# every user and every item appears at least 7 times (i.e. more than 6) in filtered_df

filtered_df = filter_k_core(df, 7)

Original: 270896 users and 45115 items
Final: 243377 users and 24068 items


In [15]:
# make maps (encoders): raw ID -> consecutive integer code starting at 1

user_set = set(filtered_df['userID'].unique())
item_set = set(filtered_df['itemID'].unique())

user_map = {raw_user: code for code, raw_user in enumerate(user_set, start=1)}
item_map = {raw_item: code for code, raw_item in enumerate(item_set, start=1)}

# bundled so both encoders can be saved together
maps = (user_map, item_map)

In [18]:
# Encode filtered_df: swap every raw ID for its integer code.
# A raw ID missing from its encoder raises KeyError.

for id_column, encoder in (("userID", user_map), ("itemID", item_map)):
    filtered_df[id_column] = filtered_df[id_column].apply(encoder.__getitem__)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6467346), Label(value='0 / 6467346…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6467346), Label(value='0 / 6467346…

In [19]:
# save data and maps

# encoded interactions: tab-separated rows, written without header or index
filtered_df.to_csv('sasrec_data.txt', sep="\t", header=False, index=False)

# encoders: the (user_map, item_map) tuple, pickled as one object
with open('maps.pkl', 'wb') as maps_file:
    pickle.dump(maps, maps_file)

# Load data and Train model

In [20]:
# load data

data = SASRecDataSet('sasrec_data.txt')
data.split() # train / validation / test split:
              #   - the last interaction of each user is used for test
              #   - the second-to-last interaction is used for validation
              #   - all earlier interactions are used for training

In [21]:
# build the model and the WarpSampler used for batch training

max_len = 100        # passed as seq_max_len (model) and maxlen (sampler)
hidden_units = 128   # shared by the embedding, attention and conv dimensions
batch_size = 2048    # passed to the sampler here and to model.train later

model_config = dict(
    item_num=data.itemnum,
    seq_max_len=max_len,
    num_blocks=2,
    embedding_dim=hidden_units,
    attention_dim=hidden_units,
    attention_num_heads=2,
    dropout_rate=0.2,
    conv_dims=[hidden_units, hidden_units],
    l2_reg=0.00001,
)
model = SASREC(**model_config)

sampler = WarpSampler(
    data.user_train,
    data.usernum,
    data.itemnum,
    batch_size=batch_size,
    maxlen=max_len,
    n_workers=multiprocessing.cpu_count(),  # one worker per CPU core reported by the OS
)

In [22]:
# train model

model.train(
    data,
    sampler,
    # optimisation
    num_epochs=3,
    batch_size=batch_size,
    lr=0.001,
    # evaluation (presumably: evaluate every `val_epoch` epochs on
    # `val_target_user_n` sampled users; confirm against SASREC.train)
    val_epoch=1,
    val_target_user_n=1000,
    target_item_n=-1,
    # checkpointing: the same `path` and experiment name are given to load_model later
    auto_save=True,
    path=path,
    exp_name='exp_example',
)

epoch 1 / 3 -----------------------------




Evaluating...




epoch: 1, test (NDCG@10: 0.04607630127474612, HR@10: 0.097)
best score model updated and saved
epoch 2 / 3 -----------------------------




Evaluating...




epoch: 2, test (NDCG@10: 0.060855185638025944, HR@10: 0.118)
best score model updated and saved
epoch 3 / 3 -----------------------------




Evaluating...




epoch: 3, test (NDCG@10: 0.07027207563856912, HR@10: 0.139)
best score model updated and saved


# Predict

In [35]:
# load trained model
# uses the same `path` and experiment name ('exp_example') that were passed to model.train

model = load_model(path,'exp_example')

## get score

In [36]:
# get user-item score

# decoder: encoded user code -> raw userID
inv_user_map = dict(zip(user_map.values(), user_map.keys()))

# sample target users; the sampled (encoded) users are read from model.val_users
model.sample_val_users(data, 100)
encoded_users = model.val_users

raw_users = [inv_user_map[code] for code in encoded_users]  # user_list containing raw (not-encoded) userID
raw_items = [1, 2, 3]                                       # item_list containing raw (not-encoded) itemID

# get scores
score = model.get_user_item_score(
    data,
    raw_users,
    raw_items,
    user_map,
    item_map,
    batch_size=10,
)

10


100%|██████████| 10/10 [00:00<00:00, 29.67batch/s]


In [37]:
# preview: one row per user (user_id column) and one score column per requested item
score.head()

Unnamed: 0,user_id,1,2,3
0,1525,5.596944,4.241653,3.804743
1,1756,4.535607,2.694459,0.85844
2,2408,5.883061,4.65596,4.691791
3,2462,5.084695,2.942075,2.773376
4,3341,5.532438,4.34815,4.07374


## get recommendation

In [38]:
# get top N recommendation

# decode the sampled users back to raw userIDs before asking for recommendations
reco_users = [inv_user_map[code] for code in encoded_users]

reco = model.recommend_item(
    data,
    user_map,
    reco_users,
    is_test=True,
    top_n=5,
)

100%|██████████| 100/100 [00:04<00:00, 21.10it/s]


In [39]:
# the returned dict maps each requested userID to its top-N recommendations:
# a list of (item id, score) tuples, highest score first

reco

{1525: [(456, 6.0680223),
  (355, 6.033769),
  (379, 5.9833336),
  (591, 5.9718275),
  (776, 5.8978705)],
 1756: [(7088, 5.735977),
  (15544, 5.5946136),
  (5904, 5.500249),
  (355, 5.492655),
  (22149, 5.4117346)],
 2408: [(456, 5.976555),
  (328, 5.8824606),
  (588, 5.8614006),
  (264, 5.7114534),
  (299, 5.649914)],
 2462: [(259, 6.3445344),
  (591, 6.2664876),
  (295, 6.105361),
  (355, 6.0698805),
  (1201, 5.8477645)],
 3341: [(110, 5.510764),
  (1, 5.4927354),
  (259, 5.4851904),
  (161, 5.467624),
  (208, 5.2486935)],
 3762: [(526, 5.9978333),
  (853, 5.88327),
  (1259, 5.781148),
  (2725, 5.6146665),
  (1186, 5.51476)],
 11812: [(355, 6.1667094),
  (591, 5.6400404),
  (259, 5.5790396),
  (1, 5.3540926),
  (7296, 5.2689633)],
 15061: [(2960, 4.82584),
  (317, 4.761106),
  (4181, 4.746534),
  (1701, 4.706372),
  (1186, 4.6909685)],
 17220: [(259, 6.432206),
  (591, 6.2935815),
  (355, 6.185576),
  (479, 6.00958),
  (456, 5.8726926)],
 20312: [(2535, 4.614324),
  (317, 4.39097),
 