In [1]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als

In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib
%matplotlib inline

In [3]:
# Fraction of rows kept by the DataFrame.sample(frac=frac) calls applied to each
# parquet file loaded in this notebook.
frac = 0.625

In [4]:
# Load the training interactions and keep a random `frac` share of the rows
train_path = '/scratch/work/courses/DSGA1004-2021/MSD/cf_train.parquet'
train_df = pd.read_parquet(train_path).sample(frac=frac)
train_df

Unnamed: 0,user_id,count,track_id
46252073,0bff5ebcb73f1d13760c89a4e1585871b8fe0e49,7,TRUVWGR128E07957F0
10147269,a6cfe14ded0d9f1e20d03fb267203c92ee8a7a07,1,TRSSDHO128F9310EA5
15035589,5fba58e3b4b8e6f20f7b62833ff8405ad6ca08f0,6,TRXPDHG12903D017BA
17670643,91cf50e35e3fa924450d86ecae248b7cea113f0b,10,TRTLIGC128F147DF33
42387984,ceeeb16bb6c740fc5c2f7c2ae7be1d22f2d695a9,8,TRAWLYS12903CCFEFC
...,...,...,...
10686682,05b3f56275c55ece808b61ca6bd2951ef45d1c8c,1,TRQTVFZ128F92CC895
28447504,9785676e8b8e5a51a5d2591767782e1e5475bdf9,2,TRIVUMW128F425980E
37340087,e335f4c5eef7ad22a3d47a1bc6b4eb34400e0b5d,1,TRMAJAC128F92E23D7
2178336,292c79c84fdc73f82144445444da37e3880cefba,3,TRKHNVB12903C99C9A


In [5]:
# Load the validation interactions and keep a random `frac` share of the rows
val_path = '/scratch/work/courses/DSGA1004-2021/MSD/cf_validation.parquet'
val_df = pd.read_parquet(val_path).sample(frac=frac)
val_df

Unnamed: 0,user_id,count,track_id
134917,fe1eef3ed3f2815b8da00337fd86cbbe3585d430,11,TRCJAHJ128E07815B6
58969,6ecd41c00671f919782356d58384f98f7f71af90,5,TRZDSBY128F930B938
24940,2e7de24e238fff06cdc00b8119689cfdfa164a59,10,TRUCHHA128EF3435EA
32600,3d750ec3ebed91aa6ad948884de1a1388ccff498,1,TRPISTL128F4239F9B
3935,07be90466b79abe390160c5c9c815028282ba789,1,TRJGCVP128F14A48EA
...,...,...,...
84482,9f741882d5b8871f280f06b98e7ffaa353628314,1,TRXAHWB128F930B151
125560,ed9306de6fe5560efab9cfd5a6614c259b9ecc42,1,TRXQHCN128F42910AD
79625,96a8e0bc26a36207ba8fc3819f05c6ee20cbadf8,15,TRWHESJ128F93132B1
40649,4c82312f9bbcc5959d3488629f84f13511942f81,1,TRORLTF128F146DE1B


In [6]:
# Tidy the training frame: fresh 0..n-1 index, columns renamed to
# user / item / rating, and only those three columns kept (in that order)
cols = ['user', 'item', 'rating']
train = (
    train_df
    .reset_index(drop=True)
    .rename(columns={'user_id': 'user', 'track_id': 'item', 'count': 'rating'})[cols]
)
train.head()

Unnamed: 0,user,item,rating
0,0bff5ebcb73f1d13760c89a4e1585871b8fe0e49,TRUVWGR128E07957F0,7
1,a6cfe14ded0d9f1e20d03fb267203c92ee8a7a07,TRSSDHO128F9310EA5,1
2,5fba58e3b4b8e6f20f7b62833ff8405ad6ca08f0,TRXPDHG12903D017BA,6
3,91cf50e35e3fa924450d86ecae248b7cea113f0b,TRTLIGC128F147DF33,10
4,ceeeb16bb6c740fc5c2f7c2ae7be1d22f2d695a9,TRAWLYS12903CCFEFC,8


In [7]:
# Tidy the validation frame: fresh 0..n-1 index, columns renamed to
# user / item / rating, and only those three columns kept (in that order)
cols = ['user', 'item', 'rating']
val = (
    val_df
    .reset_index(drop=True)
    .rename(columns={'user_id': 'user', 'track_id': 'item', 'count': 'rating'})[cols]
)
val.head()

Unnamed: 0,user,item,rating
0,fe1eef3ed3f2815b8da00337fd86cbbe3585d430,TRCJAHJ128E07815B6,11
1,6ecd41c00671f919782356d58384f98f7f71af90,TRZDSBY128F930B938,5
2,2e7de24e238fff06cdc00b8119689cfdfa164a59,TRUCHHA128EF3435EA,10
3,3d750ec3ebed91aa6ad948884de1a1388ccff498,TRPISTL128F4239F9B,1
4,07be90466b79abe390160c5c9c815028282ba789,TRJGCVP128F14A48EA,1


In [8]:
# Build the `test` frame with user / item / rating columns.
# NOTE(review): this reads cf_validation.parquet -- the same file val_df is
# loaded from -- and draws its own unseeded random sample of it; confirm whether
# a separate test file was intended here.
cols = ['user', 'item', 'rating']
test_df = pd.read_parquet('/scratch/work/courses/DSGA1004-2021/MSD/cf_validation.parquet').sample(frac=frac)
test = (
    test_df
    .reset_index(drop=True)
    .rename(columns={'user_id': 'user', 'track_id': 'item', 'count': 'rating'})[cols]
)
test.head()

Unnamed: 0,user,item,rating
0,6172079671840349c197c7e99ab92e26b68ee94e,TRWVVMX128F1460888,1
1,ff11f4bd097a5dd909d68cf0dec6996ab5df0a80,TRDLHZL128E078FC74,1
2,d5e9f7ee28017dae2318142d97accc9443b3f49a,TRPOPLZ12903CA44C2,1
3,988ee10c1b876b43803f4dde64f14efd789f5e62,TRABFDT12903CADD73,1
4,afdc48da07c041a2f8506caccae2f644baf1a559,TRAIYZW128F9330F7D,1


In [9]:
# Configure the implicit-feedback ALS matrix factorisation.
# Parameter naming: features = rank (latent factors), iterations = maxIter,
# reg = regParam
als_params = dict(features=200, iterations=20, reg=0.1)
ALS = als.ImplicitMF(**als_params)
ALS

<lenskit.algorithms.als.ImplicitMF at 0x15006079f9d0>

In [10]:
# Clone the configured algorithm and adapt the clone to the recommender
# interface in one step
model = Recommender.adapt(util.clone(ALS))

# Fit on the training set; the printed value is the time to fit the model,
# measured in seconds with time.time()
start = time.time()
model.fit(train)
end = time.time()
run_time = end - start
print(run_time)

# Recommend for every distinct user that appears in the validation set:
# top 500 items per user
unique_users = val['user'].unique()
recs = batch.recommend(algo=model, users=unique_users, n=500)

print(recs)

2084.872410297394
                       item     score  \
0        TRLGMFJ128F4217DBE  0.856160   
1        TRAEHHJ12903CF492F  0.804080   
2        TRONYHY128F92C9D11  0.796160   
3        TRGXQES128F42BA5EB  0.776158   
4        TRIXAZF128F421EE64  0.774327   
...                     ...       ...   
4985495  TREYHTK12903C9B625  0.100572   
4985496  TRCYOEL12903CC3B4A  0.100524   
4985497  TRICQHX128F93269FC  0.100520   
4985498  TRCLAMX128F147BC1D  0.100310   
4985499  TRDTDIT128F427ED15  0.100297   

                                             user  rank  
0        fe1eef3ed3f2815b8da00337fd86cbbe3585d430     1  
1        fe1eef3ed3f2815b8da00337fd86cbbe3585d430     2  
2        fe1eef3ed3f2815b8da00337fd86cbbe3585d430     3  
3        fe1eef3ed3f2815b8da00337fd86cbbe3585d430     4  
4        fe1eef3ed3f2815b8da00337fd86cbbe3585d430     5  
...                                           ...   ...  
4985495  f872419318dcf04e4b6ea3a6e60c6b5662822ecb   496  
4985496  f872419318dcf04e

In [11]:
# calculate precision
#rla = topn.RecListAnalysis()
#rla.add_metric(topn.precision)
#results = rla.compute(recs, val)
#results.head(50)

In [None]:
def _resolve_truth(use_val, truth):
    """Return the ground-truth frame to score against.

    An explicitly supplied ``truth`` frame wins. Otherwise the notebook-level
    ``val`` frame is used when ``use_val`` is true and ``test`` when it is not.
    """
    if truth is not None:
        return truth
    # The public functions below take a boolean parameter that is itself named
    # ``val``; inside them that name hides the module-level validation frame,
    # so the frame has to be fetched through globals() instead.
    return globals()['val'] if use_val else test


def _mean_precision_at_k(hits):
    """Mean of precision@k over k = 1..len(hits) for a boolean hit array."""
    n = len(hits)
    if n == 0:
        return 0.0
    # cumsum()[i] is the number of hits among the first i + 1 recommendations,
    # so dividing by 1..n gives precision@1 .. precision@n.
    precision_at_k = np.cumsum(hits) / np.arange(1, n + 1)
    return float(precision_at_k.sum() / n)


def ap_user(group, val=False, truth=None):
    """Score one user's recommendation list.

    The score is the mean of precision@k over every position k of the list
    (note: this averages over all positions, not only over the positions that
    hold a relevant item).

    Parameters
    ----------
    group : DataFrame
        Recommendations for a single user with at least ``user`` and ``item``
        columns. Rows are taken in the order given -- presumably rank order;
        verify against the caller.
    val : bool, default False
        Ignored when ``truth`` is given. Otherwise True scores against the
        notebook-level ``val`` frame and False against ``test``.
    truth : DataFrame, optional
        Ground truth with ``user``, ``item`` and ``rating`` columns.

    Returns
    -------
    float
        The score, or 0.0 for an empty ``group``.
    """
    if len(group) == 0:
        return 0.0

    truth = _resolve_truth(val, truth)
    user_truth = truth[truth['user'] == group['user'].iloc[0]]

    # A recommendation is correct when the user has a non-null rating for that
    # item in the ground truth. Membership testing (rather than a merge) keeps
    # one row per recommendation even if the truth repeats a (user, item) pair.
    relevant_items = user_truth.loc[user_truth['rating'].notna(), 'item']
    hits = group['item'].isin(relevant_items).to_numpy()

    return _mean_precision_at_k(hits)


def average_precision(df, val=True, truth=None):
    """Compute the per-user score of ``ap_user`` for every user in ``df``.

    Parameters
    ----------
    df : DataFrame
        Recommendations with ``user`` and ``item`` columns.
    val : bool, default True
        Forwarded to ``ap_user``: True scores against ``val``, False against
        ``test``. Ignored when ``truth`` is given.
    truth : DataFrame, optional
        Ground truth with ``user``, ``item`` and ``rating`` columns.

    Returns
    -------
    Series
        One float per user, indexed by ``user``.
    """
    truth = _resolve_truth(val, truth)
    # Iterating the groupby yields complete sub-frames (grouping column
    # included), which ap_user needs in order to read the user id.
    scores = {
        user: ap_user(group, val=val, truth=truth)
        for user, group in df.groupby('user')
    }
    return pd.Series(scores, dtype='float64').rename_axis('user')
    
# Score the recommendations in `recs` (one value per user) and display the mean
# of those per-user values.
# NOTE(review): `recs` was generated for the users of `val`, yet val=False is
# passed here -- confirm which ground-truth frame is meant to be used.
res = average_precision(recs, val=False)
res.mean()