In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, BaselineOnly, Prediction, accuracy
from sklearn.metrics import roc_auc_score

# Seed both random number generators available to this notebook so repeated
# runs draw the same random numbers.
# NOTE(review): Surprise's SVD presumably draws its initial factors from
# NumPy's global generator, which random.seed() does not touch -- confirm
# against the installed Surprise version.
random.seed(561)
np.random.seed(561)

In [2]:
# Load the Last.fm 1K listening log (tab-separated, no header row) and collapse
# it to one row per (userid, artistname) pair, with `plays` holding the number
# of log lines for that pair.
# Only `userid` and `artistname` feed the aggregation, so `usecols` avoids
# materialising the four unused columns of this large file. The raw frame is
# never bound to a name, so it can be freed as soon as the counts exist.
# NOTE(review): the path below is specific to one machine -- consider moving it
# to a configurable data directory.
data = (
    pd.read_csv(
        '~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
        delimiter="\t",
        header=None,
        names=["userid", "timestamp", "artistid",
               "artistname", "trackid", "trackname"],
        usecols=["userid", "artistname"],
    )
    .groupby(['userid', 'artistname'])
    .size()
    .reset_index(name='plays')
)

In [20]:
# Total the `plays` column per artist across all users, then keep the 500
# artists with the largest totals (columns: artistname, plays).
topN_artists = (
    data.groupby('artistname')['plays']
        .sum()
        .reset_index(name='plays')
        .nlargest(500, 'plays')
)

In [4]:
# Binarise the signal: 1 means the user listened to the artist at least once.
data.loc[data['plays'] != 0, 'plays'] = 1
# Drop every artist that is not in the top-N list.
data = data[data.artistname.isin(topN_artists['artistname'])]

# Materialise every user-artist combination. Pivoting to wide form leaves NaN
# for pairs that never occurred; those are filled with 0 before melting back
# to long form.
data = data.pivot(index='userid', columns='artistname', values='plays').fillna(0).reset_index()
# `var_name` must be a scalar label here: the columns are a plain Index, and
# recent pandas releases raise ValueError when given a list.
data = data.melt(id_vars=['userid'], var_name='artistname')
data = data.rename(columns={'value': 'plays'})

print(len(data))
data.head()

4955000


Unnamed: 0,userid,artistname,plays
0,user_000001,!!!,0.0
1,user_000002,!!!,0.0
2,user_000003,!!!,0.0
3,user_000004,!!!,1.0
4,user_000005,!!!,0.0


In [21]:
# Restrict the long-form table to artists present in the current
# `topN_artists`, then report the row count and preview the first rows.
data = data.loc[data['artistname'].isin(topN_artists['artistname'])]

print(len(data))
data.head()

495500


Unnamed: 0,userid,artistname,plays
0,user_000001,!!!,0.0
1,user_000002,!!!,0.0
2,user_000003,!!!,0.0
3,user_000004,!!!,1.0
4,user_000005,!!!,0.0


In [22]:
# The rating scale matches the binary 0/1 values stored in `plays`.
reader = Reader(rating_scale=(0, 1))

# load_from_df requires the columns to be user id, item id and rating, in that
# order; select them explicitly so the order does not depend on how `data` was
# assembled.
model_data = Dataset.load_from_df(data[['userid', 'artistname', 'plays']], reader)
model_data.split(n_folds=3)

# Baseline predictor (not SVD): BaselineOnly with its biases estimated by ALS.
algo = BaselineOnly(bsl_options={'method': 'als'})

# Evaluate the baseline on the three folds and print the per-fold and mean
# RMSE / MAE.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using als...
RMSE: 0.3982
MAE:  0.3221
------------
Fold 2
Estimating biases using als...
RMSE: 0.3972
MAE:  0.3219
------------
Fold 3
Estimating biases using als...
RMSE: 0.4001
MAE:  0.3230
------------
------------
Mean RMSE: 0.3985
Mean MAE : 0.3223
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.3982  0.3972  0.4001  0.3985  
MAE     0.3221  0.3219  0.3230  0.3223  


In [23]:
# SVD model with hand-picked hyperparameters.
algo = SVD(
    n_factors=120,
    lr_all=0.01,
    reg_all=0.02,
)

# Score the model on the folds already held by `model_data`, using the same
# two error measures, then print the per-fold table.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.3570
MAE:  0.2718
------------
Fold 2
RMSE: 0.3562
MAE:  0.2714
------------
Fold 3
RMSE: 0.3589
MAE:  0.2729
------------
------------
Mean RMSE: 0.3574
Mean MAE : 0.2721
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.3570  0.3562  0.3589  0.3574  
MAE     0.2718  0.2714  0.2729  0.2721  


In [48]:
# Empty results table: one row per (artist count, metric) pair, with the
# baseline and SVD scores side by side.
data_size_results = pd.DataFrame(
    columns=[
        'DataSize',
        'Metric',
        'Baseline',
        'SVD',
    ]
)

In [50]:
# Mean RMSE / MAE for the baseline and SVD models at several top-N artist
# counts, transcribed by hand from separate runs of the evaluation cells. Only
# the 500-artist figures can be checked against the output shown in this
# notebook.
# The table is built in one step, so this cell is idempotent and the row order
# (by index label) no longer depends on what the kernel already held.
# NOTE(review): the 5000-artist rows are almost identical to the 4000-artist
# rows -- verify they were not copied from the wrong run.
recorded_rows = [
    # (index label, [DataSize, Metric, Baseline, SVD])
    (0, [4000, 'RMSE', .2546, .2355]),   # 4000 artists
    (1, [4000, 'MAE', .1366, .1199]),    # 4000 artists
    (2, [3000, 'RMSE', .3012, .2750]),   # 3000 artists
    (3, [3000, 'MAE', .1865, .1630]),    # 3000 artists
    (4, [2000, 'RMSE', .3297, .2979]),   # 2000 artists
    (5, [2000, 'MAE', .2221, .1915]),    # 2000 artists
    (6, [1000, 'RMSE', .3702, .3309]),   # 1000 artists
    (7, [1000, 'MAE', .2782, .2355]),    # 1000 artists
    (8, [500, 'RMSE', .3985, .3574]),    # 500 artists
    (9, [500, 'MAE', .3223, .2721]),     # 500 artists
    (10, [5000, 'RMSE', .2547, .2355]),  # 5000 artists
    (11, [5000, 'MAE', .1366, .1199]),   # 5000 artists
]

data_size_results = pd.DataFrame(
    [row for _, row in recorded_rows],
    index=[label for label, _ in recorded_rows],
    columns=['DataSize', 'Metric', 'Baseline', 'SVD'],
)

data_size_results

Unnamed: 0,DataSize,Metric,Baseline,SVD
0,4000,RMSE,0.2546,0.2355
1,4000,MAE,0.1366,0.1199
2,3000,RMSE,0.3012,0.275
3,3000,MAE,0.1865,0.163
4,2000,RMSE,0.3297,0.2979
5,2000,MAE,0.2221,0.1915
6,1000,RMSE,0.3702,0.3309
7,1000,MAE,0.2782,0.2355
8,500,RMSE,0.3985,0.3574
9,500,MAE,0.3223,0.2721


In [51]:
# Persist the results table as a tab-separated file.
data_size_results.to_csv(path_or_buf='../data/datasize.csv', sep='\t')