In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate, print_perf, GridSearch
# from sklearn.metrics import roc_auc_score

random.seed(561)

In [3]:
# users = pd.read_csv('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-profile.tsv', header=None)
data = pd.read_csv('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
                   delimiter="\t", header=None,
                   names = ["userid","timestamp","artistid",
                            "artistname","trackid","trackname"])

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


In [4]:
# data = data.rename(columns={0:'userid', 1:'timestamp', 2:'artistid', 3:'artistname', 4:'trackid', 5:'trackname'})

data['timestamp'] = pd.to_datetime(data['timestamp'])

For the mini-project, we are using a smaller dataset. The following transformations will convert the dataset to use number of plays as our metric, grouped by user and artist.

To help with our data cleaning and setting up the matrices, we used [this website](https://jessesw.com/Rec-System/) to guide us.

In [5]:
data = data.groupby(['userid', 'artistname']).size().reset_index(name='plays')

In [None]:
users = list(np.sort(data.userid.unique())) # Get our unique users
artists = list(data.artistname.unique()) # Get our unique artists
quantity = list(data.plays) # All of our plays

rows = data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))
plays_sparse

In [None]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

### Dataset Reduction

The data set has a sparsity of 99.899%, which is very low even for matrices that are intended to be sparse. We experimented with removing rare artists and including only the top 100 artists, which had minimal effect on the sparsity.

In [None]:
rare_artists = data.query("plays < 6"). \
    groupby('artistname').size().reset_index(name='users_listening_to_artist'). \
    query("users_listening_to_artist < 10")
    
# top100_artists = data.groupby('artistname')['plays'].sum().reset_index(name='plays'). \
#    nlargest(100,'plays')

In [None]:
reduced_data = data[~data.artistname.isin(rare_artists['artistname'])]

print(reduced_data.shape, data.shape)

In [None]:
users = list(np.sort(reduced_data.userid.unique())) # Get our unique users
artists = list(reduced_data.artistname.unique()) # Get our unique artists
quantity = list(reduced_data.plays) # All of our plays

rows = reduced_data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = reduced_data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))

plays_sparse

In [None]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

While 99.898% sparsity is not great, it is an improvement over the previous matrix.

For the SVD algorithm, we use the Surprise package. We modify the plays parameter to be a binary indicator of whether someone has listened to an artist.

In [None]:
# usertotal = data.groupby('userid')['plays'].sum().reset_index(name="total_plays")
# normalized_data = pd.merge(reduced_data, usertotal)
# normalized_data['normalized_plays'] = normalized_data['plays']/normalized_data['total_plays']
# normalized_data.drop(['total_plays'], inplace=True, axis=1)
# normalized_data.loc[normalized_data['plays'] != 0, 'plays'] = 1

In [6]:
data.loc[data['plays'] != 0, 'plays'] = 1

In [None]:
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(data, reader)
model_data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE'])
print_perf(perf)

We used the GridSearch class to tune our hyperparameters. We initially included the 'epoch' parameter, which drastically increased the run time of the code below.

In [36]:
# param_grid = {'n_factors': np.arange(60,140,20),
#               'lr_all': np.arange(0.002,0.014, 0.004),'reg_all': np.arange(0.02,0.1, 0.02)}
param_grid = {'n_factors': [60],'lr_all': [.01],'reg_all': np.arange(0.02,0.14, 0.02)}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'])

model_data = Dataset.load_from_df(data[['userid', 'artistname', 'plays']], reader)
model_data.split(n_folds=3)

grid_search.evaluate(model_data)

results_df = pd.DataFrame.from_dict(grid_search.cv_results)

[{'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.02}, {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.040000000000000001}, {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.059999999999999998}, {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.080000000000000002}, {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.10000000000000001}, {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.12000000000000001}]
------------
Parameters combination 1 of 6
params:  {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.02}
------------
Mean RMSE: 0.0056
------------
------------
Parameters combination 2 of 6
params:  {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.040000000000000001}
------------
Mean RMSE: 0.0028
------------
------------
Parameters combination 3 of 6
params:  {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.059999999999999998}
------------
Mean RMSE: 0.0018
------------
------------
Parameters combination 4 of 6
params:  {'n_factors': 60, 'lr_all': 0.01, 'reg_all': 0.080000000000000002}
------------
Mean

After finding the set of hyperparameters with minimal RMSE (factors = 60, learning rate = 0.01 and regularization = 0.08), we iteratively varied one hyperparameter to see how RMSE changes while others are held constant.

For example, we held the learning rate and regularization at their respective optimal values of 0.01 and 0.08, while varying the number of factors from 20 to 120 in increments of 20.

We saved three files: reg.csv, learning.csv and factor.csv.

In [37]:
results_df.to_csv('reg.csv', sep='\t')

In [None]:
#output = pd.DataFrame(predictions)
#output = output.drop(['r_ui', 'details'], axis=1)

#combined = pd.merge(normalized_data,output,left_on=['userid','artistname'],right_on=['uid','iid'])
#combined = combined.drop(['uid', 'iid'], axis=1).set_index('userid')
#combined.head()

##fpr, tpr, thresholds = metrics.roc_curve(combined['normalized_plays'], combined['est'], pos_label=2)
##metrics.auc(fpr, tpr)

#roc_auc_score(combined['normalized_plays'],combined['est'])