In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate, print_perf, GridSearch
#from sklearn.metrics import roc_auc_score

random.seed(561)

In [2]:
#users = pd.read_csv('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-profile.tsv', header=None)
data = pd.read_csv('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
                   delimiter="\t", header=None,
                   names = ["userid","timestamp","artistid",
                            "artistname","trackid","trackname"])

In [3]:
#data = data.rename(columns={0:'userid', 1:'timestamp', 2:'artistid', 3:'artistname', 4:'trackid', 5:'trackname'})

data['timestamp'] = pd.to_datetime(data['timestamp'])

In [4]:
data.head()

Unnamed: 0,userid,timestamp,artistid,artistname,trackid,trackname
0,user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


For the mini-project, we are using a smaller dataset. The following transformations will convert the dataset to use number of plays as our metric, grouped by user and artist.

To help with our data cleaning and setting up the matrices, we used [this website](https://jessesw.com/Rec-System/) to guide us.

In [None]:
data = data.groupby(['userid', 'artistname']).size().reset_index(name='plays')

In [None]:
users = list(np.sort(data.userid.unique())) # Get our unique users
artists = list(data.artistname.unique()) # Get our unique artists
quantity = list(data.plays) # All of our plays

rows = data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))
plays_sparse

In [None]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

## Dataset Reduction

The sparsity of 99.899% is extremely sparse, even for matrices that are intended to be sparse. We experimented with removing all rare artists, which had minimal effect on the sparsity. Instead, we will include only the top 100 artists, which has some improvement on the sparsity.

In [None]:
rare_artists = data.query("plays < 6"). \
    groupby('artistname').size().reset_index(name='users_listening_to_artist'). \
    query("users_listening_to_artist < 10")
    
#top100_artists = data.groupby('artistname')['plays'].sum().reset_index(name='plays'). \
#    nlargest(100,'plays')

In [None]:
reduced_data = data[~data.artistname.isin(rare_artists['artistname'])]

print(reduced_data.shape, data.shape)

In [None]:
users = list(np.sort(reduced_data.userid.unique())) # Get our unique users
artists = list(reduced_data.artistname.unique()) # Get our unique artists
quantity = list(reduced_data.plays) # All of our plays

rows = reduced_data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = reduced_data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))

plays_sparse

In [None]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

While 99.898% sparsity is not great, it is an improvement over the previous matrix, and so we will use this to set up our brute force model.

For SVD, we use the `surprise` package.

In [None]:
usertotal = data.groupby('userid')['plays'].sum().reset_index(name="total_plays")
normalized_data = pd.merge(reduced_data, usertotal)
normalized_data['normalized_plays'] = normalized_data['plays']/normalized_data['total_plays']
normalized_data.drop(['total_plays'], inplace=True, axis=1)

In [None]:
#normalized_data[(normalized_data.plays > 0)]['plays'] = 1
normalized_data.loc[normalized_data['plays'] != 0, 'plays'] = 1

In [None]:
normalized_data.head()

In [None]:
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(normalized_data[['userid', 'artistname', 'plays']], reader)

model_data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD(reg_all=.1)

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

print_perf(perf)

In [None]:
param_grid = {'n_epochs': np.arange(10,30, 1), 'lr_all': np.arange(0.002,0.014, 0.001),
              'reg_all': np.arange(0.02,0.6, 0.02)}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'MAE'])

model_data = Dataset.load_from_df(normalized_data[['userid', 'artistname', 'plays']], reader)
model_data.split(n_folds=3)

grid_search.evaluate(model_data)

results_df = pd.DataFrame.from_dict(grid_search.cv_results)
#results_df.to_csv(SVD_results, sep='\t')

In [None]:
results_df

In [None]:
#output = pd.DataFrame(predictions)
#output = output.drop(['r_ui', 'details'], axis=1)

#combined = pd.merge(normalized_data,output,left_on=['userid','artistname'],right_on=['uid','iid'])
#combined = combined.drop(['uid', 'iid'], axis=1).set_index('userid')
#combined.head()

##fpr, tpr, thresholds = metrics.roc_curve(combined['normalized_plays'], combined['est'], pos_label=2)
##metrics.auc(fpr, tpr)

#roc_auc_score(combined['normalized_plays'],combined['est'])