In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate, print_perf

random.seed(561)

In [4]:
users = pd.read_table('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-profile.tsv', header=0)
data = pd.read_table('~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
                     header=-1,
                     #nrows=20000000,
                     error_bad_lines=False)

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


In [5]:
data = data.rename(columns={0:'userid', 1:'timestamp', 2:'artistid', 3:'artistname', 4:'trackid', 5:'trackname'})

data['timestamp'] = pd.to_datetime(data['timestamp'])

For the mini-project, we are using a smaller dataset. The following transformations will convert the dataset to use number of plays as our metric, grouped by user and artist.

To help with our data cleaning and setting up the matrices, we used [this website](https://jessesw.com/Rec-System/) to guide us.

In [6]:
data = data.groupby(['userid', 'artistname']).size().reset_index(name='plays')

In [7]:
users = list(np.sort(data.userid.unique())) # Get our unique users
artists = list(data.artistname.unique()) # Get our unique artists
quantity = list(data.plays) # All of our plays

rows = data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))
plays_sparse

<992x897419 sparse matrix of type '<class 'numpy.int64'>'
	with 897419 stored elements in Compressed Sparse Row format>

In [8]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

99.8991935483871

## Dataset Reduction

The sparsity of 99.899% is extremely sparse, even for matrices that are intended to be sparse. We experimented with removing all rare artists, which had minimal effect on the sparsity. Instead, we will include only the top 100 artists, which has some improvement on the sparsity.

In [9]:
rare_artists = data.query("plays < 6"). \
    groupby('artistname').size().reset_index(name='users_listening_to_artist'). \
    query("users_listening_to_artist < 10")
    
top100_artists = data.groupby('artistname')['plays'].sum().reset_index(name='plays'). \
    nlargest(100,'plays')

In [10]:
reduced_data = data[~data.artistname.isin(rare_artists['artistname'])]

print(reduced_data.shape, data.shape)

(596817, 3) (897419, 3)


In [11]:
users = list(np.sort(reduced_data.userid.unique())) # Get our unique users
artists = list(reduced_data.artistname.unique()) # Get our unique artists
quantity = list(reduced_data.plays) # All of our plays

rows = reduced_data.userid.astype('category', categories = users).cat.codes 
# Get the associated row indices
cols = reduced_data.artistname.astype('category', categories = artists).cat.codes 
# Get the associated column indices
plays_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(users), len(quantity)))

plays_sparse

<992x596817 sparse matrix of type '<class 'numpy.int64'>'
	with 596817 stored elements in Compressed Sparse Row format>

In [12]:
# Sparsity of the matrix
matrix_size = plays_sparse.shape[0]*plays_sparse.shape[1] # Number of possible interactions in the matrix
num_plays = len(plays_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_plays/matrix_size))
sparsity

99.8991935483871

While 99.898% sparsity is not great, it is an improvement over the previous matrix, and so we will use this to set up our brute force model.

For SVD, we use the `surprise` package.

In [13]:
usertotal = reduced_data.groupby('userid')['plays'].sum().reset_index(name="total_plays")
normalized_data = pd.merge(reduced_data, usertotal)
normalized_data['normalized_plays'] = normalized_data['plays']/normalized_data['total_plays']
normalized_data.drop(['total_plays'], inplace=True, axis=1)

normalized_data.head()

Unnamed: 0,userid,artistname,plays,normalized_plays
0,user_000001,4 Wings,2,0.000153
1,user_000001,4Hero,146,0.01116
2,user_000001,A New Funky Generation,2,0.000153
3,user_000001,A Reminiscent Drive,1,7.6e-05
4,user_000001,A Taste Of Honey,5,0.000382


In [15]:
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(normalized_data[['userid', 'artistname', 'normalized_plays']], reader)

model_data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.0202
MAE:  0.0093
------------
Fold 2
RMSE: 0.0201
MAE:  0.0092
------------
Fold 3
RMSE: 0.0202
MAE:  0.0092
------------
------------
Mean RMSE: 0.0202
Mean MAE : 0.0092
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0202  0.0201  0.0202  0.0202  
MAE     0.0093  0.0092  0.0092  0.0092  


In [None]:
# THEORY: is the model just guessing very low values? Adjust the penalty to remove this possibility