In [1]:
import pandas as pd
import numpy as np
import datetime
from itertools import product
from scipy.sparse.linalg import svds

#from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
#from surprise import SVD, SVDpp, Prediction, accuracy
#from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('data_engineered_features500.csv', nrows=60018)
#data = data.query("userid == ['user_000001','user_000002','user_000003','user_000004','user_000005']")

data.rename(columns={'track-name':'track',
                     'artist-name':'artist',
                     'track-total-count':'trackcount',
                     'track-weekday-daytime-count':'trackweekdaydaytimecount',
                     'artist-total-count':'artistcount',
                     'artist-weekday-daytime-count':'artistweekdaydaytimecount'}, inplace=True)

data.drop(['timestamp', 'songlength', 'last-seen-song', 'last-seen-artist', 'gender', 'age'], axis=1, inplace=True)
data = data[[c for c in data if c not in ['skipped']] + ['skipped']]

In [3]:
data.dtypes

60018


userid                       object
track                        object
artist                       object
weekday                       int64
hour                          int64
weekend                       int64
daytime                       int64
trackcount                    int64
trackweekdaydaytimecount      int64
month                         int64
quarter                       int64
artistcount                   int64
artistweekdaydaytimecount     int64
skipped                       int64
dtype: object

In [10]:
df = data[['userid', 'track', 'artist', 'weekend', 'skipped']].copy()

# merge track and artist to one column, so that
# when doing combinations, we don't have every track with every artist
df['trackartist'] = df['track']+'___'+df['artist']
df['trackartist_weekend'] = df['trackartist']+'___'+df['weekend'].map(str)
df.drop(['track', 'artist', 'weekend', 'trackartist'], axis=1, inplace=True)

# find rate at which users skip songs by weekend status
df = df.groupby(['userid', 'trackartist_weekend'], as_index=False).mean()

## Imputation

Create every combination of user, trackartist, and period, then impute missing values.

In [11]:
# create combinations
# need to remove duplicates to have only distinct combinations
temp = pd.DataFrame(list(product(df['userid'].drop_duplicates(),
                                 df['trackartist_weekend'].drop_duplicates())),
                   columns = ['userid', 'trackartist_weekend'])
len(temp)

23860

In [12]:
# create dense matrix with all combinations
dense_data = temp.merge(df, how='left',
                      on=['userid', 'trackartist_weekend'])

# create indicator variable to signify whether a value was imputed
dense_data['missing'] = dense_data['skipped'].isnull().astype('int')

# impute missing values
# potentially calculate artist-level probability first,
# then impute that
dense_data = dense_data.fillna(1)

# delete temp dataframe to clear space
del temp

In [13]:
dense_data.describe()

Unnamed: 0,skipped,missing
count,23860.0,23860.0
mean,0.799798,0.795935
std,0.396817,0.403025
min,0.0,0.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0


# Model

In [16]:
from surprise import Reader, Dataset, evaluate, print_perf
from surprise import SVD, SVDpp, Prediction, accuracy


reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(dense_data[['userid','trackartist_weekend','skipped']], reader)
model_data.split(n_folds=3)

# Specify algorithm
algo = SVDpp()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

# predictions = predict(data['userid'], data['artistname'], data['plays'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.4254
MAE:  0.3487
------------
Fold 2
RMSE: 0.4297
MAE:  0.3510
------------
Fold 3
RMSE: 0.4342
MAE:  0.3415
------------
------------
Mean RMSE: 0.4297
Mean MAE : 0.3471
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.4254  0.4297  0.4342  0.4297  
MAE     0.3487  0.3510  0.3415  0.3471  
