In [14]:
import datetime
import random

import numpy as np
import pandas as pd

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy
from surprise import NormalPredictor, BaselineOnly

In [2]:
# Load the engineered listening-event features and normalise the schema.
# (For a quick smoke run, pass `nrows=...` to `read_csv`.)

# Hyphenated source column names -> compact names without hyphens.
column_renames = {
    'track-name': 'track',
    'artist-name': 'artist',
    'track-total-count': 'trackcount',
    'track-weekday-daytime-count': 'trackweekdaydaytimecount',
    'artist-total-count': 'artistcount',
    'artist-weekday-daytime-count': 'artistweekdaydaytimecount',
}

# Columns this notebook does not use.
unused_columns = ['timestamp', 'songlength', 'last-seen-song',
                  'last-seen-artist', 'gender', 'age']

data = (
    pd.read_csv('data_engineered_features500.csv')
    .rename(columns=column_renames)
    .drop(columns=unused_columns)
)

# Move the `skipped` column to the last position; the relative order of
# every other column is preserved.
leading_columns = [col for col in data.columns if col != 'skipped']
data = data[leading_columns + ['skipped']]

In [3]:
# Display the dtype of every column left after the rename/drop/reorder step.
data.dtypes

userid                       object
track                        object
artist                       object
weekday                       int64
hour                          int64
weekend                       int64
daytime                       int64
trackcount                    int64
trackweekdaydaytimecount      int64
month                         int64
quarter                       int64
artistcount                   int64
artistweekdaydaytimecount     int64
skipped                       int64
dtype: object

## Weekend Period

In [6]:
# Build one composite item id per (track, artist, weekend flag).
# Track and artist are fused into a single key so that a track is only ever
# paired with its own artist, never with every artist in the data set.
weekend_item_id = (
    data['track'] + '___' + data['artist']
    + '___' + data['weekend'].map(str)
)

# Average `skipped` per (user, item id): the rate at which each user skips
# a song, split by weekend status.
df = (
    data[['userid', 'skipped']]
    .assign(trackartist_weekend=weekend_item_id)
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

### Model

In [7]:
# Make the experiment reproducible under Restart & Run All.
# Per the Surprise FAQ ("How to have reproducible experiments"), both the
# stdlib `random` module and numpy's global RNG must be seeded before the
# folds are created and before any algorithm is fitted.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ratings are the per-user mean of `skipped`, declared here as lying in
# [0, 1] (assumes `skipped` is a 0/1 flag -- TODO confirm against the CSV).
reader = Reader(rating_scale=(0, 1))
algo = SVDpp()

# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(df[['userid','trackartist_weekend','skipped']], reader)

# Create the 3 cross-validation folds on the dataset object; the same
# `model_data` is passed to every later `evaluate` call in this notebook.
model_data.split(n_folds=3)

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

print_perf(perf)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.0876
MAE:  0.0268
------------
Fold 2
RMSE: 0.0852
MAE:  0.0265
------------
Fold 3
RMSE: 0.0868
MAE:  0.0264
------------
------------
Mean RMSE: 0.0865
Mean MAE : 0.0265
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0876  0.0852  0.0868  0.0865  
MAE     0.0268  0.0265  0.0264  0.0265  


## Benchmarks

In [16]:
# Benchmark 1: Surprise's NormalPredictor, scored with the same metrics and
# on the same `model_data` object as the SVD++ model above.
# NOTE(review): per the Surprise docs this predictor emits random ratings
# drawn from a normal distribution fitted to the training ratings -- it is a
# "no information" floor, not a tuned model.
normal_measures = ['RMSE', 'MAE']
normalAlgo = NormalPredictor()
normalPerf = evaluate(normalAlgo, model_data, measures=normal_measures)
print_perf(normalPerf)

Evaluating RMSE, MAE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 0.1135
MAE:  0.0566
------------
Fold 2
RMSE: 0.1128
MAE:  0.0569
------------
Fold 3
RMSE: 0.1140
MAE:  0.0567
------------
------------
Mean RMSE: 0.1135
Mean MAE : 0.0568
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.1135  0.1128  0.1140  0.1135  
MAE     0.0566  0.0569  0.0567  0.0568  


In [17]:
# Benchmark 2: BaselineOnly with its biases estimated by SGD, scored with
# the same metrics and on the same `model_data` object as the SVD++ model.
bsl_options = dict(method='sgd')
baseline_measures = ['RMSE', 'MAE']

baselineAlgo = BaselineOnly(bsl_options=bsl_options)
baselinePerf = evaluate(baselineAlgo, model_data, measures=baseline_measures)
print_perf(baselinePerf)

Evaluating RMSE, MAE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using sgd...
RMSE: 0.0865
MAE:  0.0262
------------
Fold 2
Estimating biases using sgd...
RMSE: 0.0847
MAE:  0.0258
------------
Fold 3
Estimating biases using sgd...
RMSE: 0.0875
MAE:  0.0261
------------
------------
Mean RMSE: 0.0863
Mean MAE : 0.0260
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0865  0.0847  0.0875  0.0863  
MAE     0.0262  0.0258  0.0261  0.0260  
