In [1]:
import datetime
import random
#from itertools import product

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy

In [2]:
# Hyphenated CSV headers -> short names without hyphens.
column_renames = {
    'track-name': 'track',
    'artist-name': 'artist',
    'track-total-count': 'trackcount',
    'track-weekday-daytime-count': 'trackweekdaydaytimecount',
    'artist-total-count': 'artistcount',
    'artist-weekday-daytime-count': 'artistweekdaydaytimecount',
}

# Columns discarded straight after loading.
dropped_columns = ['timestamp', 'songlength', 'last-seen-song', 'last-seen-artist', 'gender', 'age']

# Only the first 60018 rows of the engineered-features file are read.
data = (
    pd.read_csv('data_engineered_features500.csv', nrows=60018)
    .rename(columns=column_renames)
    .drop(dropped_columns, axis=1)
)

# Move the target column, 'skipped', to the last position.
leading_columns = [name for name in data.columns if name != 'skipped']
data = data[leading_columns + ['skipped']]

In [3]:
# Show the dtype of every column left after the rename/drop step above.
# NOTE(review): the saved output also contains a bare row count (60018) that this
# expression alone does not produce — presumably a print(len(data)) was removed; confirm.
data.dtypes

60018


userid                       object
track                        object
artist                       object
weekday                       int64
hour                          int64
weekend                       int64
daytime                       int64
trackcount                    int64
trackweekdaydaytimecount      int64
month                         int64
quarter                       int64
artistcount                   int64
artistweekdaydaytimecount     int64
skipped                       int64
dtype: object

## Weekend Period

In [10]:
# Per-user skip rate for every (track, artist, weekend-flag) item.
#
# Track and artist are fused into one string key so that later combinations
# never pair a track with an artist it does not belong to; the weekend flag is
# appended to that key so the same song is a separate item for each flag value.
df = (
    data[['userid', 'track', 'artist', 'weekend', 'skipped']]
    .assign(
        trackartist_weekend=lambda frame: (
            frame['track'] + '___' + frame['artist'] + '___' + frame['weekend'].map(str)
        )
    )
    .drop(['track', 'artist', 'weekend'], axis=1)
    # Mean of 'skipped' per (user, item key): the rate at which the user skipped it.
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

### Model

In [16]:
# Seed both the stdlib and NumPy generators before splitting and fitting, as the
# Surprise FAQ recommends for reproducible experiments. Re-seeding inside the
# cell makes its result independent of which cells were run before it.
random.seed(0)
np.random.seed(0)

# Ratings are declared to Surprise as lying in [0, 1].
reader = Reader(rating_scale=(0, 1))

# NOTE(review): `dense_data` is not defined in any cell visible in this notebook
# (execution counts jump from 10 to 16), so this cell raises NameError after
# Restart & Run All — restore the cell that builds it, or drop this cell.
# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(dense_data[['userid', 'trackartist_weekend', 'skipped']], reader)
model_data.split(n_folds=3)

# Specify algorithm
algo = SVDpp()

# 3-fold RMSE and MAE of the algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.4254
MAE:  0.3487
------------
Fold 2
RMSE: 0.4297
MAE:  0.3510
------------
Fold 3
RMSE: 0.4342
MAE:  0.3415
------------
------------
Mean RMSE: 0.4297
Mean MAE : 0.3471
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.4254  0.4297  0.4342  0.4297  
MAE     0.3487  0.3510  0.3415  0.3471  


In [17]:
# Seed both the stdlib and NumPy generators before splitting and fitting, as the
# Surprise FAQ recommends for reproducible experiments. Re-seeding inside the
# cell makes its result independent of which cells were run before it.
random.seed(0)
np.random.seed(0)

# Ratings are per-user skip rates, declared to Surprise as lying in [0, 1].
reader = Reader(rating_scale=(0, 1))
algo = SVDpp()

# Columns are passed in (user id, item id, rating) order.
model_data = Dataset.load_from_df(df[['userid', 'trackartist_weekend', 'skipped']], reader)
model_data.split(n_folds=3)

# 3-fold RMSE and MAE of the algorithm on the weekend-keyed skip rates.
# NOTE(review): evaluate / print_perf / Dataset.split belong to the older Surprise
# API (later superseded by surprise.model_selection.cross_validate) — confirm the
# Surprise version this notebook is pinned to.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.0742
MAE:  0.0304
------------
Fold 2
RMSE: 0.0634
MAE:  0.0283
------------
Fold 3
RMSE: 0.0786
MAE:  0.0310
------------
------------
Mean RMSE: 0.0721
Mean MAE : 0.0299
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0742  0.0634  0.0786  0.0721  
MAE     0.0304  0.0283  0.0310  0.0299  


## Weekday Period

In [21]:
# Per-user skip rate for every (track, artist, weekday) item.
#
# Track and artist are fused into one string key so that later combinations
# never pair a track with an artist it does not belong to; the weekday value is
# appended to that key so the same song is a separate item for each weekday.
df2 = (
    data[['userid', 'track', 'artist', 'weekday', 'skipped']]
    .assign(
        trackartist_weekday=lambda frame: (
            frame['track'] + '___' + frame['artist'] + '___' + frame['weekday'].map(str)
        )
    )
    .drop(['track', 'artist', 'weekday'], axis=1)
    # Mean of 'skipped' per (user, item key): the rate at which the user skipped it
    # on that weekday.
    .groupby(['userid', 'trackartist_weekday'], as_index=False)
    .mean()
)

In [22]:
# Seed both the stdlib and NumPy generators before splitting and fitting, as the
# Surprise FAQ recommends for reproducible experiments. Re-seeding inside the
# cell makes the weekday run reproducible regardless of which cells ran before it.
random.seed(0)
np.random.seed(0)

# `reader` and `algo` are not created in this cell; they are the objects left
# over from the earlier weekend model cell, so that cell must have been run first.
# Columns are passed in (user id, item id, rating) order.
model3_data = Dataset.load_from_df(df2[['userid', 'trackartist_weekday', 'skipped']], reader)
model3_data.split(n_folds=3)

# 3-fold RMSE and MAE of the algorithm on the weekday-keyed skip rates.
perf3 = evaluate(algo, model3_data, measures=['RMSE', 'MAE'])
print_perf(perf3)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.0914
MAE:  0.0334
------------
Fold 2
RMSE: 0.0958
MAE:  0.0344
------------
Fold 3
RMSE: 0.0948
MAE:  0.0307
------------
------------
Mean RMSE: 0.0940
Mean MAE : 0.0328
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0914  0.0958  0.0948  0.0940  
MAE     0.0334  0.0344  0.0307  0.0328  
