In [1]:
import pandas as pd
import numpy as np
import datetime

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy

In [2]:
# Load the engineered-features file, normalise the hyphenated column names to
# plain identifiers, and discard the columns this notebook does not use.
data = (
    pd.read_csv('data_engineered_features500.csv')
    .rename(columns={
        'track-name': 'track',
        'artist-name': 'artist',
        'track-total-count': 'trackcount',
        'track-weekday-daytime-count': 'trackweekdaydaytimecount',
        'artist-total-count': 'artistcount',
        'artist-weekday-daytime-count': 'artistweekdaydaytimecount',
    })
    .drop(['timestamp', 'songlength', 'last-seen-song',
           'last-seen-artist', 'gender', 'age'], axis=1)
)

# Reorder so that the target column `skipped` is the last column.
data = data[[name for name in data.columns if name != 'skipped'] + ['skipped']]

In [3]:
# Display the dtype of every column currently in `data`.
data.dtypes

userid                       object
track                        object
artist                       object
weekday                       int64
hour                          int64
weekend                       int64
daytime                       int64
trackcount                    int64
trackweekdaydaytimecount      int64
month                         int64
quarter                       int64
artistcount                   int64
artistweekdaydaytimecount     int64
skipped                       int64
dtype: object

## Weekend Period

In [6]:
# Build one row per (user, item) holding the mean of `skipped`.
# An item is the string "<track>___<artist>___<weekend>": track and artist are
# fused into a single key so that only pairings that actually occur in the data
# become items, and the weekend value is appended to make the item
# period-specific.
df = (
    data[['userid', 'skipped']]
    .assign(
        trackartist_weekend=(
            data['track'] + '___' + data['artist']
            + '___' + data['weekend'].map(str)
        )
    )
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

### Model

In [7]:
# Ratings are declared to the reader as lying on a 0..1 scale.
reader = Reader(rating_scale=(0, 1))
algo = SVDpp()

# Dataset.load_from_df expects the columns in (user id, item id, rating) order.
model_data = Dataset.load_from_df(
    df[['userid', 'trackartist_weekend', 'skipped']],
    reader,
)
model_data.split(n_folds=3)

# Cross-validate SVD++ on the 3 folds and report RMSE / MAE per fold.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.0876
MAE:  0.0268
------------
Fold 2
RMSE: 0.0852
MAE:  0.0265
------------
Fold 3
RMSE: 0.0868
MAE:  0.0264
------------
------------
Mean RMSE: 0.0865
Mean MAE : 0.0265
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.0876  0.0852  0.0868  0.0865  
MAE     0.0268  0.0265  0.0264  0.0265  


## Weekday Period

In [10]:
# Drop the weekend-period objects before building the weekday-period frame.
del df, model_data

# Build one row per (user, item) holding the mean of `skipped`.
# An item is the string "<track>___<artist>___<weekday>": track and artist are
# fused into a single key so that only pairings that actually occur in the data
# become items, and the weekday value is appended to make the item
# period-specific.
df2 = (
    data[['userid', 'skipped']]
    .assign(
        trackartist_weekday=(
            data['track'] + '___' + data['artist']
            + '___' + data['weekday'].map(str)
        )
    )
    .groupby(['userid', 'trackartist_weekday'], as_index=False)
    .mean()
)

In [None]:
# Dataset.load_from_df expects the columns in (user id, item id, rating) order.
model2_data = Dataset.load_from_df(
    df2[['userid', 'trackartist_weekday', 'skipped']],
    reader,
)
model2_data.split(n_folds=3)

# Cross-validate the same algorithm on the weekday-period items and report
# RMSE / MAE per fold.
perf2 = evaluate(algo, model2_data, measures=['RMSE', 'MAE'])
print_perf(perf2)

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1

## Quarter Period

In [None]:
# Drop the weekday-period objects before building the quarter-period frame.
# Only `df2` and `model2_data` exist at this point: the weekend-period `df` and
# `model_data` were already deleted at the top of the weekday section, and the
# weekday dataset is named `model2_data` (not `model_data2`).
del df2, model2_data

df = data[['userid', 'track', 'artist', 'quarter', 'skipped']].copy()

# merge track and artist to one column, so that
# when doing combinations, we don't have every track with every artist
df['trackartist'] = df['track']+'___'+df['artist']
# append the quarter so that each item is specific to one quarter
df['trackartist_quarter'] = df['trackartist']+'___'+df['quarter'].map(str)
df.drop(['track', 'artist', 'quarter', 'trackartist'], axis=1, inplace=True)

# find rate at which users skip songs by quarter
# (the result is stored in `df`, which is what the model cell below reads)
df = df.groupby(['userid', 'trackartist_quarter'], as_index=False).mean()

In [None]:
# The columns must correspond to user id, item id and ratings (in that order).
model_data = Dataset.load_from_df(df[['userid','trackartist_weekend','skipped']], reader)
model_data.split(n_folds=3)

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])

# predictions = predict(data['userid'], data['artistname'], data['plays'])
print_perf(perf)