In [1]:
import pandas as pd
import numpy as np
import datetime

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy
from surprise import NormalPredictor, BaselineOnly

from sklearn.metrics import roc_auc_score as AUC

In [2]:
# Read the training table from disk and preview its first rows
# (columns as shown in the cell output: userid, trackartist_weekend, skipped).
train = pd.read_csv(filepath_or_buffer='train_features500.csv')
train.head(n=5)

Unnamed: 0,userid,trackartist_weekend,skipped
0,user_000001,'84 Pontiac Dream___Boards Of Canada___0,0.833333
1,user_000001,'84 Pontiac Dream___Boards Of Canada___1,0.5
2,user_000001,(It'S Just) Talk___Pat Metheny Group___0,0.0
3,user_000001,(It'S Just) Talk___Pat Metheny Group___1,0.0
4,user_000001,...Short Wave Lies___The Black Dog___0,0.0


## Model

In [4]:
# `skipped` is used as the rating and is declared to Surprise as lying in [0, 1].
reader = Reader(rating_scale=(0, 1))

# Wrap the (user, item, rating) columns in a Surprise Dataset, then turn the
# whole thing into a single trainset (nothing is held out here).
rating_columns = ['userid', 'trackartist_weekend', 'skipped']
train_data = Dataset.load_from_df(train[rating_columns], reader)
trainset = train_data.build_full_trainset()

# Fit SVD++ with its default hyper-parameters on the full trainset.
algo = SVDpp()
algo.train(trainset)

## Test

In [5]:
# Read the held-out feature table that the fitted model will be scored on.
test = pd.read_csv(filepath_or_buffer='test_features500.csv')

In [None]:
# Score every (user, item) pair of the test set with the fitted model.
#
# The ids are selected by column name rather than by position: the training
# CSV carries a leading "Unnamed: 0" index column, and if the test CSV has the
# same layout, positional access (row[0], row[1]) would pass the row number as
# the user and the user id as the item.
# NOTE(review): confirm the test CSV layout; name-based access is correct
# either way.
predictions = [
    algo.predict(uid, iid)
    for uid, iid in zip(test['userid'], test['trackartist_weekend'])
]

# Build the frame once from all records instead of appending row by row with
# .loc, which re-allocates on every insert. `est` is the model's estimated
# rating (the 4th field of Surprise's Prediction tuple).
pred_df = pd.DataFrame(
    [(pred.uid, pred.iid, pred.est) for pred in predictions],
    columns=['userid', 'trackartist_weekend', 'prediction'],
)

In [None]:
pred_df.head()

## Model Results

In [None]:
# Split the composite item key back into its parts. The key has the form
# "<track>___<artist>___<weekend>", so split from the right and at most twice:
# a track name that itself contains "___" then stays in one piece instead of
# producing a fourth column and breaking the three-column assignment.
pred_df[['track-name', 'artist-name', 'weekend']] = (
    pred_df['trackartist_weekend'].str.rsplit('___', n=2, expand=True)
)
pred_df = pred_df.drop(columns=['trackartist_weekend'])
pred_df['weekend'] = pred_df['weekend'].astype(int)
# Keep the estimate as a float. Casting to int truncates every value in
# [0, 1) to 0, which would make any later threshold on the score meaningless.
pred_df['prediction'] = pred_df['prediction'].astype(float)

# Ground-truth rows, reduced to the join keys plus the observed label.
truth = pd.read_csv('testset500.csv')
truth = truth[['userid', 'track-name', 'artist-name', 'weekend', 'skipped']]

# Left join: every ground-truth row is kept; rows without a matching
# prediction get NaN in `prediction`.
eval_df = pd.merge(truth, pred_df,
                   on=['userid', 'track-name', 'artist-name', 'weekend'],
                   how='left')

In [None]:
# Imputation: ground-truth rows with no matching prediction come out of the
# left merge as NaN; treat them as "not skipped" (0). This is the same outcome
# the threshold below produced implicitly, since NaN > 0.6 is False.
eval_df['prediction'] = eval_df['prediction'].fillna(0)

# Binarise the score at 0.6.
eval_df['prediction'] = np.where(eval_df['prediction'] > 0.6, 1, 0)

# Both columns must come from `eval_df`: `truth` has no `prediction` column,
# so reading it from there raises a KeyError.
# NOTE(review): AUC on hard 0/1 labels discards the ranking information in
# the raw scores; consider scoring the un-thresholded estimates instead.
# Assumes `skipped` in the ground-truth file is binary - TODO confirm.
val_auc = AUC(eval_df['skipped'], eval_df['prediction'])
val_auc

## Benchmarks

In [None]:
## RMSE and MAE for pred, not truth

In [None]:
# Wrap the test table's (user, item, rating) columns in a Surprise Dataset,
# reusing the reader defined for the training data.
benchmark_columns = ['userid', 'trackartist_weekend', 'skipped']
testset = Dataset.load_from_df(test[benchmark_columns], reader)

# Benchmark 1: NormalPredictor, evaluated on RMSE and MAE.
error_measures = ['RMSE', 'MAE']
normalAlgo = NormalPredictor()
normalPerf = evaluate(normalAlgo, testset, measures=error_measures)
print_perf(normalPerf)

In [None]:
# Benchmark 2: BaselineOnly with its baselines fitted by SGD, evaluated on the
# same dataset and error measures as the NormalPredictor benchmark.
bsl_options = dict(method='sgd')
baselineAlgo = BaselineOnly(bsl_options=bsl_options)
error_measures = ['RMSE', 'MAE']
baselinePerf = evaluate(baselineAlgo, testset, measures=error_measures)
print_perf(baselinePerf)