In [1]:
import pandas as pd
import numpy as np
import datetime

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy
from surprise import NormalPredictor, BaselineOnly

from sklearn.metrics import roc_auc_score as AUC

In [2]:
# Load the training features and preview them. The preview below shows the
# columns userid, trackartist_weekend and skipped, which are the three columns
# fed to the recommender further down.
train = pd.read_csv('train_features500.csv')
train.head()

Unnamed: 0,userid,trackartist_weekend,skipped
0,user_000001,'84 Pontiac Dream___Boards Of Canada___0,0.833333
1,user_000001,'84 Pontiac Dream___Boards Of Canada___1,0.5
2,user_000001,(It'S Just) Talk___Pat Metheny Group___0,0.0
3,user_000001,(It'S Just) Talk___Pat Metheny Group___1,0.0
4,user_000001,...Short Wave Lies___The Black Dog___0,0.0


## Model

In [3]:
# Fit an SVD++ model on the full training set, using `skipped` as the rating
# and declaring its scale as [0, 1].
reader = Reader(rating_scale=(0, 1))
algo = SVDpp()

# Keep the Dataset and the Trainset built from it under different names so a
# single name never refers to two different kinds of object.
train_data = Dataset.load_from_df(train[['userid', 'trackartist_weekend', 'skipped']], reader)
trainset = train_data.build_full_trainset()

# Seed NumPy's global RNG before fitting so that Restart & Run All reproduces
# the same model and the same downstream metrics.
# NOTE(review): assumes Surprise draws its random factor initialisation from
# NumPy's global RNG in the installed version - confirm.
np.random.seed(0)
algo.train(trainset)

## Test

In [4]:
# Load the test features. The userid / trackartist_weekend / skipped columns
# of this frame are used by the prediction and benchmark cells below.
test = pd.read_csv('test_features500.csv')

In [5]:
# Predict a score for every (user, item) pair in the test features.
# The ids are selected by column name rather than by position, so an extra or
# reordered column in the CSV cannot silently be passed to the model as the
# user or item id.
pred_lol = []
for user_id, item_id in zip(test['userid'], test['trackartist_weekend']):
    pred = algo.predict(user_id, item_id)
    # Keep the two ids and the estimate; the remaining Prediction fields
    # (true rating, details) are not needed downstream.
    pred_lol.append([pred.uid, pred.iid, pred.est])

In [6]:
# Turn the collected [userid, item key, estimate] rows into a DataFrame and
# preview it.
prediction_columns = ['userid', 'trackartist_weekend', 'prediction']
pred_df = pd.DataFrame(pred_lol, columns=prediction_columns)
pred_df.head()

Unnamed: 0,userid,trackartist_weekend,prediction
0,user_000001,(It'S Just) Talk___Pat Metheny Group___1,0.014129
1,user_000001,1919 (Live_2009_4_15)___åæ¬é¾ä¸___0,0.03158
2,user_000001,1919 (Live_2009_4_15)___åæ¬é¾ä¸___1,0.03158
3,user_000001,33 000 Honeybees___Minilogue___0,0.037345
4,user_000001,33 000 Honeybees___Minilogue___1,0.021534


# Model Results

In [7]:
# Troublesome rows: item keys that contain a run of four or more underscores.
# The '___' separator is ambiguous in those keys, which causes problems when
# the key is mapped back to track / artist / weekend.
has_long_underscore_run = pred_df['trackartist_weekend'].str.contains("____")
testdf = pred_df[has_long_underscore_run]
testdf.index

Int64Index([ 25154,  25155,  34966,  53388,  53389,  65875, 179756, 179757,
            189582, 189583, 233198, 293083, 293084],
           dtype='int64')

In [36]:
# Row labels excluded from evaluation, written once and reused for both the
# drop and the display. The first 13 are the labels reported by the "____"
# filter in the previous cell; the last five were added by hand.
# NOTE(review): the criterion for the five extra labels is not recorded in
# this notebook - confirm before reusing on other data.
unsplittable_rows = [
    25154, 25155, 34966, 53388, 53389, 65875, 179756, 179757,
    189582, 189583, 233198, 293083, 293084,
    25395, 25396, 25404, 25405, 274870,
]
pred_df_copy = pred_df.drop(unsplittable_rows, axis=0)
# Show the rows that were excluded.
pred_df.loc[unsplittable_rows, :]

Unnamed: 0,userid,trackartist_weekend,prediction
25154,user_000075,The World Is Our ______This Will Destroy You___0,0.0
25155,user_000075,The World Is Our ______This Will Destroy You___1,0.040445
34966,user_000103,The World Is Our ______This Will Destroy You___0,0.0
53388,user_000156,The World Is Our ______This Will Destroy You___0,0.0
53389,user_000156,The World Is Our ______This Will Destroy You___1,0.042997
65875,user_000197,Jag Vet Inte Vem Jag Ãr____HÃ¥kan HellstrÃ¶m___0,0.0
179756,user_000554,The World Is Our ______This Will Destroy You___0,0.297087
179757,user_000554,The World Is Our ______This Will Destroy You___1,0.34726
189582,user_000590,The World Is Our ______This Will Destroy You___0,0.0
189583,user_000590,The World Is Our ______This Will Destroy You___1,0.046829


In [37]:
# Break the composite item key into its three '___'-separated parts (at most
# two splits), attach them as separate columns, and normalise dtypes.
key_parts = pred_df_copy['trackartist_weekend'].str.split('___', expand=True, n=2)
pred_df_copy['track-name'] = key_parts[0]
pred_df_copy['artist-name'] = key_parts[1]
pred_df_copy['weekend'] = key_parts[2].astype(int)
pred_df_copy['prediction'] = pred_df_copy['prediction'].astype('float')
# The composite key is no longer needed once it has been split.
pred_df_copy = pred_df_copy.drop(['trackartist_weekend'], axis=1)

In [12]:
# Load the ground-truth test set and keep only the merge keys (userid,
# track-name, artist-name, weekend), the timestamp and the observed `skipped`
# label.
# NOTE(review): presumably one row per listening event - confirm.
truth = pd.read_csv('testset500.csv')
truth = truth[['userid', 'track-name', 'artist-name', 'weekend', 'timestamp', 'skipped']]

In [13]:
# Collect the labels of ground-truth rows whose track name ends in an
# underscore or contains '___' (the 'h___0' and ' ___ ' alternatives are
# already covered by '___'), plus rows whose artist name ends in an underscore.
# Presumably these are the names that clash with the '___' key separator.
bad_track_name = truth['track-name'].str.contains(r"_$|___|h___0| ___ ")
bad_artist_name = truth['artist-name'].str.contains(r"_$")
drop_indices = list(truth[bad_track_name].index) + list(truth[bad_artist_name].index)

In [14]:
# Remove the flagged ground-truth rows.
# Rebinding (instead of inplace=True) avoids mutating a frame that was itself
# produced by a column selection, and errors='ignore' lets this cell be re-run
# after the labels have already been removed instead of raising.
truth = truth.drop(drop_indices, axis=0, errors='ignore')

In [43]:
# Attach the model's prediction to every ground-truth row. A left join keeps
# truth rows that have no matching prediction.
eval_df = pd.merge(truth, pred_df_copy,
                   on=['userid', 'track-name', 'artist-name', 'weekend'],
                   how='left')
# Fill only the prediction column. A blanket fillna(1) would also overwrite any
# missing `skipped` label or timestamp with 1 and silently corrupt the
# evaluation data.
# NOTE(review): unmatched rows are scored as a prediction of 1, as before -
# confirm this is the intended default.
eval_df = eval_df.fillna({'prediction': 1})

In [44]:
# ROC AUC of the predicted scores against the observed `skipped` labels.
observed = eval_df['skipped']
scores = eval_df['prediction']
val_auc = AUC(observed, scores)
val_auc

0.80765050212604428

## Benchmarks

In [45]:
# RMSE and MAE of the predictions against the observed `skipped` values,
# computed from a single residual series.
residual = eval_df['skipped'] - eval_df['prediction']
rmse = np.sqrt(np.mean(residual ** 2))
mae = np.absolute(residual).mean()
print("RMSE:", rmse)
print("MAE:", mae)

RMSE: 0.13595882552
MAE: 0.0338142359007


In [None]:
# Benchmark 1: NormalPredictor, scored with RMSE and MAE by `evaluate` on a
# Dataset built from the *test* features (the log below reports five folds).
# NOTE(review): these numbers come from cross-validating within `test`, whereas
# the SVD++ metrics above come from a model fit on `train` and scored against
# `truth` - confirm the two are comparable before drawing conclusions.
testset = Dataset.load_from_df(test[['userid','trackartist_weekend','skipped']], reader)

normalAlgo = NormalPredictor()
normalPerf = evaluate(normalAlgo, testset, measures=['RMSE', 'MAE'])
print_perf(normalPerf)

Evaluating RMSE, MAE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 0.1357
MAE:  0.0680
------------
Fold 2
RMSE: 0.1359
MAE:  0.0672
------------
Fold 3
RMSE: 0.1392
MAE:  0.0685
------------
Fold 4
RMSE: 0.1384
MAE:  0.0681
------------
Fold 5
RMSE: 0.1380
MAE:  0.0684
------------
------------
Mean RMSE: 0.1374
Mean MAE : 0.0681
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.1357  0.1359  0.1392  0.1384  0.1380  0.1374  
MAE     0.0680  0.0672  0.0685  0.0681  0.0684  0.0681  


In [None]:
# Benchmark 2: BaselineOnly with biases estimated by SGD, scored with RMSE and
# MAE by `evaluate` on the `testset` Dataset built in the previous cell.
# NOTE(review): like the NormalPredictor benchmark, this is scored within
# `testset` rather than against `truth` - confirm comparability with SVD++.
bsl_options = {'method': 'sgd'}
baselineAlgo = BaselineOnly(bsl_options=bsl_options)
baselinePerf = evaluate(baselineAlgo, testset, measures=['RMSE', 'MAE'])
print_perf(baselinePerf)

Evaluating RMSE, MAE of algorithm BaselineOnly.

------------
Fold 1
Estimating biases using sgd...
RMSE: 0.0979
MAE:  0.0276
------------
Fold 2
Estimating biases using sgd...
RMSE: 0.0979
MAE:  0.0272
------------
Fold 3
Estimating biases using sgd...
RMSE: 0.1015
MAE:  0.0282
------------
Fold 4
Estimating biases using sgd...
RMSE: 0.0999
MAE:  0.0281


In [None]:
# Save the merged ground truth and SVD++ predictions (the frame the AUC / RMSE
# / MAE above were computed from) without the row index.
eval_df.to_csv('SVDppResults.csv', index=False)