In [None]:
import pandas as pd
import numpy as np
import datetime
#from itertools import product
from scipy.sparse.linalg import svds

from surprise import Reader, Dataset, evaluate, print_perf, GridSearch
from surprise import SVD, SVDpp, Prediction, accuracy

In [None]:
# Source column -> hyphen-free alias, so every column is a valid identifier.
column_aliases = {
    'track-name': 'track',
    'artist-name': 'artist',
    'track-total-count': 'trackcount',
    'track-weekday-daytime-count': 'trackweekdaydaytimecount',
    'artist-total-count': 'artistcount',
    'artist-weekday-daytime-count': 'artistweekdaydaytimecount',
}

# Columns that are not used anywhere further down in this notebook.
unused_columns = ['timestamp', 'songlength', 'last-seen-song',
                  'last-seen-artist', 'gender', 'age']

# Read only the first 60,018 rows of the engineered-features export, then
# rename and prune in one chain (no in-place mutation).
# Tip: for a quick experiment on a handful of users, append e.g.
#   .query("userid == ['user_000001', 'user_000002']")
data = (
    pd.read_csv('data_engineered_features500.csv',
                nrows=60018,
                encoding="ISO-8859-1")
    .rename(columns=column_aliases)
    .drop(columns=unused_columns)
)

# Move the target column `skipped` to the far right; all other columns keep
# their relative order.
leading_columns = [name for name in data.columns if name != 'skipped']
data = data[leading_columns + ['skipped']]

In [None]:
# Display the dtype of every column left in `data` as a quick sanity check.
data.dtypes

## Weekend Period

In [None]:
# Work on just the columns needed for the weekend-aware model.
weekend_plays = data[['userid', 'track', 'artist', 'weekend', 'skipped']]

# Fuse track, artist and the weekend flag into a single item key. Keeping
# track and artist together means a track is only ever paired with its own
# artist, never with every artist in the data set.
weekend_item_key = (
    weekend_plays['track'] + '___' + weekend_plays['artist']
    + '___' + weekend_plays['weekend'].map(str)
)

# Mean of `skipped` per (user, item key): the user's skip rate for that
# track/artist under that weekend flag.
df = (
    weekend_plays[['userid', 'skipped']]
    .assign(trackartist_weekend=weekend_item_key)
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

In [None]:
# Preview the aggregated frame: one row per (userid, trackartist_weekend)
# holding the mean of `skipped`.
df.head()

### Model

In [None]:
# Ratings fed to Surprise are mean `skipped` values, declared here as lying
# on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))

# The original cell read from `dense_data`, which no cell in this notebook
# defines, so it raised NameError on a fresh kernel (Restart & Run All).
# The aggregated weekend frame `df` carries the three required columns.
# Surprise expects them as (user id, item id, rating), in that order.
model_data = Dataset.load_from_df(df[['userid', 'trackartist_weekend', 'skipped']], reader)
model_data.split(n_folds=3)

# Specify algorithm
algo = SVDpp()

# Evaluate the algorithm on the 3 folds and report RMSE / MAE.
# NOTE(review): evaluate / print_perf / Dataset.split are the legacy Surprise
# API; newer releases provide surprise.model_selection.cross_validate
# instead - confirm the installed version before upgrading.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

In [None]:
# Ratings are mean `skipped` values, declared to Surprise as a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
algo = SVDpp()

# Surprise reads the frame positionally: user id, item id, rating.
weekend_rating_columns = ['userid', 'trackartist_weekend', 'skipped']
model_data = Dataset.load_from_df(df[weekend_rating_columns], reader)

# Split into 3 folds for evaluation.
model_data.split(n_folds=3)

# Fit/evaluate on each fold and report both error measures.
perf = evaluate(algo, model_data, measures=['RMSE', 'MAE'])
print_perf(perf)

## Weekday Period

In [None]:
# Work on just the columns needed for the weekday-aware model.
weekday_plays = data[['userid', 'track', 'artist', 'weekday', 'skipped']]

# Fuse track, artist and the weekday value into a single item key. Keeping
# track and artist together means a track is only ever paired with its own
# artist, never with every artist in the data set.
weekday_item_key = (
    weekday_plays['track'] + '___' + weekday_plays['artist']
    + '___' + weekday_plays['weekday'].map(str)
)

# Mean of `skipped` per (user, item key): the user's skip rate for that
# track/artist on that weekday value.
df2 = (
    weekday_plays[['userid', 'skipped']]
    .assign(trackartist_weekday=weekday_item_key)
    .groupby(['userid', 'trackartist_weekday'], as_index=False)
    .mean()
)

In [None]:
# Reuses `reader` and `algo` from the weekend model cell above.
# Surprise reads the frame positionally: user id, item id, rating.
weekday_rating_columns = ['userid', 'trackartist_weekday', 'skipped']
model3_data = Dataset.load_from_df(df2[weekday_rating_columns], reader)

# Split into 3 folds for evaluation.
model3_data.split(n_folds=3)

# Fit/evaluate on each fold and report both error measures.
perf3 = evaluate(algo, model3_data, measures=['RMSE', 'MAE'])
print_perf(perf3)