In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score as AUC
import warnings
warnings.filterwarnings("ignore")
# import tensorflow as tf

In [2]:
def data_split(data, validation_ratio = 0.15, test_ratio = 0.15):
    """
    Split rows into train, validation and test sets by per-user time order.

    Each row's timestamp is ranked within its user and divided by that user's
    row count, giving a relative position of at most 1 in the user's history.
    The earliest share of every user's rows goes to train, the next share to
    validation and the latest share to test.

    Approach based on https://stackoverflow.com/questions/42395258/

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'userid' and 'timestamp' columns. It is NOT modified.
    validation_ratio : float, default 0.15
        Share of each user's rows assigned to validation.
    test_ratio : float, default 0.15
        Share of each user's most recent rows assigned to test.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_data, validation_data, test_data). Each is an independent frame
        holding every column of `data` plus 'scaled_time_rank' as last column
        (or in its existing position if `data` already had such a column).
        Rows whose scaled rank is NaN (e.g. a missing timestamp) fail every
        comparison and therefore appear in none of the three sets.

    Raises
    ------
    ValueError
        If a ratio is negative or the two ratios sum to more than 1.
    """
    if validation_ratio < 0 or test_ratio < 0 or validation_ratio + test_ratio > 1:
        raise ValueError(
            "validation_ratio and test_ratio must be non-negative and sum to at most 1, "
            "got validation_ratio={!r}, test_ratio={!r}".format(validation_ratio, test_ratio)
        )

    train_cutoff = 1 - validation_ratio - test_ratio
    # One shared boundary for "end of validation" and "start of test". Computing
    # it twice in two different ways can differ by a float rounding step, which
    # would let a row fall between the two sets.
    validation_cutoff = 1 - test_ratio

    # Work on standalone Series so the caller's frame never gains helper columns.
    time_rank = data.groupby('userid')['timestamp'].rank()
    user_all_songs_count = data['userid'].map(data.groupby('userid').size())
    scaled_time_rank = time_rank / user_all_songs_count

    in_train = scaled_time_rank <= train_cutoff
    in_validation = (scaled_time_rank > train_cutoff) & (scaled_time_rank <= validation_cutoff)
    in_test = scaled_time_rank > validation_cutoff

    def _subset(mask):
        # .assign() returns a new frame, so callers can mutate the result freely
        # without touching `data` or triggering SettingWithCopyWarning.
        return data.loc[mask, :].assign(scaled_time_rank=scaled_time_rank[mask].values)

    return _subset(in_train), _subset(in_validation), _subset(in_test)
    

In [3]:
# Load the engineered feature table; the file is decoded as ISO-8859-1.
pandas_df = pd.read_csv(
    'data_engineered_features500.csv',
    encoding="ISO-8859-1",
)
pandas_df.head()

Unnamed: 0,userid,track-name,artist-name,timestamp,weekday,hour,weekend,daytime,track-total-count,track-weekday-daytime-count,songlength,last-seen-song,month,quarter,skipped,artist-total-count,artist-weekday-daytime-count,last-seen-artist,gender,age
0,user_000001,The Launching Of Big Face,Plaid & Bob Jaroc,2006-08-13 13:59:20,6,13,1,3,1,1,,,8,3,0,1,1,,m,
1,user_000001,Zn Zero,Plaid & Bob Jaroc,2006-08-13 14:03:29,6,14,1,3,1,1,0 days 00:04:09.000000000,,8,3,0,2,2,0.002882,m,
2,user_000001,The Return Of Super Barrio - End Credits,Plaid & Bob Jaroc,2006-08-13 14:10:43,6,14,1,3,1,1,0 days 00:07:14.000000000,,8,3,0,3,3,0.005023,m,
3,user_000001,Dayvan Cowboy,Boards Of Canada,2006-08-13 15:44:17,6,15,1,3,1,1,0 days 00:04:04.000000000,,8,3,0,1,1,,m,
4,user_000001,A Moment Of Clarity,Boards Of Canada,2006-08-13 16:46:52,6,16,1,3,1,1,0 days 00:00:40.000000000,,8,3,1,3,3,0.000463,m,


In [4]:
# Per-user chronological split: rows whose scaled time rank exceeds 0.85 go to
# the test set. With validation_ratio=0 the validation frame comes back empty.
train_data, validation_data, test_data = data_split(
    pandas_df,
    validation_ratio=0,
    test_ratio=0.15,
)

## Transform Data

In [5]:
# The full table is not needed once the split exists.
# NOTE: this cell is not re-runnable as-is: a second run fails on the `del`
# (name already gone) and on the column selection (columns already consumed).
del pandas_df

# Build one row per (user, track/artist/weekend key) holding the mean of
# `skipped`, i.e. the rate at which the user skipped that song by weekend status.
# NOTE(review): a missing track or artist makes the concatenated key NaN, and
# groupby drops NaN keys by default, so such rows vanish - confirm this is intended.
train_data = (
    train_data
    .rename(columns={'track-name': 'track', 'artist-name': 'artist'})
    .loc[:, ['userid', 'track', 'artist', 'weekend', 'skipped']]
    # merge track, artist, and weekend into a single key
    .assign(
        trackartist_weekend=lambda plays: (
            plays['track'] + '___' + plays['artist'] + '___' + plays['weekend'].map(str)
        )
    )
    .drop(['track', 'artist', 'weekend'], axis=1)
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

train_data.head()

Unnamed: 0,userid,trackartist_weekend,skipped
0,user_000001,'84 Pontiac Dream___Boards Of Canada___0,0.833333
1,user_000001,'84 Pontiac Dream___Boards Of Canada___1,0.5
2,user_000001,(It'S Just) Talk___Pat Metheny Group___0,0.0
3,user_000001,(It'S Just) Talk___Pat Metheny Group___1,0.0
4,user_000001,...Short Wave Lies___The Black Dog___0,0.0


In [6]:
# Save the raw (un-aggregated) test split. This must run before the cell that
# reassigns `test_data` to the aggregated skip-rate features.
test_data.to_csv("testset500.csv", index=False)

In [7]:
# Same feature construction as for the training split: one row per
# (user, track/artist/weekend key) holding the mean of `skipped`.
# NOTE(review): a missing track or artist makes the concatenated key NaN, and
# groupby drops NaN keys by default, so such rows vanish - confirm this is intended.
test_data = (
    test_data
    .rename(columns={'track-name': 'track', 'artist-name': 'artist'})
    .loc[:, ['userid', 'track', 'artist', 'weekend', 'skipped']]
    # merge track, artist, and weekend into a single key
    .assign(
        trackartist_weekend=lambda plays: (
            plays['track'] + '___' + plays['artist'] + '___' + plays['weekend'].map(str)
        )
    )
    .drop(['track', 'artist', 'weekend'], axis=1)
    # find rate at which users skip songs by weekend status
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

### Write to CSV

In [8]:
# Persist the aggregated skip-rate features, test split first, without the index column.
for features, csv_path in (
    (test_data, "test_features500.csv"),
    (train_data, "train_features500.csv"),
):
    features.to_csv(csv_path, index=False)