In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score as AUC
import warnings
warnings.filterwarnings("ignore")
# import tensorflow as tf

In [2]:
def data_split(data, validation_ratio = 0.15, test_ratio = 0.15):
    """
    Split data into train, validation and test sets by per-user time order.

    Within each ``userid`` the rows are ranked by ``timestamp`` and the rank is
    divided by that user's row count, giving a ``scaled_time_rank`` in (0, 1].
    For every user, the earliest ``1 - validation_ratio - test_ratio`` share of
    rows goes to train, the next ``validation_ratio`` share to validation and
    the latest ``test_ratio`` share to test.

    Approach adapted from https://stackoverflow.com/questions/42395258/

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``userid`` and ``timestamp`` columns. The frame is not
        modified by this function.
    validation_ratio : float, default 0.15
        Share of each user's rows assigned to the validation split.
    test_ratio : float, default 0.15
        Share of each user's rows assigned to the test split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, validation_data, test_data)``. Each frame holds the
        input columns without ``timestamp`` plus ``scaled_time_rank``. Rows
        with a missing timestamp get a NaN rank and therefore land in none of
        the three splits.

    Raises
    ------
    ValueError
        If a ratio is negative or the two ratios sum to more than 1.
    """
    if validation_ratio < 0 or test_ratio < 0 or validation_ratio + test_ratio > 1:
        raise ValueError(
            "validation_ratio and test_ratio must be non-negative and sum to at most 1"
        )

    train_ratio = 1 - validation_ratio - test_ratio
    # One shared boundary between validation and test: computing it twice in
    # two different ways can differ by a float rounding error and put a row
    # into both splits or into neither.
    validation_upper = 1 - test_ratio

    # Per-user chronological rank scaled by the user's row count. Computed as a
    # standalone Series so the caller's frame is never written to.
    timestamps_by_user = data.groupby('userid')['timestamp']
    scaled_time_rank = timestamps_by_user.rank() / timestamps_by_user.transform('size')

    # New frame: original columns minus the timestamp, plus the scaled rank.
    ranked = data.drop(columns=['timestamp']).assign(scaled_time_rank=scaled_time_rank)

    in_train = scaled_time_rank <= train_ratio
    in_validation = (scaled_time_rank > train_ratio) & (scaled_time_rank <= validation_upper)
    in_test = scaled_time_rank > validation_upper

    train_data = ranked.loc[in_train, :]
    validation_data = ranked.loc[in_validation, :]
    test_data = ranked.loc[in_test, :]
    return train_data, validation_data, test_data
    

In [None]:
# Read the engineered-features CSV; the file is decoded as ISO-8859-1.
pandas_df = pd.read_csv(
    'data_engineered_features500.csv',
    encoding="ISO-8859-1",
)
# Show the first rows as a quick sanity check of the load.
pandas_df.head()

In [None]:
# validation_ratio is 0 here, so only the train and test shares are non-zero
# (test_ratio = 0.15).
train_data, validation_data, test_data = data_split(
    pandas_df,
    validation_ratio=0,
    test_ratio=0.15,
)

## Transform Data

In [None]:
# Drop the name of the raw frame; the cells below only use the splits.
del pandas_df

# Build one key per (track, artist, weekend flag), discard the three source
# columns, then average every remaining column per user and key. Per the
# original note, this gives the rate at which users skip songs by weekend status.
train_data = (
    train_data
    .assign(
        trackartist_weekend=lambda frame: (
            frame['track'] + '___' + frame['artist'] + '___' + frame['weekend'].map(str)
        )
    )
    .drop(columns=['track', 'artist', 'weekend'])
    .groupby(['userid', 'trackartist_weekend'], as_index=False)
    .mean()
)

In [None]:
# merge track, artist, and weekend
test_data['trackartist_weekend'] = test_data['track']+'___'+test_data['artist']+'___'+test_data['weekend'].map(str)
test_data.drop(['track', 'artist', 'weekend'], axis=1, inplace=True)

# find rate at which users skip songs by weekend status
test_data = test_data.groupby(['userid', 'trackartist_weekend'], as_index=False).mean()

In [None]:
# Write both aggregated frames to CSV without the index column
# (test first, then train).
for frame, csv_path in (
    (test_data, "test_features500.csv"),
    (train_data, "train_features500.csv"),
):
    frame.to_csv(csv_path, index=False)