In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score as AUC
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf

In [2]:
def data_split(data, validation_ratio = 0.15, test_ratio = 0.15):
    """
    Function to split data into train, validation and test based on timestamps
    
    https://stackoverflow.com/questions/42395258/
    
    """
    train_ratio = 1 - validation_ratio - test_ratio
    
    data['time_rank'] = data.groupby('userid')['timestamp'].rank()
    data['user_all_songs_count'] = data['userid'].map(data.groupby('userid')['timestamp'].apply(len))
    data['scaled_time_rank'] = data['time_rank']/ data['user_all_songs_count']
    
    data.drop(['time_rank', 'user_all_songs_count'], axis=1, inplace=True)
    
    train_data = data.loc[data['scaled_time_rank'] <= train_ratio, :]
    validation_data = data.loc[(data['scaled_time_rank'] <= (1 - test_ratio)) & (data['scaled_time_rank'] > train_ratio), :]
    test_data = data.loc[(data['scaled_time_rank'] > (train_ratio + validation_ratio)), :]
    #train_data.drop(['scaled_time_rank', 'timestamp', 'userid'], axis=1, inplace=True)
    #validation_data.drop(['scaled_time_rank', 'timestamp', 'userid'], axis=1, inplace=True)
    #test_data.drop(['scaled_time_rank', 'timestamp', 'userid'], axis=1, inplace=True)
    return train_data, validation_data, test_data
    

In [3]:
pandas_df = pd.read_csv('data_engineered_features500.csv') 
pandas_df["gender_int"] = 0
pandas_df.loc[pandas_df["gender"] == "m", "gender_int"] = 1
pandas_df.loc[pandas_df["gender"] == "f", "gender_int"] = 2
#pandas_df.drop(["track-name","artist-name", "songlength", "gender"], axis=1, inplace=True)
#pandas_df.drop(["songlength", "gender"], axis=1, inplace=True)
pandas_df = pandas_df.fillna(0)

In [4]:
pandas_df.head()

Unnamed: 0,userid,track-name,artist-name,timestamp,weekday,hour,weekend,daytime,user-track-total-count,track-weekday-count,...,user-song-skip-percentage,user-artist-skips,user-artist-skip-percentage,global-song-skips,global-artist-skips,artist_total_count,song_total_count,global-song-skip-percentage,global-artist-skip-percentage,gender_int
0,user_000001,The Launching Of Big Face,Plaid & Bob Jaroc,2006-08-13 13:59:20,6,13,1,3,1,1,...,0.0,0,0.0,0,0,1,1,0.0,0.0,1
1,user_000001,Zn Zero,Plaid & Bob Jaroc,2006-08-13 14:03:29,6,14,1,3,1,1,...,0.0,0,0.0,0,0,2,1,0.0,0.0,1
2,user_000001,The Return Of Super Barrio - End Credits,Plaid & Bob Jaroc,2006-08-13 14:10:43,6,14,1,3,1,1,...,0.0,0,0.0,0,0,3,1,0.0,0.0,1
3,user_000001,Dayvan Cowboy,Boards Of Canada,2006-08-13 15:44:17,6,15,1,3,1,1,...,0.0,0,0.0,0,0,1,1,0.0,0.0,1
4,user_000001,A Moment Of Clarity,Boards Of Canada,2006-08-13 16:46:52,6,16,1,3,1,1,...,1.0,1,0.333333,1,1,3,1,1.0,1.0,1


In [5]:
train_data, validation_data, test_data = data_split(pandas_df, validation_ratio = 0.15, test_ratio = 0.15)

In [6]:
del pandas_df

In [7]:
train_y = np.array(train_data["skipped"])
validation_y = np.array(validation_data["skipped"])
test_y = np.array(test_data["skipped"])

train_userid = np.array(train_data["userid"])
validation_userid = np.array(validation_data["userid"])
test_userid = np.array(test_data["userid"])

train_timestamp = np.array(train_data["timestamp"])
validation_timestamp = np.array(validation_data["timestamp"])
test_timestamp = np.array(test_data["timestamp"])

train_track_name = np.array(train_data["track-name"])
validation_track_name = np.array(validation_data["track-name"])
test_track_name = np.array(test_data["track-name"])

train_artist_name = np.array(train_data["artist-name"])
validation_artist_name = np.array(validation_data["artist-name"])
test_artist_name = np.array(test_data["artist-name"])

train_weekend = np.array(train_data["weekend"])
validation_weekend = np.array(validation_data["weekend"])
test_weekend = np.array(test_data["weekend"])

train_data.drop(["skipped", 'scaled_time_rank', 'timestamp', 'userid', 
                 "track-name","artist-name", "songlength", "gender"], axis=1, inplace=True)
validation_data.drop(["skipped", 'scaled_time_rank', 'timestamp', 'userid', 
                      "track-name","artist-name", "songlength", "gender"], axis=1, inplace=True)
test_data.drop(["skipped", 'scaled_time_rank', 'timestamp', 'userid', 
                "track-name","artist-name", "songlength", "gender"], axis=1, inplace=True)


In [8]:
features_list = list(train_data)
train_data = (train_data - train_data.mean())/train_data.std()
validation_data = (validation_data - train_data.mean())/train_data.std()
test_data = (test_data - train_data.mean())/train_data.std()

In [9]:
np.shape(train_data)

(5369831, 25)

In [69]:
def neural_net_model():
    # create model
    model = tf.keras.models.Sequential()
    
    model.add(tf.keras.layers.Dense(2, input_dim=25, activation='relu',
                                    kernel_initializer='glorot_normal'))
    
    #model.add(tf.keras.layers.Dropout(0.3))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    # Compile model
    #sgd = tf.keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
    return model

In [70]:
# Define the Neural Network model
# Using Scikit-Learn wrapper in Keras, which is now in Tensorflow
deep_net = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=neural_net_model, epochs=5, 
                                                          batch_size=256, verbose=0)

### Train the NN

In [71]:
deep_net.fit(np.array(train_data), train_y)

<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a26bae0f0>

In [72]:
val_prediction = deep_net.predict_proba(np.array(validation_data))[:,1]

In [73]:
val_auc = AUC(validation_y, val_prediction)
print("Validation AUC Score:", val_auc)

Validation AUC Score: 0.705472154137


In [74]:
test_prediction = deep_net.predict_proba(np.array(test_data))[:,1]

In [75]:
test_auc = AUC(test_y, test_prediction)
print("Test AUC Score:", test_auc)

Test AUC Score: 0.679825944415


In [76]:
test_pred_df = pd.DataFrame({"userid": test_userid, "track-name": test_track_name,
                             "artist-name": test_artist_name, "weekend": test_weekend, 
                             "timestamp":test_timestamp, "skipped": test_y, 
                             "prediction": test_prediction})

In [77]:
test_pred_df.to_csv("NNResults2.csv", index=False)

### Train a Logistic Regression Model with L2 Regularization

In [19]:
linear_model = LR(penalty='l2', dual=False, tol=0.001, C=0.9, fit_intercept=True, 
                  intercept_scaling=1, class_weight=None, random_state=42, 
                  solver='liblinear', max_iter=50, multi_class='ovr', 
                  verbose=0, warm_start=False, n_jobs=1)

In [20]:
linear_model.fit(np.array(train_data), train_y)

LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)

In [21]:
linear_val_prediction = linear_model.predict_proba(np.array(validation_data))[:,1]

In [22]:
linear_val_auc = AUC(validation_y, linear_val_prediction)
print("Validation AUC Score:", linear_val_auc)

Validation AUC Score: 0.520982793133
