In [1]:
import numpy as np
import pandas as pd

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the pre-processed train and test tables (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('updated_train.csv', 'updated_test.csv')
)

In [7]:
# Inspect the columns of the training frame; the recorded output includes the target column 'click'.
train.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant', 'click',
       'hour__0', 'hour__1', 'hour__2', 'hour__3', 'hour__4', 'hour__5',
       'hour__6', 'hour__7', 'hour__8', 'hour__9', 'hour__10', 'hour__11',
       'hour__12', 'hour__13', 'hour__14', 'hour__15', 'hour__16', 'hour__17',
       'hour__18', 'hour__19', 'hour__20', 'hour__21', 'hour__22', 'hour__23',
       'weekday__0', 'weekday__1', 'weekday__2', 'weekday__3', 'weekday__4',
       'weekday__5', 'weekday__6', 'a', 'b', 'c', 'd', 'e', 'f', 'brow__Edge',
       'brow__Opera', 'brow__Safari', 'brow__ff', 'brow__gc', 'brow__ie',
       'Desktop', 'Mobile', 'Tablet'],
      dtype='object')

In [6]:
# Inspect the columns of the test frame.
# NOTE(review): per the recorded output, test has no 'click' column and spells four
# weekday dummies with a single underscore (weekday_1..weekday_4), whereas train uses
# weekday__1..weekday__4 -- those columns do not line up between the two frames.
test.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant', 'a', 'b',
       'c', 'd', 'e', 'f', 'weekday__0', 'weekday__5', 'weekday__6', 'hour__0',
       'hour__1', 'hour__2', 'hour__3', 'hour__4', 'hour__5', 'hour__6',
       'hour__7', 'hour__8', 'hour__9', 'hour__10', 'hour__11', 'hour__12',
       'hour__13', 'hour__14', 'hour__15', 'hour__16', 'hour__17', 'hour__18',
       'hour__19', 'hour__20', 'hour__21', 'hour__22', 'hour__23',
       'brow__Edge', 'brow__Opera', 'brow__Safari', 'brow__ff', 'brow__gc',
       'brow__ie', 'Desktop', 'Mobile', 'Tablet', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4'],
      dtype='object')

In [28]:
# create aggregate features
def _pair_count(frame, other_key, count_name):
    """Count the rows of `frame` for each (siteid, other_key) combination.

    Returns a frame with the columns ['siteid', other_key, count_name].
    """
    return frame.groupby(['siteid', other_key]).size().reset_index(name=count_name)


# Each count is computed separately on train and on test, under the same column name.
site_offer_count = _pair_count(train, 'offerid', 'site_offer_count')
site_offer_count_test = _pair_count(test, 'offerid', 'site_offer_count')

site_cat_count = _pair_count(train, 'category', 'site_cat_count')
site_cat_count_test = _pair_count(test, 'category', 'site_cat_count')

site_mcht_count = _pair_count(train, 'merchant', 'site_mcht_count')
site_mcht_count_test = _pair_count(test, 'merchant', 'site_mcht_count')

In [29]:
# joining all files
agg_df = [site_offer_count, site_cat_count, site_mcht_count]
agg_df_test = [site_offer_count_test, site_cat_count_test, site_mcht_count_test]

# merge() is called without `on`/`how`, so pandas performs an inner join on every
# column name the two frames have in common (here: siteid plus the second key).
# NOTE(review): an inner join can drop rows silently -- confirm row counts are unchanged.
for train_counts, test_counts in zip(agg_df, agg_df_test):
    train = train.merge(train_counts)
    test = test.merge(test_counts)

In [30]:
# Preview the first rows of train; the recorded output shows the three *_count columns merged in.
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,click,hour__0,hour__1,hour__2,...,brow__Safari,brow__ff,brow__gc,brow__ie,Desktop,Mobile,Tablet,site_offer_count,site_cat_count,site_mcht_count
0,IDMUUBNwz,2017-01-20 21:48:49,7615567.0,138618,11837,79474990,1,0,0,0,...,0,0,0,1,0,1,0,1,1,1
1,IDCLNGRY0,2017-01-20 11:23:06,-1.0,239178,40339,71040875,0,0,0,0,...,0,0,0,0,0,0,1,2,7413,159
2,IDzy5rQHC,2017-01-20 12:36:36,-1.0,665155,40339,71040875,0,0,0,0,...,0,0,0,0,0,0,1,1,7413,159
3,IDKeMPCuv,2017-01-17 12:25:30,-1.0,630542,40339,71040875,0,0,0,0,...,0,1,0,0,0,1,0,1,7413,159
4,ID6FpGizv,2017-01-18 07:39:12,-1.0,140335,40339,71040875,0,0,0,0,...,0,1,0,0,1,0,0,1,7413,159


In [44]:
# Re-check the training columns; per the recorded output the three *_count features are now at the end.
train.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant', 'click',
       'hour__0', 'hour__1', 'hour__2', 'hour__3', 'hour__4', 'hour__5',
       'hour__6', 'hour__7', 'hour__8', 'hour__9', 'hour__10', 'hour__11',
       'hour__12', 'hour__13', 'hour__14', 'hour__15', 'hour__16', 'hour__17',
       'hour__18', 'hour__19', 'hour__20', 'hour__21', 'hour__22', 'hour__23',
       'weekday__0', 'weekday__1', 'weekday__2', 'weekday__3', 'weekday__4',
       'weekday__5', 'weekday__6', 'a', 'b', 'c', 'd', 'e', 'f', 'brow__Edge',
       'brow__Opera', 'brow__Safari', 'brow__ff', 'brow__gc', 'brow__ie',
       'Desktop', 'Mobile', 'Tablet', 'site_offer_count', 'site_cat_count',
       'site_mcht_count'],
      dtype='object')

In [45]:
# select columns to choose
# Only weekday__0/5/6 are listed among the weekday dummies; the recorded test.columns
# output spells the other four as weekday_1..weekday_4 (presumably why they are left
# out -- confirm).
cols_to_use = (
    ['siteid', 'offerid', 'category', 'merchant']
    + list('abcdef')
    + ['weekday__%d' % day for day in (0, 5, 6)]
    + ['hour__%d' % hour for hour in range(24)]
    + ['brow__' + browser for browser in ('Edge', 'Opera', 'Safari', 'ff', 'gc', 'ie')]
    + ['Desktop', 'Mobile', 'Tablet']
    + ['site_offer_count', 'site_cat_count', 'site_mcht_count']
)

In [None]:
# standardise data before training
# The scaler is fitted on the training columns only and then applied to both frames.
scaler = StandardScaler()
scaler.fit(train[cols_to_use])
strain, stest = (
    scaler.transform(frame[cols_to_use])
    for frame in (train, test)
)

In [None]:
# train validation split
# Half of the rows are held out for validation; the seed keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    strain,
    train['click'],
    test_size=0.5,
    random_state=2017,
)

In [None]:
# Show the shape of each piece of the split, one per line.
for split_part in (X_train, X_valid, Y_train, Y_valid):
    print(split_part.shape)

In [None]:
# model architecture
def keras_model(train):
    """Build and compile the feed-forward classifier.

    Parameters
    ----------
    train : object exposing ``.shape``
        Only ``train.shape[1]`` is read; it sets the width of the input layer.

    Returns
    -------
    A compiled ``Sequential`` model:
    Dense(100, relu) -> Dense(30, relu) -> Dense(2, sigmoid),
    optimised with adam on binary cross-entropy.
    """
    input_dim = train.shape[1]
    classes = 2

    model = Sequential()
    model.add(Dense(100, activation='relu', input_shape=(input_dim,)))  # layer 1
    model.add(Dense(30, activation='relu'))  # layer 2
    model.add(Dense(classes, activation='sigmoid'))  # output
    # The metric is requested as 'acc' (not 'accuracy') so that it is logged as
    # 'acc' / 'val_acc' on both old and new Keras releases. Newer releases log a
    # metric under exactly the name given here, and this notebook's EarlyStopping
    # callback monitors 'val_acc'.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

# Stop training once the logged 'val_acc' value has gone 3 epochs without improving.
callback = EarlyStopping(
    patience=3,
    monitor='val_acc',
)

Now, let's understand the architecture of this neural network:
We have 49 input features (the columns listed in `cols_to_use`).
We connect these 49 features with 100 neurons in the first hidden layer (call it layer 1).
Picture it this way: each line connecting an input to a neuron is assigned a weight (initialised randomly).
Each neuron in layer 1 receives a weighted sum of the inputs ($b + w_0 x_0 + w_1 x_1 + \dots$) and passes it through the ReLU activation function.
ReLU works this way: if the weighted sum is less than zero, it is set to 0; if the weighted sum is positive, the value is kept as is.
The output from layer 1 is input to layer 2 which has 30 neurons. Again, the input passes through relu activation function.
Finally, the output of layer 2 is fed into the final layer, which has 2 neurons. The output passes through the sigmoid function, which makes sure that the predicted probabilities stay between 0 and 1; these are the output predictions.

In [None]:
# one hot target columns
# Both label vectors are expanded to one column per class (column 1 <-> click == 1).
Y_train, Y_valid = to_categorical(Y_train), to_categorical(Y_valid)

In [None]:
# train model
model = keras_model(X_train)
# Mini-batches of 1000 rows for at most 50 epochs; the early-stopping callback may end training sooner.
model.fit(
    X_train,
    Y_train,
    batch_size=1000,
    epochs=50,
    validation_data=(X_valid, Y_valid),
    callbacks=[callback],
    shuffle=True,
)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# check validation AUC
# Column 1 of both the one-hot targets and the network output corresponds to click == 1.
# `predict` is used rather than `predict_proba`: the latter was only a thin wrapper
# around `predict` and has been removed from newer Keras/TensorFlow releases.
vpreds = model.predict(X_valid)[:, 1]
roc_auc_score(y_true=Y_valid[:, 1], y_score=vpreds)

In [None]:
# predict on test data
# Keep only output column 1 (the click == 1 score). `predict` replaces `predict_proba`,
# which newer Keras/TensorFlow releases no longer provide; the returned array is the same.
test_preds = model.predict(stest)[:, 1]

In [None]:
# create submission file
# One row per test ID with its predicted click score, written without the index column.
submit = pd.DataFrame(
    {
        'ID': test['ID'],
        'click': test_preds,
    }
)
submit.to_csv('keras_starter.csv', index=False)