In [1]:
'''
    This script provides code for training a neural network with entity embeddings
    of the 'cat' variables. For more details on entity embedding, see:
    https://github.com/entron/entity-embedding-rossmann
    
    8-Fold training with 3 averaged runs per fold. Results may improve with more folds & runs.
'''

import numpy as np
import pandas as pd

#random seeds for stochastic parts of neural network 
np.random.seed(10)
from tensorflow import set_random_seed
set_random_seed(15)

from keras.models import Sequential, Model
from keras.layers import *
from keras.layers.embeddings import Embedding

from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

PATH='/home/amit.dingare/safedriverprediction/'

In [3]:
#Data loading & preprocessing
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

X_train, y_train = df_train.iloc[:,2:], df_train.target
X_test = df_test.iloc[:,1:]

In [4]:
df_train.iloc[:,2:].shape

(595212, 57)

In [5]:
cols_use = [c for c in X_train.columns if (not c.startswith('ps_calc_'))]

In [6]:
len(cols_use)

37

In [7]:
X_train = X_train[cols_use]
X_test = X_test[cols_use]

col_vals_dict = {c: list(X_train[c].unique()) for c in X_train.columns if c.endswith('_cat')}

In [8]:
catcols = [i for i in X_train.columns.tolist() if '_cat' in i]

In [9]:
len(catcols)

14

In [10]:
catcols

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [11]:
X_train[catcols].head()

Unnamed: 0,ps_ind_02_cat,ps_ind_04_cat,ps_ind_05_cat,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat
0,2,1,0,10,1,-1,0,1,4,1,0,0,1,12
1,1,0,0,11,1,-1,0,-1,11,1,1,2,1,19
2,4,1,0,7,1,-1,0,-1,14,1,1,2,1,60
3,1,0,0,7,1,0,0,1,11,1,1,3,1,104
4,2,1,0,11,1,-1,0,-1,14,1,1,2,1,82


In [12]:
embed_cols = []
cat_sz = {}
for c in col_vals_dict:
    if len(col_vals_dict[c])>2:
        embed_cols.append(c)
        print(c + ': %d values' % len(col_vals_dict[c])) #look at value counts to know the embedding dimensions
        cat_sz[c] = len(col_vals_dict[c])
print('\n')

ps_ind_02_cat: 5 values
ps_ind_04_cat: 3 values
ps_ind_05_cat: 8 values
ps_car_01_cat: 13 values
ps_car_02_cat: 3 values
ps_car_03_cat: 3 values
ps_car_04_cat: 10 values
ps_car_05_cat: 3 values
ps_car_06_cat: 18 values
ps_car_07_cat: 3 values
ps_car_09_cat: 6 values
ps_car_10_cat: 3 values
ps_car_11_cat: 104 values




In [13]:
cat_sz

{'ps_ind_02_cat': 5,
 'ps_ind_04_cat': 3,
 'ps_ind_05_cat': 8,
 'ps_car_01_cat': 13,
 'ps_car_02_cat': 3,
 'ps_car_03_cat': 3,
 'ps_car_04_cat': 10,
 'ps_car_05_cat': 3,
 'ps_car_06_cat': 18,
 'ps_car_07_cat': 3,
 'ps_car_09_cat': 6,
 'ps_car_10_cat': 3,
 'ps_car_11_cat': 104}

In [14]:
emb_szs = {c:(cat_sz[c], min(50, (cat_sz[c]+1)//2)) for c in cat_sz}

In [15]:
emb_szs_items = [d for d in emb_szs.values()]

In [16]:
emb_szs

{'ps_ind_02_cat': (5, 3),
 'ps_ind_04_cat': (3, 2),
 'ps_ind_05_cat': (8, 4),
 'ps_car_01_cat': (13, 7),
 'ps_car_02_cat': (3, 2),
 'ps_car_03_cat': (3, 2),
 'ps_car_04_cat': (10, 5),
 'ps_car_05_cat': (3, 2),
 'ps_car_06_cat': (18, 9),
 'ps_car_07_cat': (3, 2),
 'ps_car_09_cat': (6, 3),
 'ps_car_10_cat': (3, 2),
 'ps_car_11_cat': (104, 50)}

In [17]:
def build_embedding_network_condensed():
    
    inputs = []
    embeddings = []
    
    for i in emb_szs.keys():
        input = Input(shape=(1,))
        x,y = emb_szs[i]
        embedding = Embedding(x, y, input_length=1)(input)
        embedding = Reshape(target_shape=(y,),name=str(i)+'_Embedding')(embedding)
        inputs.append(input)
        embeddings.append(embedding)
    
    input_numeric = Input(shape=(24,))
    embedding_numeric = Dense(16)(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate(name='Concatenate_Embedding')(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    output = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs, output)
    
    #print ("Model Summary\n".format(model.summary()))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [18]:
#converting data to list format to match the network structure
def preproc(X_train, X_val, X_test):

    input_list_train = []
    input_list_val = []
    input_list_test = []
    
    #the cols to be embedded: rescaling to range [0, # values)
    for c in embed_cols:
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)
     
    #the rest of the columns
    other_cols = [c for c in X_train.columns if (not c in embed_cols)]
    input_list_train.append(X_train[other_cols].values)
    input_list_val.append(X_val[other_cols].values)
    input_list_test.append(X_test[other_cols].values)
    
#     print ('\ninput_list_train:{}'.format(input_list_train))
#     print ('\ninput_list_val:{}'.format(input_list_val))
#     print ('\ninput_list_test:{}'.format(input_list_test))
    
    return input_list_train, input_list_val, input_list_test

In [19]:
#gini scoring function from kernel at: 
#https://www.kaggle.com/tezdhar/faster-gini-calculation
def ginic(actual, pred):
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

In [20]:
def gini_normalizedc(a, p):
    return ginic(a, p) / ginic(a, a)

In [21]:
#network training
K = 10
runs_per_fold = 5
n_epochs = 15

cv_ginis = []
full_val_preds = np.zeros(np.shape(X_train)[0])
y_preds = np.zeros((np.shape(X_test)[0],K))

kfold = StratifiedKFold(n_splits = K, 
                            random_state = 231, 
                            shuffle = True)    

In [22]:
for i, (f_ind, outf_ind) in enumerate(kfold.split(X_train, y_train)):

    X_train_f, X_val_f = X_train.loc[f_ind].copy(), X_train.loc[outf_ind].copy()
    y_train_f, y_val_f = y_train[f_ind], y_train[outf_ind]
    
    X_test_f = X_test.copy()
    
    #upsampling adapted from kernel: 
    #https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283
    pos = (pd.Series(y_train_f == 1))
    
    # Add positive examples
    X_train_f = pd.concat([X_train_f, X_train_f.loc[pos]], axis=0)
    y_train_f = pd.concat([y_train_f, y_train_f.loc[pos]], axis=0)
    
    # Shuffle data
    idx = np.arange(len(X_train_f))
    np.random.shuffle(idx)
    X_train_f = X_train_f.iloc[idx]
    y_train_f = y_train_f.iloc[idx]
    
    #preprocessing
    proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(X_train_f, X_val_f, X_test_f)
    
    #track oof prediction for cv scores
    val_preds = 0
    
    for j in range(runs_per_fold):
    
        NN = build_embedding_network_condensed()
        
        NN.fit(proc_X_train_f, y_train_f.values, epochs=n_epochs, batch_size=4096, verbose=0)
   
        val_preds += NN.predict(proc_X_val_f)[:,0] / runs_per_fold
        y_preds[:,i] += NN.predict(proc_X_test_f)[:,0] / runs_per_fold
        
    full_val_preds[outf_ind] += val_preds
        
    cv_gini = gini_normalizedc(y_val_f.values, val_preds)
    cv_ginis.append(cv_gini)
    print ('\nFold %i prediction cv gini: %.5f\n' %(i,cv_gini))
    
print ("Model Summary for the final fold\n".format(NN.summary()))
print('Mean out of fold gini: %.5f' % np.mean(cv_ginis))
print('Full validation gini: %.5f' % gini_normalizedc(y_train.values, full_val_preds))

y_pred_final = np.mean(y_preds, axis=1)

df_sub = pd.DataFrame({'id' : df_test.id, 
                       'target' : y_pred_final},
                       columns = ['id','target'])
df_sub.to_csv('NN_EntityEmbed_10fold-sub.csv', index=False)

pd.DataFrame(full_val_preds).to_csv('NN_EntityEmbed_10fold-val_preds.csv',index=False)


Fold 0 prediction cv gini: 0.28772


Fold 1 prediction cv gini: 0.26847


Fold 2 prediction cv gini: 0.25056


Fold 3 prediction cv gini: 0.28482


Fold 4 prediction cv gini: 0.28610


Fold 5 prediction cv gini: 0.26206


Fold 6 prediction cv gini: 0.27961


Fold 7 prediction cv gini: 0.28448


Fold 8 prediction cv gini: 0.26663


Fold 9 prediction cv gini: 0.28184

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_687 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_688 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_689 (InputLayer)          (Non

In [23]:
embedded_features_model = Model(inputs=NN.input, outputs=NN.get_layer('Concatenate_Embedding').output) 
embedded_features_train = embedded_features_model.predict(proc_X_train_f)[:,:-13]

In [24]:
embedded_features_train.shape

(555217, 96)

In [25]:
embedded_features_model.predict(proc_X_train_f)[:,:-13]

array([[-0.17373,  0.09946,  0.10456, -0.08657, -0.25705, -0.12835, -0.1625 ,  0.0929 ,  0.16287,  0.0916 ,
        -0.07949, -0.03925, -0.07207, -0.09129, -0.04177,  0.00712,  0.26455,  0.07682,  0.17709, -0.12988,
        ..., -0.02699,  0.05418,  0.1136 , -0.01724,  0.0436 , -0.07899, -0.10464, -0.01904, -0.07723,
         0.03296, -0.09001, -0.13234, -0.09115, -0.10347, -0.08182, -0.00458,  0.03011, -1.40777,  0.26317,
         2.46061],
       [-0.17373,  0.09946,  0.10456,  0.11014,  0.12718, -0.12835, -0.1625 ,  0.0929 ,  0.16287,  0.0916 ,
        -0.07949, -0.03925, -0.07207, -0.09129, -0.04177,  0.00712, -0.10368, -0.0181 ,  0.17709, -0.12988,
        ..., -0.09136,  0.02739, -0.04026, -0.07989, -0.01876,  0.03388,  0.10495, -0.03836, -0.05234,
        -0.06513, -0.0303 , -0.05424,  0.01626,  0.03762,  0.04872,  0.05526, -0.00815, -1.48212,  0.35387,
         2.57248],
       [ 0.27252, -0.13356, -0.11847, -0.08657, -0.25705, -0.12835, -0.1625 ,  0.0929 ,  0.16287,  0.0916 ,


In [None]:
len(emb_szs)