In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model,load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [3]:
def auc(y_true, y_pred):
    """Keras-compatible ROC AUC metric backed by scikit-learn.

    Wraps ``metrics.roc_auc_score`` in ``tf.py_function`` so the function can
    be passed to ``model.compile(metrics=[auc])``.

    Args:
        y_true: Tensor of ground-truth labels for the current batch.
        y_pred: Tensor of predicted scores for the current batch.

    Returns:
        A ``tf.double`` tensor with the batch AUC, or 0.5 when the score
        cannot be computed.
    """
    def fallback_auc(y_true, y_pred):
        # scikit-learn raises when the score is undefined (e.g. only one
        # class present in the batch); report chance level instead of
        # aborting training. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except Exception:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [4]:
def create_model(data, catcols):
    """Build an entity-embedding classifier with one input per categorical column.

    Every column in ``catcols`` gets its own single-value input, an embedding
    whose width is half the column's number of unique values in ``data``
    (rounded up, capped at 50), spatial dropout, and a reshape to a flat
    vector. The flattened embeddings are concatenated and passed through two
    Dense(300) / Dropout / BatchNormalization stages into a 2-unit softmax.

    Args:
        data: Column-indexable object whose columns expose ``nunique()``
            (the combined train+test DataFrame in this notebook).
        catcols: Iterable of column names; each name is also used as the
            name of that column's embedding layer.

    Returns:
        An uncompiled Keras ``Model`` taking one input per entry of
        ``catcols``.
    """
    model_inputs = []
    embedded_cols = []
    for col in catcols:
        cardinality = int(data[col].nunique())
        # Half the cardinality (rounded up), never more than 50 dimensions.
        emb_size = min(int(np.ceil(cardinality / 2)), 50)

        col_input = layers.Input(shape=(1,))
        col_vec = layers.Embedding(cardinality + 1, emb_size, name=col)(col_input)
        col_vec = layers.SpatialDropout1D(0.3)(col_vec)
        col_vec = layers.Reshape(target_shape=(emb_size,))(col_vec)

        model_inputs.append(col_input)
        embedded_cols.append(col_vec)

    hidden = layers.Concatenate()(embedded_cols)
    hidden = layers.BatchNormalization()(hidden)

    # Two identical fully-connected stages.
    for _ in range(2):
        hidden = layers.Dense(300, activation='relu')(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    class_probs = layers.Dense(2, activation='softmax')(hidden)

    return Model(inputs=model_inputs, outputs=class_probs)
    
    

In [38]:
# experiment
# Build a single model outside the cross-validation loop to inspect it.
# `data` and `features` are only assigned in the preprocessing cell further
# down, so that cell has to be run before this one.
model = create_model(data, features)
model

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


<tensorflow.python.keras.engine.training.Model at 0x7fc3c9c03860>

In [40]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [5]:
# Directory holding train.csv, test.csv and sample_submission.csv. The
# trailing slash matters: file paths are built by plain string interpolation.
data_dir = '../Data/Cat-in-the-dat/'

In [6]:
# Let pandas display up to 30 columns before truncating wide frames.
pd.set_option('display.max_columns', 30)

In [7]:
# Load the three competition CSVs from `data_dir`: the labelled training set,
# the unlabelled test set, and the sample submission file.
train = pd.read_csv(f'{data_dir}train.csv')
test = pd.read_csv(f'{data_dir}test.csv')
sample = pd.read_csv(f'{data_dir}sample_submission.csv')

## A Bit of EDA

In [8]:
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [9]:
train['target'].unique()

array([0, 1])

In [10]:
len(train)

600000

In [11]:
train['day'].nunique()

7

In [12]:
# Sanity check: the sample submission has exactly one row per test row.
len(sample) == len(test)

True

## Back to training loop

In [None]:
# Give the test rows a sentinel target so train and test can be stacked for
# joint label encoding and separated again afterwards.
test['target'] = -1
data = pd.concat([train, test]).reset_index(drop=True)

# Every column except the row id and the label is treated as a categorical feature.
features = [col for col in train.columns if col not in ['id', 'target']]

for feat in features:
    # Missing values become their own '-1' category; everything is cast to
    # str so the encoder sees a single dtype per column.
    label_enc = preprocessing.LabelEncoder()
    data[feat] = label_enc.fit_transform(
        data[feat].fillna('-1').astype(str).values)

# Number of distinct encoded values of nom_6 across train and test combined.
data.nom_6.nunique()

In [None]:
# NOTE(review): this cell repeats the preprocessing done by the preceding cell.
# Tag test rows with target == -1, stack them under the training rows, and
# integer-encode every feature column over the combined frame.
test['target'] = -1
data = pd.concat([train, test], ignore_index=True)

excluded = ('id', 'target')
features = [name for name in train.columns if name not in excluded]

for feat in features:
    label_enc = preprocessing.LabelEncoder()
    as_text = data[feat].fillna('-1').astype(str)
    data[feat] = label_enc.fit_transform(as_text.values)

In [None]:
len(train.columns)

In [None]:
len(test.columns)

In [None]:
len(data.columns)

In [None]:
len(train), len(test), len(data)

In [23]:
# Undo the stacking: rows whose target is the -1 sentinel are the test set,
# everything else is the (now label-encoded) training set.
train = data.loc[data.target != -1].reset_index(drop=True)
test = data.loc[data.target == -1].reset_index(drop=True)

# create_model builds one input per feature column, so the test matrix is
# handed over as a list of 1-D column arrays (rows of the transpose).
test_data = list(test.loc[:, features].values.T)

In [33]:
# test.loc[:, features]

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,1,1,1,1,2,1,2,1,4,3,373,148,141,175,2197,3,5,1,6,21,147,3,12
1,1,1,1,1,2,3,1,5,6,1,485,838,98,5,1108,1,5,2,14,14,0,2,11
2,1,1,1,1,2,1,1,1,6,4,612,677,102,17,813,1,2,6,9,14,13,2,9
3,2,1,1,1,1,3,2,1,3,1,967,1049,190,108,997,1,2,4,13,2,1,1,9
4,1,1,2,1,2,3,1,0,4,4,1157,686,191,31,372,1,1,5,15,10,15,3,6
5,1,1,1,0,2,3,4,1,3,4,998,1488,88,103,425,3,3,2,5,25,166,5,9
6,1,1,1,1,2,3,5,1,6,1,1204,60,165,106,2060,2,5,5,14,21,61,1,9
7,1,1,2,1,1,3,5,1,5,2,1139,513,140,82,651,1,3,1,6,0,152,2,9
8,1,1,1,2,2,3,6,3,6,1,922,853,49,127,2007,2,4,6,3,25,143,5,8
9,1,1,1,1,1,2,2,5,3,1,848,114,58,30,122,2,5,4,11,13,104,7,5


In [36]:
# First feature column of the test matrix.
# NOTE(review): the recorded output (400000) is not what this expression
# evaluates to; it looks left over from an earlier version of the cell, so
# re-run it to refresh the output.
((test.loc[:, features]).values[:, 0])

400000

In [35]:
(test.loc[:, features].values.shape)

(400000, 23)

In [None]:
# (train.target.values)

In [37]:
features

['bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month']

In [None]:
# Out-of-fold predictions for the training rows, and the running SUM of the
# per-fold test predictions (it is divided by the fold count when the
# submission is written).
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

skf = StratifiedKFold(n_splits=50)

for train_index, test_index in skf.split(train, train.target.values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train, y_test = X_train.target.values, X_test.target.values

    # A fresh model per fold; embedding sizes come from the combined frame.
    model = create_model(data, features)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics = [auc])

    # The model has one input per feature, so pass a list of 1-D column arrays.
    X_train = list(X_train.loc[:, features].values.T)
    X_test = list(X_test.loc[:, features].values.T)

    # Both callbacks watch the validation value of the custom `auc` metric.
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001,
                                 patience=5, verbose=1, mode='max',
                                 baseline=None, restore_best_weights=True)
    rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=3, min_lr=1e-06, mode='max',
                                      verbose=1)

    # The 2-unit softmax head expects one-hot targets.
    model.fit(X_train, utils.to_categorical(y_train),
              validation_data=(X_test, utils.to_categorical(y_test)),
              verbose=1, batch_size=1024, callbacks=[es, rlr],
              epochs = 100)

    # Column 1 of the softmax output is the probability of the positive class.
    # (Fixes the `vaild_fold_preds` misspelling that made the lines below
    # raise NameError.)
    valid_fold_preds = model.predict(X_test)[:, 1]
    test_fold_preds = model.predict(test_data)[:, 1]

    oof_preds[test_index] = valid_fold_preds.ravel()

    test_preds += test_fold_preds.ravel()

    print(metrics.roc_auc_score(y_test, valid_fold_preds))
    # Drop the finished fold's graph/state before building the next model.
    K.clear_session()

In [None]:
# Score the stitched-together out-of-fold predictions against the training labels.
overall_auc = metrics.roc_auc_score(train.target.values, oof_preds)
print('Overall AUC = {}'.format(overall_auc))

In [None]:
# Average the summed per-fold test predictions. The fold count is read from
# the splitter instead of being hard-coded, and the average goes into a new
# variable so that re-running this cell does not divide `test_preds` again.
n_folds = skf.get_n_splits()
avg_test_preds = test_preds / n_folds
test_ids = test.id.values
print("saving submisssion file")

submission = pd.DataFrame.from_dict({'id': test_ids,
                                    'target': avg_test_preds})

submission.to_csv('submission.csv', index=False)