#  Categorical Feature Encoding Challenge II V1

In [1]:
import warnings
warnings.filterwarnings("ignore")


import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

pd.set_option("display.max_columns", 100)

In [2]:
def auc(y_true, y_pred):
    def fallback_auc(y_true, y_pred):
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [4]:
def create_model(data, catcols):    
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 100))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.25)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(500, activation="relu")(x)
    x = layers.Dropout(0.25)(x)
    x = layers.BatchNormalization()(x)
        
    x = layers.Dense(500, activation="relu")(x)
    x = layers.Dropout(0.25)(x)
    x = layers.BatchNormalization()(x)
    
    y = layers.Dense(2, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=y)
    return model

In [5]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
sample = pd.read_csv("./input/sample_submission.csv")

test["target"] = -1
data = pd.concat([train, test]).reset_index(drop=True)

In [6]:
data['ord_5_1']=data['ord_5'].str[0].str.lower()

data['ord_5_2']=data['ord_5'].str[1].str.lower()

data=data.drop(['ord_5'],axis=1)

In [9]:
from sklearn.impute import SimpleImputer
col=set(data.columns)-set(["id", "target"])

imp = SimpleImputer(strategy="most_frequent",verbose=2)
target=data.target

data=imp.fit_transform(data.drop(["id", "target"],axis=1))

In [8]:
features = [x for x in data.columns if x not in ["id", "target"]]

for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    data[feat] = lbl_enc.fit_transform(data[feat].fillna("-1").astype(str).values)

In [9]:
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)
test_data = [test.loc[:, features].values[:, k] for k in range(test.loc[:, features].values.shape[1])]

In [10]:
oof_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))
from keras.callbacks import ModelCheckpoint
NSPLIT=100
skf = StratifiedKFold(n_splits=NSPLIT)
for train_index, test_index in skf.split(train, train.target.values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train, y_test = X_train.target.values, X_test.target.values
    model = create_model(data, features)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])
    X_train = [X_train.loc[:, features].values[:, k] for k in range(X_train.loc[:, features].values.shape[1])]
    X_test = [X_test.loc[:, features].values[:, k] for k in range(X_test.loc[:, features].values.shape[1])]
    
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)

    rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=3, min_lr=1e-6, mode='max', verbose=1)
    
    checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
    checkpoint = ModelCheckpoint(checkpoint_name,monitor='val_loss',verbose = 1, save_best_only = True, mode ='auto')
    
    model.fit(X_train,
              utils.to_categorical(y_train),
              validation_data=(X_test, utils.to_categorical(y_test)),
              verbose=1,
              batch_size=1024,
              callbacks=[es, rlr, checkpoint],
              epochs=100
             )
    valid_fold_preds = model.predict(X_test)[:, 1]
    test_fold_preds = model.predict(test_data)[:, 1]
    oof_preds[test_index] = valid_fold_preds.ravel()
    test_preds += test_fold_preds.ravel()
    print(metrics.roc_auc_score(y_test, valid_fold_preds))
    K.clear_session()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Using TensorFlow backend.


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 594000 samples, validate on 6000 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.41251, saving model to Weights-001--0.41251.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.41251 to 0.40597, saving model to Weights-002--0.40597.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 0.40597 to 0.40474, saving model to Weights-003--0.40474.hdf5
Epoch 4/100
Epoch 00004: val_loss did not improve from 0.40474
Epoch 5/100
Epoch 00005: val_loss did not improve from 0.40474
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00006: val_loss did not improve from 0.40474
Epoch 7/100
Epoch 00007: val_loss did not improve from 0.40474
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.40474
Epoch 00008: ea

In [11]:
print("Overall AUC={}".format(metrics.roc_auc_score(train.target.values, oof_preds)))

Overall AUC=0.7772291232066377


In [12]:
test_preds /= NSPLIT
test_ids = test.id.values
print("Saving submission file")
submission = pd.DataFrame.from_dict({
    'id': test_ids,
    'target': test_preds
})
submission.to_csv("./input/submission.csv", index=False)

Saving submission file
