Predict Cat
Problem statement: Predict Cat given features that include a lot of categorical types

Type: Binary Classification

Performance metric: Area Under the ROC Curve (AUC score)

What I did in this notebook:

This is my 2nd notebook on this competition. My previous notebook, which experiments with several categorical encodings and feature engineering, can be found [here](https://www.kaggle.com/ajisamudra/experimenting-with-categorical-encoding).
1. Modelling using CatBoostClassifier
2. Modelling using Multi Layer Perceptron / Deep Learning

In [1]:
# Library
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
import category_encoders as cat_encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import History
from tensorflow.keras.callbacks import LearningRateScheduler

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Ordinal Encoding for features ord_1, ord_2, ord_3, ord_4

def encode_ord_1(x):
    """Return the ordinal position of an ``ord_1`` label.

    The labels are ranked Novice (0), Contributor (1), Expert (2),
    Master (3), Grandmaster (4). Anything else, including a missing
    value, is encoded as -1.
    """
    ranking = ("Novice", "Contributor", "Expert", "Master", "Grandmaster")
    for position, label in enumerate(ranking):
        if x == label:
            return position
    return -1
    

def encode_ord_2(x):
    """Return the ordinal position of an ``ord_2`` temperature label.

    Freezing (0) < Cold (1) < Warm (2) < Hot (3) < Boiling Hot (4)
    < Lava Hot (5). Any other value, including a missing one, gives -1.
    """
    scale = ("Freezing", "Cold", "Warm", "Hot", "Boiling Hot", "Lava Hot")
    # First matching position, or -1 when nothing on the scale matches.
    return next((level for level, label in enumerate(scale) if x == label), -1)

def encode_ord_3(x):
    """Return the alphabet position of a single lowercase letter.

    "a" maps to 0, "b" to 1, ... "z" to 25. Every other input (uppercase
    letters, longer strings, the empty string, missing values, non-strings)
    is encoded as -1.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    # Restrict to one-character strings: str.find would otherwise report a
    # position for "" or for multi-letter substrings such as "ab".
    if isinstance(x, str) and len(x) == 1:
        return alphabet.find(x)  # find() already yields -1 when absent
    return -1
    
# Label encoding for bin_3 and bin_4
def encode_bin_3(x):
    """Encode the ``bin_3`` flag: "T" -> 1, "F" -> 0, anything else -> -1."""
    return 1 if x == "T" else (0 if x == "F" else -1)

def encode_bin_4(x):
    """Encode the ``bin_4`` flag: "Y" -> 1, "N" -> 0, anything else -> -1."""
    for label, code in (("Y", 1), ("N", 0)):
        if x == label:
            return code
    return -1

In [3]:
# Read file
train = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')
test = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/test.csv')

# Drop id on train and test dataset
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

# Select only baseline features
features = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'day', 'month', 'nom_0',
                     'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
                     'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']

train_target = train['target']
# Take explicit copies: every statement below assigns into these frames, and
# assigning into a column-subset slice is ambiguous (view vs. copy) in pandas.
train = train[features].copy()
test = test[features].copy()

# Replacing nom_6 value 'a885aacec' in test_dataset with 'missing_value' because the value is not seen at training data
test.loc[test.nom_6 == "a885aacec", 'nom_6'] = "missing_value"

# Label encode first bin_3 and bin_4 (missing values become -1, so these two
# columns contain no NaN by the time they reach the imputer below)
train['bin_3'] = train.bin_3.apply(encode_bin_3)
train['bin_4'] = train.bin_4.apply(encode_bin_4)
test['bin_3'] = test.bin_3.apply(encode_bin_3)
test['bin_4'] = test.bin_4.apply(encode_bin_4)

# Impute with constant, one column at a time
columns = train.columns

for i in columns:

    # Object columns use SimpleImputer's default constant; numeric columns are filled with -1
    if train[i].dtype == object:
        imputer = SimpleImputer(missing_values=np.nan, strategy= 'constant', add_indicator= True)
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy= 'constant', fill_value= -1, add_indicator= True)

    imputer.fit(train[i].to_numpy().reshape(-1,1))

    # With add_indicator=True the transform output can carry an extra
    # missing-indicator column after the imputed values. Only the imputed
    # values (column 0) belong in the feature column, so select them explicitly
    # instead of assigning a 2-D array to a single column.
    train[i] = imputer.transform(train[i].to_numpy().reshape(-1,1))[:, 0]
    test[i] = imputer.transform(test[i].to_numpy().reshape(-1,1))[:, 0]

In [4]:
# Feature Engineering
# Every feature in this section is derived row-wise, so the same recipe is
# applied to the train frame and then to the test frame.
nom_names = ['nom_1', 'nom_2', 'nom_3', 'nom_4']

for frame in (train, test):
    # Create interactions: "<left>_<right>" string for every unordered pair of nom_1..nom_4
    for first in range(len(nom_names)):
        for second in range(first + 1, len(nom_names)):
            left, right = nom_names[first], nom_names[second]
            frame[left + "_" + right] = frame[left] + "_" + frame[right]

    # Interaction Exp 3: sum and product of the five binary columns
    frame['bin_all_sum'] = frame['bin_0'] + frame['bin_1'] + frame['bin_2'] + frame['bin_3'] + frame['bin_4']
    frame['bin_all_mul'] = frame['bin_0'] * frame['bin_1'] * frame['bin_2'] * frame['bin_3'] * frame['bin_4']

    # Create cyclical features from day and month (sine at two periods each,
    # plus a rescaled copy of the longer-period sine)
    day_angle = 2*np.pi*frame['day']
    month_angle = 2*np.pi*frame['month']
    frame['day_sin7'] = np.sin(day_angle/7)
    frame['day_sin14'] = np.sin(day_angle/14)
    frame['day_sin14add'] = np.sin(day_angle/14)*3.5
    frame['month_sin12'] = np.sin(month_angle/12)
    frame['month_sin24'] = np.sin(month_angle/24)
    frame['month_sin24_add'] = np.sin(month_angle/24)*6

# Ordinal Encoding
# ord_1..ord_3 go through their dedicated encoders; ord_4 is lower-cased first
# and then reuses the single-letter encoder written for ord_3.
for frame in (train, test):
    frame['ord_1'] = frame['ord_1'].apply(encode_ord_1)
    frame['ord_2'] = frame['ord_2'].apply(encode_ord_2)
    frame['ord_3'] = frame['ord_3'].apply(encode_ord_3)
    frame['ord_4'] = frame['ord_4'].str.lower().apply(encode_ord_3)

# (column, divisor) pairs; each divisor is the largest code its encoder returns
ord_divisors = [('ord_1', 4), ('ord_2', 5), ('ord_3', 25), ('ord_4', 25)]

for frame in (train, test):
    # Ordinal Encoding Square
    # First normalize with maximum label for faster convergence,
    # subtract 0.5 and square it
    for name, divisor in ord_divisors:
        frame[name + '_sqr_mid'] = ((frame[name] / divisor) - 0.5)**2
    # Square but not centered
    for name, divisor in ord_divisors:
        frame[name + '_sqr'] = (frame[name] / divisor)**2
    # Log Transform
    for name, divisor in ord_divisors:
        frame[name + '_log'] = np.log1p(frame[name] / divisor)

# Day & Month transforms: centered square, plain square and log of the scaled value
# (column, divisor) pairs used to scale day and month before transforming
calendar_divisors = [('day', 7), ('month', 12)]

for frame in (train, test):
    # Day & Month centered square
    for name, divisor in calendar_divisors:
        frame[name + '_sqr_mid'] = ((frame[name] / divisor) - 0.5)**2

    # Day & Month not-centered square
    # Stored as '<name>_sqr'. These used to be written to '<name>_sqr_mid',
    # which silently replaced the centered features computed just above.
    # NOTE: this adds two columns, so the positional PCA slice later in this
    # cell (iloc[:, 0:67]) covers a window shifted by two columns.
    for name, divisor in calendar_divisors:
        frame[name + '_sqr'] = (frame[name] / divisor)**2

    # Day & Month Log transform
    for name, divisor in calendar_divisors:
        frame[name + '_log'] = np.log1p(frame[name] / divisor)

# Ord 1-4 * day and month
ord_names = ['ord_1', 'ord_2', 'ord_3', 'ord_4']

# Products are created month-first, then day, for each frame
for frame in (train, test):
    for period in ('month', 'day'):
        for name in ord_names:
            frame[period + '_' + name] = frame[name] * frame[period]

# Transforms of each product. Both frames are scaled by the TRAIN maximum of
# the product column so train and test share the same scale.
for period in ('month', 'day'):
    product_columns = [period + '_' + name for name in ord_names]

    # Centered Squared
    for product in product_columns:
        peak = train[product].max()
        train[product + '_sqr_mid'] = ((train[product] / peak) - 0.5)**2
        test[product + '_sqr_mid'] = ((test[product] / peak) - 0.5)**2

    # Not Centered Squared
    for product in product_columns:
        peak = train[product].max()
        train[product + '_sqr'] = (train[product] / peak)**2
        test[product + '_sqr'] = (test[product] / peak)**2

    # Log
    for product in product_columns:
        peak = train[product].max()
        train[product + '_log'] = np.log1p(train[product] / peak)
        test[product + '_log'] = np.log1p(test[product] / peak)



# Update columns (snapshot taken before the *_target columns are added below)
columns = train.columns

# Target Encoding to only object features (using target)
# Each encoder is fitted on the train column and then applied to both frames.
for i in columns:
    if train[i].dtype == object:
        target_encoder = cat_encoder.TargetEncoder(smoothing = 0.1)
        target_encoder.fit(train[i], train_target)
        train[i+"_target"] = target_encoder.transform(train[i])
        test[i+"_target"] = target_encoder.transform(test[i])

# Target encoder for day and month
# day and month are numeric here. When `cols` is not given, TargetEncoder only
# encodes string columns, so the column must be named explicitly or the
# "encoded" output is just a copy of the input.
for period in ('day', 'month'):
    target_encoder = cat_encoder.TargetEncoder(cols = [period], smoothing = 0.1)
    target_encoder.fit(train[[period]], train_target)
    train[period + '_target'] = target_encoder.transform(train[[period]])[period]
    test[period + '_target'] = target_encoder.transform(test[[period]])[period]

# Update columns
columns = train.columns

# Label Encoding to only object features
# The encoder learns its classes from the train column and is then applied to
# both frames, replacing the string values in place.
object_columns = [name for name in columns if train[name].dtype == object]
for name in object_columns:
    label_encoder = LabelEncoder().fit(train[name])
    train[name] = label_encoder.transform(train[name])
    test[name] = label_encoder.transform(test[name])

# ord_5 features, built after label encoding has turned ord_5 into integer codes.
# All scaling uses the TRAIN maximum so both frames share one scale.
ord_5_peak = train.ord_5.max()

for frame in (train, test):
    ord_5_scaled = frame.ord_5 / ord_5_peak
    # Create ordinal square after label encoding - centered
    frame['ord_5_sqr_mid'] = (ord_5_scaled - 0.5)**2
    # Create ordinal square after label encoding - not centered
    frame['ord_5_sqr'] = ord_5_scaled**2
    # Log transform
    frame['ord_5_log'] = np.log1p(ord_5_scaled)

    # Ord_5 * day & month
    frame['month_ord_5'] = frame.ord_5 * frame.month
    frame['day_ord_5'] = frame.ord_5 * frame.day

month_ord_5_peak = train.month_ord_5.max()
day_ord_5_peak = train.day_ord_5.max()

for frame in (train, test):
    month_scaled = frame.month_ord_5 / month_ord_5_peak
    day_scaled = frame.day_ord_5 / day_ord_5_peak

    # Centered squared
    frame['month_ord_5_sqr_mid'] = (month_scaled - 0.5)**2
    frame['day_ord_5_sqr_mid'] = (day_scaled - 0.5)**2

    # Not centered squared
    frame['month_ord_5_sqr'] = month_scaled**2
    frame['day_ord_5_sqr'] = day_scaled**2

    # Log
    frame['month_ord_5_log'] = np.log1p(month_scaled)
    frame['day_ord_5_log'] = np.log1p(day_scaled)


# nom_1 .. nom_9 (label-encoded above) * day & month
noms = ['nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
for i in noms:
    train["month_"+i] = train[i] * train.month
    train["day_"+i] = train[i] * train.day
    # Test rows must be multiplied by the test set's own month. Multiplying by
    # train.month pairs each test row with an unrelated train row via the index.
    test["month_"+i] = test[i] * test.month
    test["day_"+i] = test[i] * test.day

def centered_square(x, xmax):
    """Scale ``x`` by ``xmax``, shift it down by 0.5 and square the result."""
    shifted = (x / xmax) - 0.5
    return shifted**2

def square(x, xmax):
    """Scale ``x`` by ``xmax`` and square the result (no centering)."""
    ratio = x / xmax
    return ratio**2

def transform_log(x, xmax):
    """Scale ``x`` by ``xmax`` and return ``log(1 + scaled)`` via ``np.log1p``."""
    scaled = x / xmax
    return np.log1p(scaled)

# Names of the nom * month and nom * day product columns (nom_1 .. nom_9)
month_noms = ['month_nom_%d' % k for k in range(1, 10)]
day_noms = ['day_nom_%d' % k for k in range(1, 10)]

# For each transform (centered square, square, log) process the month products
# first and then the day products. Both frames are scaled by the TRAIN maximum.
for suffix, transform in (("_sqr_mid", centered_square),
                          ("_sqr", square),
                          ("_log", transform_log)):
    for product_columns in (month_noms, day_noms):
        for product in product_columns:
            peak = train[product].max()
            train[product + suffix] = transform(train[product], peak)
            test[product + suffix] = transform(test[product], peak)

# bin_0, bin_1, bin_2, bin_3, bin_4 * day & month
bins = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
for i in bins:
    train["month_"+i] = train[i] * train.month
    train["day_"+i] = train[i] * train.day
    # Test rows must be multiplied by the test set's own month. Multiplying by
    # train.month pairs each test row with an unrelated train row via the index.
    test["month_"+i] = test[i] * test.month
    test["day_"+i] = test[i] * test.day

# Names of the bin * month and bin * day product columns (bin_0 .. bin_4)
month_bins = ['month_bin_%d' % k for k in range(5)]
day_bins = ['day_bin_%d' % k for k in range(5)]

# Centered square first, then plain square; within each, month products come
# before day products. Both frames are scaled by the TRAIN maximum.
# (No log transform is computed for the bin products.)
for suffix, transform in (("_sqr_mid", centered_square), ("_sqr", square)):
    for product_columns in (month_bins, day_bins):
        for product in product_columns:
            peak = train[product].max()
            train[product + suffix] = transform(train[product], peak)
            test[product + suffix] = transform(test[product], peak)

# Doing decomposition on the first 67 columns only (positions 0-66), reduced to 2 components
# NOTE(review): the original comment said "68 features" but the slice 0:67
# selects 67 columns - confirm which was intended. The slice is left unchanged.
pca = PCA(n_components = 2)
pca.fit(train.iloc[:, 0:67])
decom_X_train = pca.transform(train.iloc[:, 0:67])
decom_X_test = pca.transform(test.iloc[:,0:67])

# Clustering on the 2-D projection, then target encoding of the cluster id
kmeans = KMeans(n_clusters = 2, random_state = 41)
kmeans.fit(decom_X_train)
train_cluster = kmeans.predict(decom_X_train)
test_cluster = kmeans.predict(decom_X_test)

# Build the cluster frames on the parent frame's index so the column-wise
# concat lines up row for row whatever index the parent carries.
train = pd.concat([ train,  pd.DataFrame(train_cluster, columns = ['cluster'], index = train.index) ], axis = 1)
test = pd.concat([ test,  pd.DataFrame(test_cluster, columns = ['cluster'], index = test.index)], axis = 1)

# Target encoder for cluster
# The cluster id is an integer column. When `cols` is not given, TargetEncoder
# only encodes string columns, so the column is named explicitly.
target_encoder = cat_encoder.TargetEncoder(cols = ['cluster'], smoothing = 0.1)
target_encoder.fit(train[['cluster']], train_target)
train['cluster_target'] = target_encoder.transform(train[['cluster']])['cluster']
test['cluster_target'] = target_encoder.transform(test[['cluster']])['cluster']

# Update columns (kept because scaling below turns the frames into plain arrays)
columns = train.columns

# Standardize the values: statistics are fitted on train and reused for test
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [5]:
# # Before experimenting with undersampling and LGBM, we need to split training for validation
# X_train, X_val, y_train, y_val = train_test_split(train,
#                                                   train_target,
#                                                   test_size = 0.2,
#                                                   stratify = train_target,
#                                                   random_state = 41)

In [6]:
# def evaluate_auc(model, X_train, X_val, y_train, y_val):
#     model.fit(X=X_train,y =y_train)
#     y_pred = model.predict_proba(X_val)
    
#     score = roc_auc_score(y_val, y_pred[:,1])
#     print("AUC Score on Validation: {}".format(score))
    
# # tuned_lgbm = LGBMClassifier(n_estimators = 300, lambda_l1 = 0.5, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)
# # evaluate_auc(tuned_lgbm, X_train, X_val, y_train, y_val)
# # AUC Score on Validation: 0.7955373141802544 -> all features 217
# # AUC Score on Validation: 0.7955373141802544 -> Adding cluster target
# # AUC Score on Validation: 0.7957226405432072 -> selectfrommodel
# # AUC Score on Validation: 0.7956081111848645 -> applying lambda_l1

In [7]:
# plain_logreg = LogisticRegression(random_state = 41)
# evaluate_auc(plain_logreg, X_train_new, X_val_new, y_train, y_val)
# # AUC Score on Validation: 0.7909603468202064 -> all features 217
# # AUC Score on Validation: 0.7909445152179959 Adding cluster target 
# # AUC Score on Validation: 0.786667054899717 -> selectfrommodel

In [8]:
# # Correlation Matrix
# cor_mtx = pd.concat([ pd.DataFrame(train, columns = columns), train_target], axis = 1).corr()
# print("Pearson correlation coefficient:")

# cor_mtx[ (cor_mtx.target >= 0.08) | (cor_mtx.target <= -0.08)]['target'].sort_values(ascending = False)

In [9]:
# # PCA for dimensional reduction

# # Let's build a PCA that retains a fixed share of the variance in the data (n_components below is 0.99; the plot further down marks a 95% cut-off)
# pca = PCA(n_components=0.99)
# pca.fit(train)

In [10]:
# decom_train = pca.transform(train)

In [11]:
# #Plotting the Cumulative Summation of the Explained Variance
# plt.figure()
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('Number of Components')
# plt.ylabel('Variance (%)') #for each component
# plt.title('Data Explained Variance')
# plt.axhline(y=0.95, color = 'r', linestyle='--')
# plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize = 12)
# plt.show()

# # Based on the plot, beyond 70 components there is no significant increase in the share of total variance explained
# # I decided to use only 70 components

In [12]:
# # Tuned Logistic Regression
# # Training model
# logit = LogisticRegression(C=0.05, solver="lbfgs", max_iter=5000)
# logit.fit(train, train_target)

# # Predict
# y_pred = logit.predict_proba(test)

In [13]:
# # Make file for submission
# exp6_submission = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
# exp6_submission['target'] = y_pred[:,1]
# exp6_submission.to_csv('tuned_logreg_model.csv', index=False)

### See whether we have cluster or not in data

In [14]:
# # Define RandomUnderSampler
# under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = 41)
# X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)

In [15]:
# tuned_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)
# evaluate_auc(tuned_lgbm, X_train_resampled, X_val, y_train_resampled, y_val)

In [16]:
# # Doing decomposition only to 68 features from earlier
# pca = PCA(n_components = 2)
# pca.fit(X_train_resampled[:, 0:67])
# decom_X_train = pca.transform(X_train_resampled[:, 0:67])
# decom_X_val = pca.transform(X_val[:,0:67])
# decom_X_train.shape , decom_X_val.shape

In [17]:
# # Plot scatter on reduced dimensionality
# decom_data = pd.concat( [pd.DataFrame(decom_X) , y_train_resampled], axis = 1 )
# sns.pairplot(data= decom_data, hue = 'target')

In [18]:
# from sklearn.cluster import KMeans
# kmeans = KMeans(n_clusters = 2, random_state = 41)
# kmeans.fit(decom_X_train)
# train_cluster = kmeans.predict(decom_X_train)
# val_cluster = kmeans.predict(decom_X_val)

# X_train_resampled = pd.concat([ pd.DataFrame(X_train_resampled, columns = columns), pd.DataFrame(train_cluster, columns = ['cluster']) ], axis = 1)
# X_val = pd.concat([ pd.DataFrame(X_val, columns = columns), pd.DataFrame(val_cluster, columns = ['cluster'])], axis = 1)

# # Target encoder for day and month
# target_encoder = cat_encoder.TargetEncoder(smoothing = 0.1)
# target_encoder.fit(X_train_resampled[['cluster']], y_train_resampled)
# X_train_resampled['cluster_target'] = target_encoder.transform( (X_train_resampled['cluster']).astype(int) )
# X_val['cluster_target'] = target_encoder.transform((X_val['cluster']).astype(int))

# columns = X_train_resampled.columns

# # Standardize the values
# scaler = StandardScaler()
# X_train_resampled = scaler.fit_transform(X_train_resampled)
# X_val = scaler.transform(X_val)

In [19]:
# # Evaluate with additional cluster information
# tuned_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)
# evaluate_auc(tuned_lgbm, X_train_resampled, X_val, y_train_resampled, y_val)

In [20]:
# # Plot scatter on reduced dimensionality
# decom_data = pd.concat( [pd.DataFrame(decom_X_train, columns = ['pca_1', 'pca_2']) , y_train_resampled, pd.Series(train_cluster)], axis = 1 )
# sns.pairplot(data= decom_data, hue = 'target')

### Undersampling

In [21]:
# # Number of positive labels on training data
# train_target.value_counts()

# # The idea of undersampling is we will eliminate the majority class which is the negative labels
# # So it will have the same number of samples with positive label

In [22]:
# # Define RandomUnderSampler
# under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = 41)
# X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)

In [23]:
# How does the score change if we train only on the resampled training data?
# evaluate_auc(logit, X_train_resampled, X_val, y_train_resampled, y_val)
# Wow, it's about the same (0.7877)
# We will use the under-sampling strategy to speed up our experimentation while looking for the best model

### Experimenting with LGBM Model

In [24]:
# # Plain LGBM Classifier
# plain_lgbm = LGBMClassifier(random_state = 41)
# evaluate_auc(plain_lgbm, X_train_resampled, X_val, y_train_resampled, y_val)

# # With only plain LGBM we could improve auc validation from 0.7877 to 0.7926
# # After adding to ~ 200 features auc 0.7927

In [25]:
# # Lets try to find best hyperparameter for LGBM
# # These are three important hyperparameters for the structure of the model (Phase 1 hyperparameter tuning)

# param_grid = {'num_leaves' : [16],
#               'max_depth': [6],
#               'lambda_l1': [0.005, 0.01, 2]}

# scv = StratifiedKFold(n_splits = 3, random_state = 41)

# grid_searcher = GridSearchCV(estimator = plain_lgbm, scoring = "roc_auc" , param_grid = param_grid, cv = scv, verbose=1, n_jobs = -1 )
# grid_searcher.fit(X_train_resampled, y_train_resampled)

In [26]:
# grid_searcher.best_params_, grid_searcher.best_score_
# # 200 features
# # ({'max_depth': 3, 'n_estimators': 400, 'num_leaves': 5}, 0.7951495088651278)

# # After adding target clusters
# # ({'lambda_l1': 0.01, 'max_depth': 4, 'num_leaves': 7}, 0.7873061596306101)
# # ({'lambda_l1': 0.01, 'max_depth': 6, 'num_leaves': 10}, 0.7895831487743887)
# # ({'lambda_l1': 0.01, 'max_depth': 6, 'num_leaves': 16}, 0.7905603566960145)
# # ({'lambda_l1': 0.005, 'max_depth': 6, 'num_leaves': 16}, 0.7905883541727441)

In [27]:
# # Lets try to find best hyperparameter for LGBM
# # These are three important hyperparameters for convergence (Phase 2 hyperparameter tuning)

# tuned_lgbm = LGBMClassifier(num_leaves = 16, max_depth = 6, lambda_l1 = 0.005, random_state = 41)

# param_grid = {'learning_rate' : [0.11, 0.2, 0.3]}

# scv = StratifiedKFold(n_splits = 3, random_state = 41)

# grid_searcher = GridSearchCV(estimator = tuned_lgbm, scoring = "roc_auc" , param_grid = param_grid, cv = scv, verbose=1, n_jobs = -1 )
# grid_searcher.fit(X_train_resampled, y_train_resampled)

In [28]:
# grid_searcher.best_params_, grid_searcher.best_score_
# # ({'learning_rate': 0.1778279410038923}, 0.794386184358966)
# # ({'learning_rate': 0.1}, 0.7951577528090543)
# # Add Features 200
# # ({'learning_rate': 0.03162277660168379}, 0.7883761918789878)
# # ({'learning_rate': 0.1}, 0.7951495088651278)
# # ({'learning_rate': 0.11}, 0.7915211779314236)

In [29]:
# # LGBM with best hyperparameter
# tuned_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)

# print("Training on resampled training instances")
# # Training using resampled instances
# evaluate_auc(tuned_lgbm, X_train_resampled, X_val, y_train_resampled, y_val)
# print("Training on all training instances")
# # Training using all training instances
# evaluate_auc(tuned_lgbm, X_train, X_val, y_train, y_val)

# # It validates the grid search result
# # The validation score is quite the same ~ 0.796

In [30]:
# # Try generating several predictions, each from a different random under-sample

# best_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)

# def predict_val_random_sample(model, i, X_train, X_val, y_train, y_val):
#     # Define RandomUnderSampler
#     print("Random State: {}".format(i))
#     under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = i)
#     X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)
#     # Evaluate AUC on validation dataset
#     evaluate_auc(model, X_train_resampled, X_val, y_train_resampled, y_val)
#     # Training model and predict
#     model.fit(X_train_resampled, y_train_resampled)
#     # Predict
#     y_pred = model.predict_proba(X_val)
#     return y_pred[:,1]

# # Initiate list for result several LGBM prediction
# d1 = pd.DataFrame()
# d2 = pd.DataFrame()
# d3 = pd.DataFrame()
# d4 = pd.DataFrame()
# dfs = [d1, d2, d3, d4]

# # Initiate several random state
# rands = [1, 30, 41, 70]

# for i in range(4):
#     dfs[i] = predict_val_random_sample(best_lgbm, rands[i], X_train, X_val, y_train, y_val)

In [31]:
# # See difference score on validation score by several model which are built on different dataset
# for i in dfs:    
#     score = roc_auc_score(y_val,i)
#     print("AUC Score on Validation: {}".format(score))

In [32]:
# # Ensemble the prediction with rank-based weighted averaging

# # Rank first all the predictions
# rank_1 = scipy.stats.rankdata(dfs[0], method = 'dense')
# rank_2 = scipy.stats.rankdata(dfs[1], method = 'dense')
# rank_3 = scipy.stats.rankdata(dfs[2], method = 'dense')
# rank_4 = scipy.stats.rankdata(dfs[3], method = 'dense')

# # Average the rank
# avg_rank = (0.2 * rank_1 + 0.2 * rank_2 + 0.3 * rank_3 + 0.3 * rank_4)

# # Scale the average rank to 0-1
# final_result = (avg_rank - avg_rank.min()) / (avg_rank.max() - avg_rank.min())

# score = roc_auc_score(y_val,final_result)
# print("AUC Score on Validation: {}".format(score))
# The result is better than all 4 individual LGBM models: the score on validation is 0.79699

In [33]:
# rank_1[1], rank_2[1], rank_3[1], rank_4[1], avg_rank[1], final_result[1]

### Training Ensemble LGBM on all training dataset

In [34]:
# # LGBM with best hyperparameter
# tuned_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)

# # Training on all dataset and predict test
# def predict_test_random_sample(model, i, X_train, X_test, y_train):
#     # Define RandomUnderSampler
#     print("Random State: {}".format(i))
#     under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = i)
#     X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)
#     print("Start Training")
#     # Training model and predict
#     model.fit(X_train_resampled, y_train_resampled)
#     # Predict
#     print("Start Predicting")
#     y_pred = model.predict_proba(X_test)
#     return y_pred[:,1]

# # Initiate list for result several LGBM prediction
# d1 = pd.DataFrame()
# d2 = pd.DataFrame()
# d3 = pd.DataFrame()
# d4 = pd.DataFrame()
# dfs = [d1, d2, d3, d4]

# # Initiate several random state
# rands = [1, 30, 41, 70]

# # Training on all training dataset
# for i in range(4):
#     dfs[i] = predict_test_random_sample(tuned_lgbm, rands[i], train, test, train_target)

# # Ensemble the prediction with rank-based weighted averaging
# # Rank first all the predictions
# rank_1 = scipy.stats.rankdata(dfs[0], method = 'dense')
# rank_2 = scipy.stats.rankdata(dfs[1], method = 'dense')
# rank_3 = scipy.stats.rankdata(dfs[2], method = 'dense')
# rank_4 = scipy.stats.rankdata(dfs[3], method = 'dense')

# # Average the rank
# avg_rank = (0.2 * rank_1 + 0.2 * rank_2 + 0.3 * rank_3 + 0.3 * rank_4)

# # Scale the average rank to 0-1
# final_result = (avg_rank - avg_rank.min()) / (avg_rank.max() - avg_rank.min())

In [35]:
# # Make file for submission
# exp8_submission = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
# exp8_submission['target'] = final_result
# exp8_submission.to_csv('ensemble_lgbm_1.csv', index=False)

### Experimenting with Logistic Regression

In [36]:
# # See difference Logistic Regression performance on sampled and all training dataset

# plain_logreg = LogisticRegression(random_state = 41)
# print("Sampled training")
# evaluate_auc(plain_logreg, X_train_resampled, X_val, y_train_resampled, y_val)
# print("All training")
# evaluate_auc(plain_logreg, X_train, X_val, y_train, y_val)

# # The results are nearly the same, so we will use the sampled dataset for faster iteration on hyperparameter tuning
# # Plain LogReg AUC 0.7881

In [37]:
# # Lets try to find best hyperparameter for LogisticRegression
# # These are three important hyperparameter
# plain_logreg = LogisticRegression(random_state = 41)

# param_grid = {'C' : [50, 55, 60],
#               'solver': ["newton-cg"],
#               'max_iter': [50]}

# scv = StratifiedKFold(n_splits = 3, random_state = 41)

# grid_searcher = GridSearchCV(estimator = plain_logreg, scoring = "roc_auc" , param_grid = param_grid, cv = scv, verbose=1, n_jobs = -1 )
# grid_searcher.fit(X_train_resampled, y_train_resampled)

In [38]:
# grid_searcher.best_params_ , grid_searcher.best_score_
# # ({'C': 0.8, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7874683296881111)
# # ({'C': 0.9, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7875001099038554)
# # ({'C': 1.1, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7875542822254484)
# # ({'C': 1.5, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7876348764010777)
# # ({'C': 3, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7877894925255164)
# # ({'C': 50, 'max_iter': 50, 'solver': 'newton-cg'}, 0.7879384137628845)

In [39]:
# # See the performance on tuned LogReg
# tuned_logreg = LogisticRegression(C = 50, max_iter = 50, solver = 'newton-cg', random_state = 41)

# print("Sampled training")
# evaluate_auc(tuned_logreg, X_train_resampled, X_val, y_train_resampled, y_val)
# print("All training")
# evaluate_auc(tuned_logreg, X_train, X_val, y_train, y_val)
# # Tuned logreg have AUC 0.7887

### Experimenting with CatBoost

In [40]:
# # See difference CatBoost performance on sampled and all training datasets
# plain_catb = CatBoostClassifier(iterations = 50, loss_function = "Logloss", eval_metric = "AUC", verbose = False)
# print("Sampled training")
# evaluate_auc(plain_catb, X_train_resampled, X_val, y_train_resampled, y_val)
# print("All training")
# evaluate_auc(plain_catb, X_train, X_val, y_train, y_val)

# # Plain CatBoost performance 0.795

In [41]:
# # Lets try to find best hyperparameter for CatBoost
# # These are three important hyperparameter
# plain_catb = CatBoostClassifier(iterations = 50, loss_function = "Logloss", eval_metric = "AUC", verbose = False)

# param_grid = {'depth' : [8, 13, 15],
#               'l2_leaf_reg': [1, 1.2, 1.5],
#               'learning_rate': [0.03, 0.1, 0.3]}

# scv = StratifiedKFold(n_splits = 3, random_state = 41)

# grid_searcher = GridSearchCV(estimator = plain_catb, scoring = "roc_auc" , param_grid = param_grid, cv = scv, verbose=1, n_jobs = -1 )
# grid_searcher.fit(X_train_resampled, y_train_resampled)

In [42]:
# grid_searcher.best_params_ , grid_searcher.best_score_
# # ({'depth': 8, 'l2_leaf_reg': 1, 'learning_rate': 0.1}, 0.7907046866963885)

In [43]:
# # See the performance on tuned CatBoost
# tuned_catb = CatBoostClassifier(iterations = 50, loss_function = "Logloss", eval_metric = "AUC", verbose = False)

# print("Sampled training")
# evaluate_auc(tuned_catb, X_train_resampled, X_val, y_train_resampled, y_val)
# print("All training")
# evaluate_auc(tuned_catb, X_train, X_val, y_train, y_val)

### Ensemble LGBM, Logistic Regression, and CatBoost on under-sampled training dataset

In [44]:
# # We will try to train several model:
# # 4 LGBM, 2 Logistic Regression, and 1 CatBoost
# # Ensemble all the results

# # Best LGBM Result on validation 0.796
# # Best Logistic Regression Result on validation 0.7887
# # Best CatBoost Result on validation 0.795

# # Validation phase

# tuned_lgbm = LGBMClassifier(learning_rate = 0.11, num_leaves = 16, max_depth = 6, lambda_l1 = 0.005,random_state = 41)
# tuned_logreg = LogisticRegression(C= 5, max_iter= 3000, solver= 'lbfgs', random_state = 41)
# tuned_catb = CatBoostClassifier(loss_function = "Logloss", eval_metric = "AUC", verbose = False)

# def predict_val_random_sample(model, i, X_train, X_val, y_train, y_val):
#     # Define RandomUnderSampler
#     print("Random State: {}".format(i))
#     under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = i)
#     X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)
#     # Evaluate AUC on validation dataset
#     evaluate_auc(model, X_train_resampled, X_val, y_train_resampled, y_val)
#     # Training model and predict
#     model.fit(X_train_resampled, y_train_resampled)
#     # Predict
#     y_pred = model.predict_proba(X_val)
#     return y_pred[:,1]

# def predict_val(model, X_train, X_val, y_train, y_val):
#     # Evaluate AUC on validation dataset
#     evaluate_auc(model, X_train, X_val, y_train, y_val)
#     # Training model and predict
#     model.fit(X_train, y_train)
#     # Predict
#     y_pred = model.predict_proba(X_val)
#     return y_pred[:,1]
    
# # Initiate list for result several predictions
# results = []

# # Initiate several random state
# rands = [1, 30, 41, 70]

# for i in range(len(rands)):
#     results.append(predict_val_random_sample(tuned_lgbm, rands[i], X_train, X_val, y_train, y_val))

# for i in range(len(rands)-2):
#     results.append(predict_val_random_sample(tuned_logreg, rands[i+2], X_train, X_val, y_train, y_val ))

# results.append(predict_val_random_sample(tuned_catb, rands[3], X_train, X_val, y_train, y_val ))

In [45]:
# # Train NN

# epochs = 30
# learning_rate = 0.001 # initial learning_rate
# decay_rate = 0.01
# momentum = 0.8

# batch_size = int(480000/100)

# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import History
# from tensorflow.keras.callbacks import LearningRateScheduler

# # Define model architecture

# mlp = keras.models.Sequential([
#     keras.layers.Dense(64, activation = 'relu', kernel_initializer = 'he_normal'),
#     keras.layers.Dropout(0.2),
#     keras.layers.Dense(64, activation = 'relu', kernel_initializer = 'he_normal'),
#     keras.layers.Dropout(0.2),
#     keras.layers.Dense(32, activation = 'relu', kernel_initializer = 'he_normal'),
#     keras.layers.Dropout(0.1),
#     keras.layers.Dense(1, activation = 'sigmoid')    
# ])

# def auc_metric(y_true, y_pred):
#     def fallback_auc(y_true, y_pred):
#         try:
#             return roc_auc_score(y_true, y_pred)
#         except:
#             return 0.5
#     return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)


# adam = Adam(learning_rate = learning_rate)

# # exponential decay for learning rate

# def exp_decay(epoch):
#     lrate = learning_rate * np.exp(-decay_rate*epoch)
#     return lrate

# # learning rate schedule callback
# loss_history = History()
# lr_rate = LearningRateScheduler(exp_decay)
# callbacks_list = [loss_history, lr_rate]

# # Compile mlp
# mlp.compile(optimizer = adam, loss= 'binary_crossentropy', metrics = [auc_metric])

# lr_model_history = mlp.fit(X_train, y_train.to_numpy(),
#         batch_size = batch_size,
#         epochs = epochs,
#         validation_data=(X_val, y_val),
#         callbacks = callbacks_list)

# results.append(mlp.predict(X_val))

In [46]:
# # Ensemble the prediction with rank-based weighted averaging

# # Rank all the predictions using dense method
# ranks = []
# for i in range(len(results)):
#     ranks.append( scipy.stats.rankdata( results[i] , method = 'dense') )

# # Weighted average ranking
# avg_rank = np.average( ranks, axis = 0, weights = [1, 1, 1, 1, 1, 1, 1, 1] )

# # Scale the average rank to 0-1
# final_result = (avg_rank - avg_rank.min()) / (avg_rank.max() - avg_rank.min())

# score = roc_auc_score(y_val,final_result)
# print("AUC Score on Validation: {}".format(score))
# # The result (0.79726) is better than any of the individual predictions
# # Ensemble 4 models: 2 LGBM, 1 LogReg, 1 CatBoost 0.79745; weights = [3, 3, 1, 3]
# # Ensemble 4 models: 2 LGBM, 1 LogReg, 1 CatBoost, and 1 plain CatBoost 0.79767 ; weights = [3, 3, 1, 3, 3]
# # Revert to first ensemble
# # After adding to ~ 200 features AUC Score on Validation: 0.7969246358526689 weights = [3, 3, 3, 3, 1, 1, 2]
# # AUC Score on Validation: 0.7971906169472329 ; weights = [1, 1, 1, 1, 1, 1, 1]
# # AUC Score on Validation: 0.797321531961379
# # AUC Score on Validation: 0.7973375349368386 ; weights = [2, 2, 2, 1 , 1, 1, 2]

In [47]:
# We will try to train several model:
# 4 LGBM, 2 Logistic Regression, 1 CatBoost, and 1 MLP (the MLP is trained in the next cell)
# Ensemble all the results

# Best LGBM Result on validation 0.796
# Best Logistic Regression Result on validation 0.7887
# Best CatBoost Result on validation 0.795

# Training all data phase
# Estimators used for the final fit. Each one is handed to
# predict_test_random_sample below, which fits it in place.
tuned_lgbm = LGBMClassifier(
    learning_rate = 0.11,
    num_leaves = 16,
    max_depth = 6,
    lambda_l1 = 0.005,
    random_state = 41,
)
tuned_logreg = LogisticRegression(
    C = 5,
    max_iter = 3000,
    solver = 'lbfgs',
    random_state = 41,
)
tuned_catb = CatBoostClassifier(
    loss_function = "Logloss",
    eval_metric = "AUC",
    verbose = False,
)

# Training on all dataset and predict test
def predict_test_random_sample(model, i, X_train, X_test, y_train):
    """Fit ``model`` on an under-sampled training set and score ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``; it is fitted
        in place, so any previous fit is overwritten.
    i : seed passed to ``RandomUnderSampler`` as ``random_state``.
    X_train, y_train : training features and labels to under-sample.
    X_test : rows to score with the fitted model.

    Returns
    -------
    Column 1 of ``model.predict_proba(X_test)``.
    """
    print("Random State: {}".format(i))

    # sampling_strategy = 1 requests a 1:1 class ratio after resampling
    # (per the imbalanced-learn documentation for a numeric strategy)
    balancer = RandomUnderSampler(sampling_strategy = 1, random_state = i)
    X_balanced, y_balanced = balancer.fit_resample(X_train, y_train)

    print("Start Training")
    model.fit(X_balanced, y_balanced)

    print("Start Predicting")
    positive_proba = model.predict_proba(X_test)[:, 1]
    return positive_proba

def predict_test(model, X_train, X_test, y_train):
    """Fit ``model`` on the given training data (no resampling) and score ``X_test``.

    ``model`` must expose ``fit`` and ``predict_proba`` and is fitted in place.
    Returns column 1 of ``model.predict_proba(X_test)``.
    """
    model.fit(X_train, y_train)
    return model.predict_proba(X_test)[:, 1]

# Test-set probability vectors, one entry appended per fitted model
results = []

# Seeds handed to the under-sampler (as random_state) for each fit
rands = [1, 30, 41, 70]

# LGBM: one fit per seed
for seed in rands:
    results.append( predict_test_random_sample(tuned_lgbm, seed, train, test, train_target) )

# Logistic Regression: the two middle seeds (30 and 41)
for seed in rands[1:3]:
    results.append( predict_test_random_sample(tuned_logreg, seed, train, test, train_target) )

# CatBoost: a single fit with seed 41
results.append( predict_test_random_sample(tuned_catb, rands[2], train, test, train_target) )



Random State: 1
Start Training
Start Predicting
Random State: 30
Start Training
Start Predicting
Random State: 41
Start Training
Start Predicting
Random State: 70
Start Training
Start Predicting
Random State: 30
Start Training
Start Predicting
Random State: 41
Start Training
Start Predicting
Random State: 41
Start Training
Start Predicting


In [48]:
# MLP training hyperparameters
epochs = 30
learning_rate = 0.001   # starting learning rate; exp_decay shrinks it each epoch
decay_rate = 0.01       # exponent coefficient used by exp_decay
momentum = 0.8          # NOTE(review): not referenced by the Adam setup in this cell

# 1/100 of 480000 rows per batch
# (480000 is presumably the size of the earlier training split; TODO confirm)
batch_size = 480000 // 100

# exponential decay for learning rate
def exp_decay(epoch):
    """Return the learning rate for ``epoch``: learning_rate * exp(-decay_rate * epoch).

    Reads the module-level ``learning_rate`` and ``decay_rate``.
    """
    return learning_rate * np.exp(-decay_rate * epoch)

def auc_metric(y_true, y_pred):
    """Keras-compatible metric that reports ROC AUC via scikit-learn.

    The computation is wrapped in ``tf.py_function`` so ``roc_auc_score``
    can be called on the batch tensors; the result is declared as
    ``tf.double``.

    Parameters
    ----------
    y_true : batch of ground-truth labels.
    y_pred : batch of predicted scores.

    Returns
    -------
    The value produced by ``tf.py_function``: the batch AUC, or 0.5 when
    ``roc_auc_score`` rejects the batch.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score raises ValueError when AUC is undefined for the
            # batch (e.g. only one class present); report chance level then.
            # Only ValueError is caught so that KeyboardInterrupt and
            # unrelated errors are no longer silently swallowed.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

# (units, dropout rate) for each hidden Dense + Dropout pair, in order
hidden_layers = [(64, 0.2), (64, 0.2), (32, 0.1)]

mlp = keras.models.Sequential()
for units, dropout_rate in hidden_layers:
    mlp.add(keras.layers.Dense(units, activation = 'relu', kernel_initializer = 'he_normal'))
    mlp.add(keras.layers.Dropout(dropout_rate))
# Output layer: a single sigmoid unit for the binary target
mlp.add(keras.layers.Dense(1, activation = 'sigmoid'))

# Callbacks: a History recorder plus the exp_decay learning-rate schedule
loss_history = History()
lr_rate = LearningRateScheduler(exp_decay)
callbacks_list = [loss_history, lr_rate]

# Optimizer starts at the initial learning rate
adam = Adam(learning_rate = learning_rate)

# Compile with binary cross-entropy loss and the sklearn-backed AUC metric
mlp.compile(optimizer = adam, loss= 'binary_crossentropy', metrics = [auc_metric])

# Train the MLP on the full training set (no validation data is passed here)
# NOTE(review): verbose = -1 is not one of the documented Keras verbosity
# values (0, 1, 2); kept as-is because the recorded output relies on it.
mlp.fit(train, train_target.to_numpy(),
        batch_size = batch_size,
        epochs = epochs,
        callbacks = callbacks_list,
        verbose = -1
       )

# The final Dense(1) layer makes mlp.predict return shape (n_rows, 1);
# flatten it so this entry is 1-D like the predict_proba(...)[:, 1]
# vectors already stored in results.
results.append( mlp.predict(test).ravel() )

Train on 600000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [49]:
# Ensemble the prediction with rank-based weighted averaging

# Turn every model's scores into dense ranks (tied scores share a rank)
ranks = [scipy.stats.rankdata(scores, method = 'dense') for scores in results]

# Equal-weight average of the rank vectors: one weight per entry in results
avg_rank = np.average(ranks, axis = 0, weights = [1, 1, 1, 1, 1, 1, 1, 1] )

# Min-max scale the averaged rank into the 0-1 range
lowest_rank = avg_rank.min()
highest_rank = avg_rank.max()
final_result = (avg_rank - lowest_rank) / (highest_rank - lowest_rank)


In [50]:
# Build the submission: load the sample file, overwrite its target column
# with the ensembled score, and write it out without the index.
# (assumes the sample file's rows are in the same order as `test`; TODO confirm)
exp9_submission = (
    pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
      .assign(target = final_result)
)
exp9_submission.to_csv('ensemble_lgbm_logreg_catboost.csv', index=False)

In [51]:
# # Best Model perform 0.7825
# # We will try to train several model:
# # 4 LGBM, 2 Logistic Regression, and 1 CatBoost
# # Ensemble all the results

# # Training all data phase

# tuned_lgbm = LGBMClassifier(n_estimators = 300, learning_rate = 0.1, num_leaves = 6, max_depth = 5,random_state = 41)
# tuned_logreg = LogisticRegression(random_state = 41)
# tuned_catb = CatBoostClassifier(loss_function = "Logloss", eval_metric = "AUC", verbose = False)

# # Training on all dataset and predict test
# def predict_test_random_sample(model, i, X_train, X_test, y_train):
#     # Define RandomUnderSampler
#     print("Random State: {}".format(i))
#     under_sampler = RandomUnderSampler(sampling_strategy = 1, random_state = i)
#     X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)
#     print("Start Training")
#     # Training model and predict
#     model.fit(X_train_resampled, y_train_resampled)
#     # Predict
#     print("Start Predicting")
#     y_pred = model.predict_proba(X_test)
#     return y_pred[:,1]

# def predict_test(model, X_train, X_test, y_train):
#     model.fit(X_train, y_train)
#     y_pred = model.predict_proba(X_test)
#     return y_pred[:,1]

# # Initiate list for result several predictions
# results = []

# # Initiate several random state
# rands = [1, 30, 41, 70]

# # Training on all training dataset - LGBM
# for i in range(len(rands)):
#     results.append( predict_test_random_sample(tuned_lgbm, rands[i], train, test, train_target) )

# for i in range(len(rands)-2):
#     results.append( predict_test_random_sample(tuned_logreg, rands[i+1], train, test, train_target) )

# results.append( predict_test_random_sample(tuned_catb, rands[2], train, test, train_target) )


# # Ensemble the prediction with rank-based weighted averaging

# # Rank all the predictions
# ranks = []

# for i in range(len(results)):
#     ranks.append( scipy.stats.rankdata(results[i], method = 'dense') )

# # Average the rank
# avg_rank = np.average(ranks, axis = 0, weights = [4, 4, 4, 4, 1, 1, 4] )

# # Scale the average rank to 0-1
# final_result = (avg_rank - avg_rank.min()) / (avg_rank.max() - avg_rank.min())


### CatBoost Model
CatBoost is a gradient boosting library that is widely regarded as state of the art for tabular data with categorical variables.

In [52]:
# # Define CatBoost model - without onehot encoding features
# catb = CatBoostClassifier(iterations = 100 , learning_rate = 0.3, loss_function= "Logloss", eval_metric = "AUC", verbose=False)
# catb.fit(train, train_target)

# # Predict
# y_pred = catb.predict_proba(test)

In [53]:
# # Make file for submission
# exp7_submission = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
# exp7_submission['target'] = y_pred[:,1]
# exp7_submission.to_csv('plain_catboost_without_onehot_model.csv', index=False)

### Dimensional Reduction / Under Sampling
I found that the current dataset (600k rows x 5769 features) takes a lot of memory.
In this section I try several ways of reducing the memory burden: dimensionality reduction (PCA) and under-sampling the data.

In [54]:
# # Number of positive labels on training data
# train_target.value_counts()

In [55]:
# # PCA for dimensional reduction

# # Lets build PCA that will retain 95% variance on the data
# pca = TruncatedSVD(n_components=100)
# pca.fit(train)

In [56]:
# #Plotting the Cumulative Summation of the Explained Variance
# plt.figure()
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('Number of Components')
# plt.ylabel('Variance (%)') #for each component
# plt.title('Data Explained Variance')
# plt.axhline(y=0.95, color = 'r', linestyle='--')
# plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize = 12)
# plt.show()

# # Based on the plot, after 70 components there is no significant increase in the share of total variance explained
# # I decide to only use 70 components

In [57]:
# # Lets build PCA that will retain 95% variance on the data
# pca = TruncatedSVD(n_components=70, random_state = 41)
# pca.fit(train_w_onehot)

# train_reduced = pca.transform(train_w_onehot)
# test_reduced = pca.transform(test_w_onehot)

In [58]:
# # Define CatBoost model - without onehot encoding features
# catb = CatBoostClassifier(iterations = 100 , learning_rate = 0.3, loss_function= "Logloss", eval_metric = "AUC", verbose=False)
# catb.fit(train, train_target)

# # Predict
# y_pred = catb.predict_proba(test)

In [59]:
# # Make file for submission
# exp8_submission = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
# exp8_submission['target'] = y_pred[:,1]
# exp8_submission.to_csv('plain_catboost_reduced_model.csv', index=False)