### Submission 1

- Baseline
    - Missing data is filled with outlier values like 'missing', 99 etc
    - Label Encoding of all the categorical variables
    - LGBM
- Redefined the orders of ord_1, ord_2

In [2]:
import sys

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

sys.path.insert(0, "/home/jupyter/kaggle/cat_in_dat_2/kaggle_cat_in_dat_2/src")
import utility

DATA_DIR = '/home/jupyter/kaggle/cat_in_dat_2/kaggle_cat_in_dat_2/data/read_only'
SEED = 42


utility.set_seed(SEED)

#Read the data file
train, test, submission = utility.read_files(DATA_DIR, index_col='id')

combined_df = pd.concat([train.drop('target', axis=1), test])
print(f'Shape of the combined DF {combined_df.shape}')

train_index = train.shape[0]
train_Y = train.target

# Fill the missing values
nom_features = utility.get_fetaure_names(train, 'nom')
print(f'Number of nominal features {len(nom_features)}')
print(f'Nominal Features : {nom_features}')

binary_features = utility.get_fetaure_names(train, 'bin')
print(f'Number of binary features {len(binary_features)}')
print(f'Binary Features : {binary_features}')

ordinal_fetaures = utility.get_fetaure_names(train, 'ord')
print(f'Number of ordinal features {len(ordinal_fetaures)}')
print(f'Ordinal Features : {ordinal_fetaures}')

#Filling missing values
combined_df[['bin_3', 'bin_4']] = combined_df[['bin_3', 'bin_4']].fillna('missing_binary')
combined_df[['bin_0', 'bin_1', 'bin_2']] = combined_df[['bin_0', 'bin_1', 'bin_2']].fillna(-1)

# Filling nominal variables with missing values
combined_df[nom_features] = combined_df[nom_features].fillna('missing_nom')

# ord_0 has apparently value fo type integer. 
combined_df['ord_0'] = combined_df['ord_0'].fillna(999)

# Fill missing values for other ordinal values
combined_df[['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']] = combined_df[['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']].fillna('missing_ord')

combined_df['day'] = combined_df['day'].fillna(999) 
combined_df['month'] = combined_df['month'].fillna(999)

# List to maintain names
new_features = []
features_to_removed = []

# For  ord_1, ord_2 we can decide on the order based on names
cat_type_ord_1 = pd.CategoricalDtype(categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster', 'missing_ord'])
combined_df['ord_1_cat'] = combined_df['ord_1'].astype(cat_type_ord_1)

cat_type_ord_2 = pd.CategoricalDtype(categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot', 'missing_ord'])
combined_df['ord_2_cat'] = combined_df['ord_2'].astype(cat_type_ord_2)

new_features = new_features + ['ord_1_cat', 'ord_2_cat']
features_to_removed = features_to_removed + ['ord_1', 'ord_2']

# Convert rest of the ordinal features in categories 
for feature_name in ['ord_0', 'ord_3', 'ord_4', 'ord_5']:
    print(f'Converting {feature_name} in ordered categorical')
    combined_df[feature_name + '_cat'] = pd.Categorical(combined_df[feature_name], ordered=True)
    new_features = new_features + [feature_name + '_cat']
    features_to_removed = features_to_removed + [feature_name]

# Print the order of the ordinal features
for name in utility.get_fetaure_names(combined_df, '_cat'):
    print(f'Categories for feature {name} : {combined_df[name].cat.categories}')

print(f'List of new_features : {new_features}')
print(f'List of features_to_removed : {features_to_removed}')

feature_list = [name for name in combined_df.select_dtypes(['object', 'float64']) if name not in features_to_removed]
# Print rest of the variables into categorical
for feature_name in feature_list:
    print(f'Converting {feature_name} in categorical')
    combined_df[feature_name + '_cat'] = pd.Categorical(combined_df[feature_name])
    new_features = new_features + [feature_name + '_cat']
    features_to_removed = features_to_removed + [feature_name]

# Keep a copy of the original DF
combined_df_org = combined_df.copy(deep=True)

# remove the features not needed
combined_df = combined_df.drop(features_to_removed, axis=1)

for name in combined_df.columns:
    lb = LabelEncoder()
    print(name)
    combined_df[name] = lb.fit_transform(combined_df[name])

train_X = combined_df[:train_index]
test_X = combined_df[train_index:]

print(f"train_X : {train_X.shape}")
print(f"test_X : {test_X.shape}")
print(f"train_Y : {train_Y.shape}")

Loading Data...
Shape of train.csv : (600000, 24)
Shape of test.csv : (400000, 23)
Shape of sample_submission.csv : (400000, 2)
Data Loaded...
Shape of the combined DF (1000000, 23)
Number of nominal features 10
Nominal Features : ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
Number of binary features 5
Binary Features : ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
Number of ordinal features 6
Ordinal Features : ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
Converting ord_0 in ordered categorical
Converting ord_3 in ordered categorical
Converting ord_4 in ordered categorical
Converting ord_5 in ordered categorical
Categories for feature ord_1_cat : Index(['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster',
       'missing_ord'],
      dtype='object')
Categories for feature ord_2_cat : Index(['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot',
       'missing_ord'],
      dtype='object')
Categories for feature ord_

## LGBM + Strartified 5 folds + 10000 trees + 100 early stopping

In [11]:
lgb_params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'metric':'auc',
    'n_jobs':-1,
    'verbose':-1,
    'seed': SEED,
    'num_trees':10000,
    'early_stopping_rounds':100,
    }

result_dict = utility.make_prediction(train_X, train_Y, test_X, params=lgb_params, n_splits=5, seed=SEED)

fold 1 of 5



Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.753004	valid_1's auc: 0.747302
[100]	training's auc: 0.766619	valid_1's auc: 0.758266
[150]	training's auc: 0.77555	valid_1's auc: 0.764179
[200]	training's auc: 0.782059	valid_1's auc: 0.767375
[250]	training's auc: 0.786991	valid_1's auc: 0.768664
[300]	training's auc: 0.791405	valid_1's auc: 0.769468
[350]	training's auc: 0.795684	valid_1's auc: 0.770301
[400]	training's auc: 0.799561	valid_1's auc: 0.770516
[450]	training's auc: 0.803401	valid_1's auc: 0.770871
[500]	training's auc: 0.807047	valid_1's auc: 0.770798
Early stopping, best iteration is:
[442]	training's auc: 0.802829	valid_1's auc: 0.770956
CV OOF Score for fold 1 is 0.7709556880146894
fold 2 of 5



Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.753019	valid_1's auc: 0.747319
[100]	training's auc: 0.76679	valid_1's auc: 0.758293
[150]	training's auc: 0.775758	valid_1's auc: 0.764581
[200]	training's auc: 0.781985	valid_1's auc: 0.767374
[250]	training's auc: 0.787119	valid_1's auc: 0.769014
[300]	training's auc: 0.791637	valid_1's auc: 0.770131
[350]	training's auc: 0.795777	valid_1's auc: 0.770702
[400]	training's auc: 0.799699	valid_1's auc: 0.771008
[450]	training's auc: 0.803523	valid_1's auc: 0.77137
[500]	training's auc: 0.807085	valid_1's auc: 0.771392
[550]	training's auc: 0.810539	valid_1's auc: 0.771702
[600]	training's auc: 0.813892	valid_1's auc: 0.771739
[650]	training's auc: 0.817202	valid_1's auc: 0.77191
[700]	training's auc: 0.820373	valid_1's auc: 0.771708
[750]	training's auc: 0.823472	valid_1's auc: 0.7716
Early stopping, best iteration is:
[659]	training's auc: 0.817806	valid_1's auc: 0.771914
CV OOF Score for fold 2 is 0.


Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.753987	valid_1's auc: 0.742968
[100]	training's auc: 0.768188	valid_1's auc: 0.754752
[150]	training's auc: 0.776234	valid_1's auc: 0.759957
[200]	training's auc: 0.782656	valid_1's auc: 0.763391
[250]	training's auc: 0.78788	valid_1's auc: 0.76532
[300]	training's auc: 0.792245	valid_1's auc: 0.766316
[350]	training's auc: 0.796463	valid_1's auc: 0.767047
[400]	training's auc: 0.800371	valid_1's auc: 0.767631
[450]	training's auc: 0.804018	valid_1's auc: 0.76778
[500]	training's auc: 0.807506	valid_1's auc: 0.767805
[550]	training's auc: 0.810972	valid_1's auc: 0.767974
[600]	training's auc: 0.81438	valid_1's auc: 0.768046
[650]	training's auc: 0.817621	valid_1's auc: 0.767917
Early stopping, best iteration is:
[582]	training's auc: 0.813151	valid_1's auc: 0.768093
CV OOF Score for fold 3 is 0.7680927742203882
fold 4 of 5



Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.752762	valid_1's auc: 0.748203
[100]	training's auc: 0.766529	valid_1's auc: 0.759309
[150]	training's auc: 0.774808	valid_1's auc: 0.764436
[200]	training's auc: 0.781231	valid_1's auc: 0.767633
[250]	training's auc: 0.786324	valid_1's auc: 0.769275
[300]	training's auc: 0.791118	valid_1's auc: 0.770584
[350]	training's auc: 0.795053	valid_1's auc: 0.770992
[400]	training's auc: 0.79885	valid_1's auc: 0.771279
[450]	training's auc: 0.802546	valid_1's auc: 0.771615
[500]	training's auc: 0.806335	valid_1's auc: 0.772228
[550]	training's auc: 0.80994	valid_1's auc: 0.772234
[600]	training's auc: 0.813358	valid_1's auc: 0.772357
[650]	training's auc: 0.816685	valid_1's auc: 0.772367
[700]	training's auc: 0.819923	valid_1's auc: 0.772605
[750]	training's auc: 0.822866	valid_1's auc: 0.772564
[800]	training's auc: 0.825966	valid_1's auc: 0.772775
[850]	training's auc: 0.828964	valid_1's auc: 0.772853
[900]	


Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.752427	valid_1's auc: 0.750744
[100]	training's auc: 0.766478	valid_1's auc: 0.762664
[150]	training's auc: 0.774765	valid_1's auc: 0.768034
[200]	training's auc: 0.781304	valid_1's auc: 0.771163
[250]	training's auc: 0.786344	valid_1's auc: 0.772673
[300]	training's auc: 0.790717	valid_1's auc: 0.773483
[350]	training's auc: 0.795029	valid_1's auc: 0.774255
[400]	training's auc: 0.798722	valid_1's auc: 0.774279
[450]	training's auc: 0.802342	valid_1's auc: 0.774592
[500]	training's auc: 0.806206	valid_1's auc: 0.774822
[550]	training's auc: 0.809512	valid_1's auc: 0.775002
[600]	training's auc: 0.812769	valid_1's auc: 0.775014
Early stopping, best iteration is:
[540]	training's auc: 0.80895	valid_1's auc: 0.775052
CV OOF Score for fold 5 is 0.7750523786515673
Combined OOF score : 0.77177
Average of 5 folds OOF score 0.77179
std of 5 folds OOF score 0.00229


## LGBM + Strartified 5 folds + Defualt parameters

In [3]:
lgb_params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'metric':'auc',
    'n_jobs':-1,
    'verbose':-1,
    'seed': SEED
    }

result_dict = utility.make_prediction(train_X, train_Y, test_X, params=lgb_params, n_splits=5, seed=SEED)

fold 1 of 5
[50]	training's auc: 0.753004	valid_1's auc: 0.747302
[100]	training's auc: 0.766619	valid_1's auc: 0.758266
CV OOF Score for fold 1 is 0.7582659115440875
fold 2 of 5
[50]	training's auc: 0.753019	valid_1's auc: 0.747319
[100]	training's auc: 0.76679	valid_1's auc: 0.758293
CV OOF Score for fold 2 is 0.7582928189220268
fold 3 of 5
[50]	training's auc: 0.753987	valid_1's auc: 0.742968
[100]	training's auc: 0.768188	valid_1's auc: 0.754752
CV OOF Score for fold 3 is 0.7547516721185684
fold 4 of 5
[50]	training's auc: 0.752762	valid_1's auc: 0.748203
[100]	training's auc: 0.766529	valid_1's auc: 0.759309
CV OOF Score for fold 4 is 0.7593093024755377
fold 5 of 5
[50]	training's auc: 0.752427	valid_1's auc: 0.750744
[100]	training's auc: 0.766478	valid_1's auc: 0.762664
CV OOF Score for fold 5 is 0.7626639300312533
Combined OOF score : 0.75865
Average of 5 folds OOF score 0.75866
std of 5 folds OOF score 0.00253


In [None]:
submission.head()

In [None]:
submission.target = result_dict['prediction']
submission.to_csv('submission_1.csv', index=False)

In [None]:
submission.head()

In [None]:
# ! kaggle competitions submit -c cat-in-the-dat -f submission_1.csv -m "Baseline solutions"