### Submission 1

- Baseline
    - Missing data is filled with outlier values like 'missing', 99 etc
    - Label Encoding of all the categorical variables
    - LGBM
- Redefined the orders of ord_1, ord_2

In [1]:
import sys

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

sys.path.insert(0, "/home/jupyter/kaggle/cat_in_dat_2_git/cat_in_dat_2/src")
import utility

DATA_DIR = '/home/jupyter/kaggle/cat_in_dat_2_git/cat_in_dat_2/data/read_only'
SEED = 42


utility.set_seed(SEED)

#Read the data file
train, test, submission = utility.read_files(DATA_DIR, index_col='id')

combined_df = pd.concat([train.drop('target', axis=1), test])
print(f'Shape of the combined DF {combined_df.shape}')

train_index = train.shape[0]
train_Y = train.target

# Fill the missing values
nom_features = utility.get_fetaure_names(train, 'nom')
print(f'Number of nominal features {len(nom_features)}')
print(f'Nominal Features : {nom_features}')

binary_features = utility.get_fetaure_names(train, 'bin')
print(f'Number of binary features {len(binary_features)}')
print(f'Binary Features : {binary_features}')

ordinal_fetaures = utility.get_fetaure_names(train, 'ord')
print(f'Number of ordinal features {len(ordinal_fetaures)}')
print(f'Ordinal Features : {ordinal_fetaures}')

#Filling missing values
combined_df[['bin_3', 'bin_4']] = combined_df[['bin_3', 'bin_4']].fillna('missing_binary')
combined_df[['bin_0', 'bin_1', 'bin_2']] = combined_df[['bin_0', 'bin_1', 'bin_2']].fillna(-1)

# Filling nominal variables with missing values
combined_df[nom_features] = combined_df[nom_features].fillna('missing_nom')

# ord_0 has apparently value fo type integer. 
combined_df['ord_0'] = combined_df['ord_0'].fillna(999)

# Fill missing values for other ordinal values
combined_df[['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']] = combined_df[['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']].fillna('missing_ord')

combined_df['day'] = combined_df['day'].fillna(999) 
combined_df['month'] = combined_df['month'].fillna(999)

# List to maintain names
new_features = []
features_to_removed = []

# For  ord_1, ord_2 we can decide on the order based on names
cat_type_ord_1 = pd.CategoricalDtype(categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster', 'missing_ord'])
combined_df['ord_1_cat'] = combined_df['ord_1'].astype(cat_type_ord_1)

cat_type_ord_2 = pd.CategoricalDtype(categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot', 'missing_ord'])
combined_df['ord_2_cat'] = combined_df['ord_2'].astype(cat_type_ord_2)

new_features = new_features + ['ord_1_cat', 'ord_2_cat']
features_to_removed = features_to_removed + ['ord_1', 'ord_2']

# Convert rest of the ordinal features in categories 
for feature_name in ['ord_0', 'ord_3', 'ord_4', 'ord_5']:
    print(f'Converting {feature_name} in ordered categorical')
    combined_df[feature_name + '_cat'] = pd.Categorical(combined_df[feature_name], ordered=True)
    new_features = new_features + [feature_name + '_cat']
    features_to_removed = features_to_removed + [feature_name]

# Print the order of the ordinal features
for name in utility.get_fetaure_names(combined_df, '_cat'):
    print(f'Categories for feature {name} : {combined_df[name].cat.categories}')

print(f'List of new_features : {new_features}')
print(f'List of features_to_removed : {features_to_removed}')

feature_list = [name for name in combined_df.select_dtypes(['object', 'float64']) if name not in features_to_removed]
# Print rest of the variables into categorical
for feature_name in feature_list:
    print(f'Converting {feature_name} in categorical')
    combined_df[feature_name + '_cat'] = pd.Categorical(combined_df[feature_name])
    new_features = new_features + [feature_name + '_cat']
    features_to_removed = features_to_removed + [feature_name]

# Keep a copy of the original DF
combined_df_org = combined_df.copy(deep=True)

# remove the features not needed
combined_df = combined_df.drop(features_to_removed, axis=1)

for name in combined_df.columns:
    lb = LabelEncoder()
    print(name)
    combined_df[name] = lb.fit_transform(combined_df[name])

train_X = combined_df[:train_index]
test_X = combined_df[train_index:]

print(f"train_X : {train_X.shape}")
print(f"test_X : {test_X.shape}")
print(f"train_Y : {train_Y.shape}")

Loading Data...
Shape of train.csv : (600000, 24)
Shape of test.csv : (400000, 23)
Shape of sample_submission.csv : (400000, 2)
Data Loaded...
Shape of the combined DF (1000000, 23)
Number of nominal features 10
Nominal Features : ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
Number of binary features 5
Binary Features : ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
Number of ordinal features 6
Ordinal Features : ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
Converting ord_0 in ordered categorical
Converting ord_3 in ordered categorical
Converting ord_4 in ordered categorical
Converting ord_5 in ordered categorical
Categories for feature ord_1_cat : Index(['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster',
       'missing_ord'],
      dtype='object')
Categories for feature ord_2_cat : Index(['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot',
       'missing_ord'],
      dtype='object')
Categories for feature ord_

## LGBM + Strartified 5 folds + 10000 trees + 100 early stopping

In [9]:
lgb_params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'metric':'auc',
    'n_jobs':-1,
    'verbose':-1,
    'seed': SEED,
    'num_trees':10000,
    'early_stopping_rounds':100,
    }
kf = StratifiedKFold(n_splits=2, random_state=SEED, shuffle=True)
result_dict = utility.make_prediction_classification(train_X, train_Y, test_X, params=lgb_params, seed=SEED, kf=kf)

fold 1 of 2



Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.755966	valid_1's auc: 0.746241
[100]	training's auc: 0.770553	valid_1's auc: 0.756125
[150]	training's auc: 0.781786	valid_1's auc: 0.762536
[200]	training's auc: 0.790165	valid_1's auc: 0.765574
[250]	training's auc: 0.79703	valid_1's auc: 0.766863
[300]	training's auc: 0.803277	valid_1's auc: 0.767579
[350]	training's auc: 0.808998	valid_1's auc: 0.767804
[400]	training's auc: 0.814548	valid_1's auc: 0.768051
[450]	training's auc: 0.820015	valid_1's auc: 0.768129
[500]	training's auc: 0.825155	valid_1's auc: 0.768075
Early stopping, best iteration is:
[430]	training's auc: 0.817969	valid_1's auc: 0.768216
CV OOF Score for fold 1 is 0.7682160504595583
fold 2 of 2



Found `num_trees` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.754506	valid_1's auc: 0.747014
[100]	training's auc: 0.76951	valid_1's auc: 0.757182
[150]	training's auc: 0.780845	valid_1's auc: 0.763563
[200]	training's auc: 0.788582	valid_1's auc: 0.765643
[250]	training's auc: 0.795971	valid_1's auc: 0.767566
[300]	training's auc: 0.802198	valid_1's auc: 0.768448
[350]	training's auc: 0.807759	valid_1's auc: 0.768549
[400]	training's auc: 0.813221	valid_1's auc: 0.768698
[450]	training's auc: 0.818284	valid_1's auc: 0.768884
[500]	training's auc: 0.823478	valid_1's auc: 0.768906
[550]	training's auc: 0.828068	valid_1's auc: 0.768885
[600]	training's auc: 0.832659	valid_1's auc: 0.768824
Early stopping, best iteration is:
[516]	training's auc: 0.824928	valid_1's auc: 0.768948
CV OOF Score for fold 2 is 0.7689479975701903
Combined OOF score : 0.76858
Average of 2 folds OOF score 0.76858
std of 2 folds OOF score 0.00037


## LGBM + Strartified 5 folds + Default parameters

In [12]:
lgb_params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'metric':'auc',
    'n_jobs':-1,
    'verbose':-1,
    'seed': SEED
    }
kf = StratifiedKFold(n_splits=2, random_state=SEED, shuffle=True)
result_dict = utility.make_prediction_classification(train_X, train_Y, test_X, params=lgb_params, seed=SEED, kf=kf)

fold 1 of 2
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.755966	valid_1's auc: 0.746241
[100]	training's auc: 0.770553	valid_1's auc: 0.756125
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.770553	valid_1's auc: 0.756125
CV OOF Score for fold 1 is 0.7561249769434389
fold 2 of 2
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.754506	valid_1's auc: 0.747014
[100]	training's auc: 0.76951	valid_1's auc: 0.757182
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.76951	valid_1's auc: 0.757182
CV OOF Score for fold 2 is 0.7571819662803558
Combined OOF score : 0.75665
Average of 2 folds OOF score 0.75665
std of 2 folds OOF score 0.00053


In [5]:
xgb_params = {
    #'verbosity':0,
    'validate_parameters' : True,
    'objective' : 'binary:logistic',
    'eval_metric' : 'auc',
    'seed' : SEED,
    }

In [6]:
kf = StratifiedKFold(n_splits=2, random_state=SEED, shuffle=True)
result_dict = utility.make_prediction_classification(train_X, train_Y, test_X, params=xgb_params, kf=kf,
                                                     n_estimators=10000, early_stopping_rounds=100, seed=SEED, model_type='xgb')

fold 1 of 2
[0]	train-auc:0.69237	valid_data-auc:0.686583
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't improved in 100 rounds.
[50]	train-auc:0.78963	valid_data-auc:0.756961
[100]	train-auc:0.817287	valid_data-auc:0.763715
[150]	train-auc:0.836187	valid_data-auc:0.763296
[200]	train-auc:0.852295	valid_data-auc:0.762366
Stopping. Best iteration:
[100]	train-auc:0.817287	valid_data-auc:0.763715

CV OOF Score for fold 1 is 0.7637146210251775
fold 2 of 2
[0]	train-auc:0.69198	valid_data-auc:0.688494
Multiple eval metrics have been passed: 'valid_data-auc' will be used for early stopping.

Will train until valid_data-auc hasn't improved in 100 rounds.
[50]	train-auc:0.789042	valid_data-auc:0.758757
[100]	train-auc:0.815862	valid_data-auc:0.763954
[150]	train-auc:0.835926	valid_data-auc:0.764582
[200]	train-auc:0.851866	valid_data-auc:0.763411
Stopping. Best iteration:
[125]	train-auc:0.826297	valid_data-auc

In [6]:
result_dict

{'yoof': array([0.04197939, 0.20946288, 0.23120883, ..., 0.13139835, 0.13190025,
        0.09599186]),
 'prediction': array([0.25661213, 0.24548976, 0.15422952, ..., 0.42731411, 0.23766207,
        0.17423461]),
 'oof_score': 0.76857,
 'cv_scores': [0.7679927999693894,
  0.7677700731649933,
  0.7657897048741575,
  0.7696576356383572,
  0.7717509583545654],
 'avg_cv_scores': 0.76859,
 'std_cv_scores': 0.002}

In [9]:
cat_params = {
    'loss_function' : 'Logloss',
    'eval_metric' : 'AUC',
    'n_estimators' : 1000,
    'random_seed' : SEED,
    'early_stopping_rounds' : 100,
    'metric_period' : 100
    }

In [10]:
kf = StratifiedKFold(n_splits=2, random_state=SEED, shuffle=True)
result_dict = utility.make_prediction_classification(train_X, train_Y, test_X, params=cat_params, kf=kf,
                                                     seed=SEED, model_type='cat')

fold 1 of 2
Learning rate set to 0.084168
0:	total: 38.2ms	remaining: 38.1s
100:	total: 3.59s	remaining: 31.9s
200:	total: 7.14s	remaining: 28.4s
300:	total: 10.7s	remaining: 24.9s
400:	total: 14.4s	remaining: 21.5s
500:	total: 18s	remaining: 17.9s
600:	total: 21.6s	remaining: 14.4s
700:	total: 25.3s	remaining: 10.8s
800:	total: 29s	remaining: 7.19s
900:	total: 32.6s	remaining: 3.58s
999:	total: 36.2s	remaining: 0us




CV OOF Score for fold 1 is 0.7688540564462246
fold 2 of 2
Learning rate set to 0.084169
0:	total: 41.7ms	remaining: 41.7s
100:	total: 3.75s	remaining: 33.4s
200:	total: 7.43s	remaining: 29.5s
300:	total: 11s	remaining: 25.7s
400:	total: 14.7s	remaining: 21.9s
500:	total: 18.5s	remaining: 18.4s
600:	total: 22.3s	remaining: 14.8s
700:	total: 26.3s	remaining: 11.2s
800:	total: 30s	remaining: 7.45s
900:	total: 33.7s	remaining: 3.7s
999:	total: 37.3s	remaining: 0us




CV OOF Score for fold 2 is 0.7696758818752295
Combined OOF score : 0.76926
Average of 2 folds OOF score 0.76926
std of 2 folds OOF score 0.00041


In [11]:
result_dict

{'yoof': array([-2.04827069, -2.18061238, -0.841312  , ..., -1.86141851,
        -1.50868357, -2.31627204]),
 'prediction': array([-0.99878393, -1.11217076, -1.8411364 , ..., -0.62456026,
        -1.25059632, -1.335044  ]),
 'oof_score': 0.76926,
 'cv_scores': [0.7688540564462246, 0.7696758818752295],
 'avg_cv_scores': 0.76926,
 'std_cv_scores': 0.00041}

In [None]:
submission.head()

In [None]:
submission.target = result_dict['prediction']
submission.to_csv('submission_1.csv', index=False)

In [None]:
submission.head()

In [None]:
# ! kaggle competitions submit -c cat-in-the-dat -f submission_1.csv -m "Baseline solutions"