## This is my submission to the Kaggle competition "ICR - Identifying Age-Related Conditions". It scored in the top 3% out of 6000+ competitors, earning me my first silver medal on Kaggle.
#### This notebook was originally copied from Kaggle user "ANSHUMAN MISHRA" and then modified with differently tuned model hyperparameters, a different ensemble formula and, more importantly, feature engineering.
#### This competition shows that feature engineering can be much more crucial and effective than model fine-tuning. It's also one of those competitions where the features are not described, so we don't really know what they stand for.
#### A lot of the work was done in other notebooks; this is only the final, clean result.


In [1]:
# import optuna
# from optuna import Trial, visualization

import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, log_loss
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from ydata_profiling import ProfileReport 




In [2]:
# Read the competition's training table and test table from the Kaggle
# input directory (both paths are fixed by the competition dataset mount).
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/icr-identify-age-related-conditions/train.csv',
        '/kaggle/input/icr-identify-age-related-conditions/test.csv',
    ),
)

In [3]:
# Preview the raw training frame; pandas elides the middle rows and columns ("...") in the rendering.
train

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


### The train and test sets have more than 50 features, so it is useful to reduce that number.
### To do that, I tried removing highly correlated features as well as PCA.
### In the end, though, I used the `feature_importances_` attribute of different ensemble models (Random Forest, XGBoost, CatBoost) to remove some of the less relevant features (see the training of the models below).

In [4]:
# Columns kept as model inputs, grouped by leading letter (order is significant:
# later cells map feature-importance positions back onto this list).
# NOTE(review): 'BD ', 'CD ', 'CW ' and 'FD ' end with a space — presumably this
# mirrors the raw CSV headers; check train.columns before "cleaning" them up.
FEATURES = [
    'AB', 'AF', 'AM',
    'BC', 'BN', 'BD ', 'BQ', 'BR',
    'CB', 'CC', 'CD ', 'CH', 'CR', 'CS', 'CU', 'CW ',
    'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DY',
    'EB', 'EE', 'EH', 'EL', 'EP', 'EU',
    'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS',
    'GF', 'GH', 'GL',
]

# Name of the label column in the training table.
TARGET = 'Class'

In [5]:
# Keep only fully observed rows: a row is discarded as soon as any of its
# columns holds a missing value.
train = train.dropna(axis=0, how='any')

In [6]:
# Separate the cleaned training frame into model inputs (x) and labels (y).
x, y = train[FEATURES], train[TARGET]
x  # preview the selected feature columns

Unnamed: 0,AB,AF,AM,BC,BN,BD,BQ,BR,CB,CC,...,FC,FD,FE,FI,FL,FR,FS,GF,GH,GL
0,0.209377,3109.03329,22.394407,5.555634,22.5984,4126.58731,152.707705,823.928241,47.223358,0.563481,...,13.394640,10.265073,9028.291921,3.583450,7.298162,1.73855,0.094822,2003.810319,22.136229,0.120343
1,0.145282,978.76416,36.968889,1.229900,19.4205,5496.92824,14.754720,51.216883,30.284345,0.484710,...,17.175984,0.296850,6785.003474,10.358927,0.173229,0.49706,0.568932,27981.562750,29.135430,21.978000
2,0.470030,2635.10654,32.360553,1.229900,26.4825,5135.78024,219.320160,482.141594,32.563713,0.495852,...,224.207424,8.745201,8338.906181,11.626917,7.709560,0.97556,1.198821,13676.957810,28.022851,0.196941
3,0.252107,3819.65177,77.112203,1.229900,23.6577,4169.67738,11.050410,661.518640,15.201914,0.717882,...,59.301984,7.884336,10965.766040,14.852022,6.122162,0.49706,0.284466,2094.262452,39.948656,0.155829
4,0.380297,3733.04844,14.103738,102.151980,24.0108,5728.73412,149.717165,6074.859475,82.213495,0.536467,...,29.102640,4.274640,16198.049590,13.666727,8.153058,48.50134,0.121914,8524.370502,45.381316,0.096614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,0.175193,2607.26686,7.067354,1.229900,21.5391,6757.45461,33.122575,512.065997,29.852368,1.317070,...,81.087552,0.296850,3324.847012,9.256327,0.173229,1.89486,1.395238,10960.364830,38.380254,21.978000
612,0.149555,3130.05946,9.513984,2.804172,21.1860,4157.68439,27.287375,365.516874,41.368691,0.691257,...,29.708112,0.296850,17167.209610,9.879296,0.173229,1.26092,0.067730,8095.932828,24.640462,21.978000
613,0.435846,5462.03438,46.551007,3.777550,27.1887,5654.07556,344.644105,505.006814,61.910576,0.772304,...,69.343680,6.067614,18460.330020,10.910227,10.223150,1.24236,0.426699,3085.308063,29.648928,0.145340
614,0.427300,2459.10720,55.355778,1.229900,20.4798,5888.87769,103.988995,2083.880500,90.411867,0.708616,...,71.725584,0.296850,5088.922912,12.029366,0.173229,0.49706,0.067730,6474.652866,26.166072,21.978000


#### As you can see the columns went from 58 to 41

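#### Next we use RobustScaler rather than StandardScaler: it centers each feature on its median and scales by the interquartile range, so potential outliers have much less influence on the scaling. This is useful since we don't really understand the features and their values.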

In [7]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the labelled feature matrix. Note that it sees every
# labelled row: the fit happens before the train/validation split further down.
scaler = RobustScaler()
scaler.fit(x)

# Apply the same fitted transform to the test features so both frames share
# one scale. This overwrites the FEATURES columns of `test` in place.
test[FEATURES] = scaler.transform(test[FEATURES])

# The transformed values are wrapped back into a DataFrame. Reusing x.index
# (instead of letting pandas create a fresh 0..n-1 index) keeps x label-aligned
# with y, whose index still carries the gapped row labels left by dropna().
# Without it, x and y only agree by position and any label-based operation
# (concat, boolean masks built from y, ...) would silently misalign rows.
x = pd.DataFrame(scaler.transform(x), columns=FEATURES, index=x.index)
x

Unnamed: 0,AB,AF,AM,BC,BN,BD,BQ,BR,CB,CC,...,FC,FD,FE,FI,FL,FR,FS,GF,GH,GL
0,-0.433566,0.008599,0.090179,1.039696,0.166667,-0.465796,0.870047,0.407030,0.047140,-0.460326,...,-0.696769,1.818583,0.273549,-2.128176,0.875645,0.608834,-0.342007,-0.365319,-0.711174,-0.021700
1,-0.643357,-0.973618,0.671920,0.000000,-0.583333,0.271107,-0.430756,-1.056953,-0.258233,-0.833123,...,-0.575751,-0.296693,-0.135725,0.122225,-0.336647,-0.693074,0.698885,1.038892,-0.117424,0.979662
2,0.419580,-0.209918,0.487978,0.000000,1.083333,0.076899,1.498158,-0.240521,-0.217141,-0.780394,...,6.050089,1.496063,0.147775,0.543374,0.945643,-0.191287,2.081784,0.265666,-0.211806,-0.018190
3,-0.293706,0.336248,2.274244,0.000000,0.416667,-0.442624,-0.465685,0.099328,-0.530135,0.270409,...,0.772451,1.313386,0.627029,1.614557,0.675551,-0.693074,0.074349,-0.360429,0.799874,-0.020074
4,0.125874,0.296318,-0.240744,24.256757,0.500000,0.395761,0.841849,10.355469,0.677934,-0.588174,...,-0.194050,0.547402,1.581628,1.220874,1.021103,49.647381,-0.282528,-0.012854,1.260732,-0.022787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,-0.545455,-0.222754,-0.521602,0.000000,-0.083333,0.948957,-0.257559,-0.183826,-0.266020,3.106182,...,1.469677,-0.296693,-0.767010,-0.243992,-0.336647,0.772751,2.513011,0.118822,0.666824,0.979662
544,-0.629371,0.018294,-0.423945,0.378378,-0.166667,-0.449074,-0.312581,-0.461479,-0.058407,0.144401,...,-0.174672,-0.296693,1.758445,-0.037079,-0.336647,0.107960,-0.401487,-0.036013,-0.498737,0.979662
545,0.307692,1.093512,1.054391,0.612331,1.250000,0.355613,2.679878,-0.197200,0.311917,0.527972,...,1.093826,0.927874,1.994367,0.305333,1.373325,0.088497,0.386617,-0.306859,-0.073864,-0.020554
546,0.279720,-0.291067,1.405835,0.000000,-0.333333,0.481878,0.410663,2.794141,0.825732,0.226557,...,1.170056,-0.296693,-0.445165,0.677043,-0.336647,-0.693074,-0.401487,-0.123650,-0.369318,0.979662


In [12]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=98,
)

## XGBoost 

In [13]:
# Hyperparameters for the XGBoost classifier (depth-1 trees).
# NOTE(review): the training log below stops at iteration [99], so a patience
# of 196 rounds presumably never triggers early stopping — confirm against
# the number of boosting rounds actually configured.
xgb_params = dict(
    max_depth=1,
    learning_rate=0.5070401070414171,
    early_stopping_rounds=196,
    subsample=0.5500624184931647,
    colsample_bytree=0.05509292726952148,
    random_state=42,
)

# Train while tracking log loss on both the training and the validation part.
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_validation, y_validation)],
    verbose=200,
)

# Validation log loss, computed from the second predict_proba column (class 1).
log_loss(y_validation, xgb_clf.predict_proba(x_validation)[:, 1])

[0]	validation_0-logloss:0.54885	validation_1-logloss:0.53698
[99]	validation_0-logloss:0.07697	validation_1-logloss:0.23732


0.23128684689474696

In [14]:
# Put each XGBoost importance score (column 0) next to its feature name
# (column 'ha') and list the features from least to most important.
v = pd.DataFrame(xgb_clf.feature_importances_)
f = pd.DataFrame({'ha': x.columns})
dd = pd.concat((v, f), axis=1)
dd.sort_values(by=0)

Unnamed: 0,0,ha
29,0.0,EP
15,0.004238,CW
7,0.007985,BR
5,0.012595,BD
32,0.013515,FD
39,0.013835,GH
19,0.013993,DH
11,0.014148,CH
21,0.014191,DL
17,0.014262,DE


## LGBM

In [15]:
import lightgbm as lgb

# Hyperparameters for the LightGBM classifier (depth-1 trees, `is_unbalance` switched on).
# NOTE(review): no subsample_freq is set; per the LightGBM parameter docs bagging
# is only active when the bagging frequency is non-zero, so `subsample`
# presumably has no effect here — confirm before relying on it.
lgb_params = {
    'max_depth': 1,
    'learning_rate': 0.4637612835812256,
    'early_stopping_rounds': 112,
    'subsample': 0.1411919766283556,
    'colsample_bytree': 0.477020689612885,
    "is_unbalance": True,
}

# Train while tracking the metric on both the training and the validation part.
lgb_clf = lgb.LGBMClassifier(**lgb_params)
lgb_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_validation, y_validation)],
    # fit(verbose=...) was deprecated in LightGBM 3.3 and removed in 4.0, where
    # it raises a TypeError. The log_evaluation callback is the supported way
    # to print the evaluation results every 200 iterations on both versions.
    callbacks=[lgb.log_evaluation(period=200)],
)

# Validation log loss, computed from the second predict_proba column (class 1).
log_loss(y_validation, lgb_clf.predict_proba(x_validation)[:, 1])





0.2342892362984006

In [16]:
# Put each LightGBM importance score (column 0) next to its feature name
# (column 'ha') and list the features from least to most important.
v = pd.DataFrame(lgb_clf.feature_importances_)
f = pd.DataFrame({'ha': x.columns})
dd = pd.concat((v, f), axis=1)
dd.sort_values(by=0)

Unnamed: 0,0,ha
27,0,EH
13,0,CS
32,0,FD
33,0,FE
5,0,BD
17,0,DE
4,0,BN
7,1,BR
15,1,CW
37,1,FS


## CatBoost

In [17]:
import catboost as cb

# CatBoost configuration: many small-learning-rate iterations with early
# stopping, balanced class weights and leaf-wise ('Lossguide') tree growth.
cb_params = dict(
    iterations=10000,
    learning_rate=0.005,
    early_stopping_rounds=1000,
    auto_class_weights='Balanced',
    loss_function='MultiClass',
    eval_metric='MultiClass:use_weights=False',
    random_seed=42,
    use_best_model=True,
    l2_leaf_reg=1,
    max_ctr_complexity=15,
    max_depth=10,
    grow_policy='Lossguide',
    max_leaves=64,
    min_data_in_leaf=40,
)

# Train while tracking the metric on both the training and the validation part.
# In the log below, the "best" value follows the second eval set ("test1").
cb_clf = cb.CatBoostClassifier(**cb_params)
cb_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_validation, y_validation)],
    verbose=200,
)

# Validation log loss, computed from the second predict_proba column (class 1).
log_loss(y_validation, cb_clf.predict_proba(x_validation)[:, 1])

0:	learn: 0.6909868	test: 0.6909868	test1: 0.6911565	best: 0.6911565 (0)	total: 66.2ms	remaining: 11m 1s
200:	learn: 0.4147670	test: 0.4147670	test1: 0.4349457	best: 0.4349457 (200)	total: 1.72s	remaining: 1m 23s
400:	learn: 0.2834346	test: 0.2834346	test1: 0.3208772	best: 0.3208772 (400)	total: 3.39s	remaining: 1m 21s
600:	learn: 0.2055263	test: 0.2055263	test1: 0.2585510	best: 0.2585510 (600)	total: 5.06s	remaining: 1m 19s
800:	learn: 0.1534165	test: 0.1534165	test1: 0.2193222	best: 0.2193222 (800)	total: 6.74s	remaining: 1m 17s
1000:	learn: 0.1158275	test: 0.1158275	test1: 0.1918316	best: 0.1918316 (1000)	total: 8.48s	remaining: 1m 16s
1200:	learn: 0.0877097	test: 0.0877097	test1: 0.1730318	best: 0.1730318 (1200)	total: 10.3s	remaining: 1m 15s
1400:	learn: 0.0657034	test: 0.0657034	test1: 0.1583651	best: 0.1583651 (1400)	total: 12s	remaining: 1m 13s
1600:	learn: 0.0488769	test: 0.0488769	test1: 0.1466294	best: 0.1466294 (1600)	total: 14s	remaining: 1m 13s
1800:	learn: 0.0365482	test

0.11749293914661543

In [18]:
# Put each CatBoost importance score (column 0) next to its feature name
# (column 'ha') and list the features from least to most important.
v = pd.DataFrame(cb_clf.feature_importances_)
f = pd.DataFrame({'ha': x.columns})
dd = pd.concat((v, f), axis=1)
dd.sort_values(by=0)

Unnamed: 0,0,ha
5,0.416047,BD
37,0.519266,FS
15,0.547981,CW
4,0.644223,BN
33,0.734695,FE
7,0.774073,BR
31,0.817257,FC
2,0.82681,AM
18,0.859866,DF
13,0.927805,CS


### Here we take the predictions of CatBoost and XGBoost and combine them. I didn't include LightGBM because it didn't give good results on the public leaderboard (which is definitely not something you should rely on), and leaving it out was perhaps a mistake. One thing that I learned from another competition is:
### "always trust your cross validation and not the public leaderboard".

In [19]:
# Class-probability matrices of the three fitted models on the scaled test features.
# lgb_predictions is computed but deliberately left out of the blend below.
test_inputs = test[FEATURES]
xgb_predictions = xgb_clf.predict_proba(test_inputs)
lgb_predictions = lgb_clf.predict_proba(test_inputs)
cb_predictions = cb_clf.predict_proba(test_inputs)

# Weighted average of the two retained models: 70% CatBoost, 30% XGBoost.
predictions = cb_predictions * 0.7 + xgb_predictions * 0.3
predictions

array([[0.51826254, 0.48173747],
       [0.51826254, 0.48173747],
       [0.51826254, 0.48173747],
       [0.51826254, 0.48173747],
       [0.51826254, 0.48173747]])

In [20]:
# Start from the competition's submission template and fill in the blended
# probabilities, one column per class; rows are matched by position.
sample_submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
sample_submission['class_0'] = predictions[:, 0]
sample_submission['class_1'] = predictions[:, 1]

# Write the file Kaggle scores, then show what was written.
sample_submission.to_csv('submission.csv', index=False)
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.518263,0.481737
1,010ebe33f668,0.518263,0.481737
2,02fa521e1838,0.518263,0.481737
3,040e15f562a2,0.518263,0.481737
4,046e85c7cc7f,0.518263,0.481737
