## <HR>CatBoost - Hyperparameter Tuning<HR>

### Import Required Packages

In [104]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [105]:
# Directory prefix that the feature CSV files are read from.
DATA_PATH = "/kaggle/input/final-upload/"

In [106]:
# Read both feature tables, parsing the 'date' column as datetimes.
env_features = pd.read_csv(DATA_PATH + 'env_features.csv', parse_dates=['date'])
geo_features = pd.read_csv(
    DATA_PATH + 'geo_features_without_event_count.csv',
    parse_dates=['date'],
)

# Join the tables on their shared keys. Columns present in both inputs come
# out suffixed _x / _y; the _x coordinates are renamed back to plain
# 'longitude' / 'latitude' (the _y copies are left in place).
features = pd.merge(
    env_features,
    geo_features,
    on=['fire_id', 'date', 'fire_type', 'fire_type_name'],
).rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'})

In [107]:
# Columns removed from the model inputs: the date, the fire-type name
# columns, the record id, and the target column itself.
drop_features = [
    'date',
    'fire_type_name',
    'fire_type_name_en',
    'fire_id',
    'fire_type',
]

In [108]:
# Model inputs and target.
X = features.drop(columns=drop_features)
y = features.fire_type

# Unshuffled splits, so row order is preserved: the last 10% of rows become
# the test set, then the last 12.5% of the remainder become the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=2020
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, shuffle=False, random_state=2020
)

In [109]:
# Show the (rows, columns) shape of each split.
(X_train.shape, X_val.shape, X_test.shape)

((137710, 217), (19673, 217), (17488, 217))

In [110]:
def get_roc_auc(y_true, y_prob, n_classes=11):
    """Compute micro-averaged and per-class one-vs-rest ROC AUC scores.

    Parameters
    ----------
    y_true : array-like of int
        True class labels, numbered from 1 to ``n_classes``.
    y_prob : ndarray of shape (n_samples, n_classes)
        Predicted class probabilities; column ``i`` is scored against
        label ``i + 1``.
    n_classes : int, default 11
        Number of classes (columns of the one-hot encoding).

    Returns
    -------
    roc_auc : dict
        ``'roc_auc_micro'`` followed by one entry per 0-based class index.
        A class whose AUC is undefined in ``y_true`` (no samples of that
        class, or every sample belongs to it) is reported as ``0.0``; that
        value is a placeholder, not a measured score.
    fpr, tpr : dict
        ROC curve coordinates keyed by 0-based class index. Only classes
        with a defined AUC have an entry.

    Raises
    ------
    ValueError
        If any label is smaller than 1. Such a label would otherwise wrap
        around through negative indexing and silently mark the wrong class.
    """
    labels = np.asarray(y_true)
    n_samples = labels.size
    if n_samples and labels.min() < 1:
        raise ValueError("y_true labels must be numbered starting from 1")

    # One-hot encode the 1-based labels into an (n_samples, n_classes) matrix.
    one_hot_y_true = np.zeros((n_samples, n_classes))
    one_hot_y_true[np.arange(n_samples), labels - 1] = 1

    fpr, tpr = {}, {}
    roc_auc = {'roc_auc_micro': roc_auc_score(one_hot_y_true, y_prob, average='micro', multi_class="ovr")}
    for i in range(n_classes):
        class_column = one_hot_y_true[:, i]
        n_positive = class_column.sum()
        # ROC AUC needs both positives and negatives; otherwise
        # roc_auc_score raises, so fall back to the 0.0 placeholder.
        if 0 < n_positive < n_samples:
            roc_auc[i] = roc_auc_score(class_column, y_prob[:, i], multi_class="ovr")
            fpr[i], tpr[i], _ = roc_curve(class_column, y_prob[:, i])
        else:
            roc_auc[i] = 0.0
    return roc_auc, fpr, tpr

### Part-1 Hyperparameter Tuning

In [111]:
# Part-1 model: 1000 boosting iterations with otherwise default settings,
# trained on GPU, logging every 200 iterations.
clf_1 = CatBoostClassifier(
    iterations=1000,
    loss_function='MultiClass',
    verbose=200,
    random_state=2020,
    task_type="GPU",
)

In [112]:
# Train on the training split, using the validation split as the eval set.
clf_1.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
)

Learning rate set to 0.172929
0:	learn: 1.9958096	test: 2.0524912	best: 2.0524912 (0)	total: 26ms	remaining: 25.9s
200:	learn: 0.9333750	test: 1.3651797	best: 1.3651797 (200)	total: 4.35s	remaining: 17.3s
400:	learn: 0.8495138	test: 1.3694879	best: 1.3624039 (358)	total: 8.35s	remaining: 12.5s
600:	learn: 0.7902835	test: 1.3832106	best: 1.3624039 (358)	total: 13.3s	remaining: 8.82s
800:	learn: 0.7457820	test: 1.3891897	best: 1.3624039 (358)	total: 18s	remaining: 4.48s
999:	learn: 0.7102187	test: 1.4016840	best: 1.3624039 (358)	total: 22.1s	remaining: 0us
bestTest = 1.362403917
bestIteration = 358
Shrink model to first 359 iterations.


<catboost.core.CatBoostClassifier at 0x7efe7326ea50>

In [113]:
n_classes = 11

# Score clf_1 on the train, validation and test splits (in that order).
roc_auc1, roc_auc2, roc_auc3 = (
    get_roc_auc(labels, clf_1.predict_proba(inputs))
    for inputs, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# One row per metric, one column per split. Only the first element of each
# get_roc_auc result (the score dict) is tabulated.
columns_name = ['roc_auc_micro'] + [
    'roc_auc_fire_type_{0}'.format(i+1) for i in range(n_classes)
]
pd.DataFrame(
    [roc_auc1[0].values(), roc_auc2[0].values(), roc_auc3[0].values()],
    index=['train-set', 'val-set', 'test-set'],
    columns=columns_name,
).T

Unnamed: 0,train-set,val-set,test-set
roc_auc_micro,0.963917,0.9058,0.898534
roc_auc_fire_type_1,0.991565,0.945089,0.95517
roc_auc_fire_type_2,0.965493,0.707311,0.759817
roc_auc_fire_type_3,0.931656,0.820181,0.835392
roc_auc_fire_type_4,0.970005,0.952607,0.935064
roc_auc_fire_type_5,0.942605,0.826018,0.785085
roc_auc_fire_type_6,0.94258,0.870692,0.87176
roc_auc_fire_type_7,0.999866,0.0,0.0
roc_auc_fire_type_8,0.968092,0.914678,0.870703
roc_auc_fire_type_9,0.926463,0.830026,0.801574


In [114]:
# (feature name, importance) pairs for clf_1, highest importance first.
res = sorted(
    zip(clf_1.feature_names_, clf_1.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

In [116]:
# First 15 (feature name, importance) pairs of the ranking stored in `res`.
res[:15]

[('num_cities_radius_10', 9.6663445287508),
 ('latitude', 6.377285563311372),
 ('district', 5.752218186757519),
 ('num_cities_radius_15', 5.343896224049576),
 ('longitude', 5.343653614152017),
 ('federal_subject', 4.846172675559235),
 ('weekofyear', 3.6913386668766273),
 ('latitude_y', 2.6451742856514975),
 ('num_cities_radius_5', 2.359616076705515),
 ('rhum_1w', 2.222251621821489),
 ('nature_forest_dst', 2.1992730378657863),
 ('city_dst', 2.0750689160210336),
 ('longitude_y', 2.039849727046975),
 ('field_dst', 1.7603709334785953),
 ('population', 1.74224500747662)]

### Part-2 Hyperparameter Tuning

In [117]:
# Part-2 feature set: the Part-1 drops plus 'day' and the names of the last
# five entries of the ranking currently stored in `res`.
drop_features = [
    'date', 'fire_type_name', 'fire_type_name_en',
    'fire_id', 'fire_type', 'day',
    *(name for name, _ in res[-5:]),
]

X = features.drop(columns=drop_features)
y = features.fire_type

# Same unshuffled split as before: last 10% of rows for test, then the last
# 12.5% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False, random_state=2020
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, shuffle=False, random_state=2020
)

In [118]:
# Set golden features for the CatBoost model: these columns get a larger
# border_count (1024) through per_float_feature_quantization. The column
# indices are looked up by name rather than hard-coded, so the spec stays
# correct if the set of dropped columns changes.
golden_feature_names = ['weekofyear', 'population']
feature_columns = list(X_train.columns)
golden_features = [
    '{0}:border_count=1024'.format(feature_columns.index(name))
    for name in golden_feature_names
]
# Display the resolved indices as a sanity check.
feature_columns.index('population'), feature_columns.index('weekofyear')

(187, 177)

In [119]:
# Part-2 model: more iterations, L2 leaf regularisation, best-iteration
# selection on the eval set, and custom quantization for the golden features.
clf_2 = CatBoostClassifier(
    iterations=6000,
    use_best_model=True,
    eval_metric='MultiClass',
    od_type='IncToDec',
    l2_leaf_reg=5,
    per_float_feature_quantization=golden_features,
    loss_function='MultiClass',
    verbose=500,
    random_state=2020,
    task_type="GPU",
)

In [120]:
# Train on the training split, using the validation split as the eval set.
clf_2.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
)

0:	learn: 2.3231667	test: 2.3324828	best: 2.3324828 (0)	total: 26.3ms	remaining: 2m 37s
500:	learn: 1.0327174	test: 1.4330398	best: 1.4318100 (275)	total: 11.1s	remaining: 2m 1s
1000:	learn: 0.9555412	test: 1.4057537	best: 1.4057537 (1000)	total: 21.9s	remaining: 1m 49s
1500:	learn: 0.9088863	test: 1.3927797	best: 1.3924470 (1486)	total: 32.5s	remaining: 1m 37s
2000:	learn: 0.8747613	test: 1.3941936	best: 1.3901883 (1761)	total: 44.6s	remaining: 1m 29s
2500:	learn: 0.8471674	test: 1.3913012	best: 1.3884134 (2316)	total: 55.2s	remaining: 1m 17s
3000:	learn: 0.8229938	test: 1.3868652	best: 1.3858325 (2820)	total: 1m 5s	remaining: 1m 5s
3500:	learn: 0.8019535	test: 1.3884618	best: 1.3858325 (2820)	total: 1m 16s	remaining: 54.5s
4000:	learn: 0.7821892	test: 1.3873641	best: 1.3858325 (2820)	total: 1m 26s	remaining: 43.4s
4500:	learn: 0.7641330	test: 1.3902104	best: 1.3858325 (2820)	total: 1m 38s	remaining: 32.8s
5000:	learn: 0.7478714	test: 1.3895331	best: 1.3858325 (2820)	total: 1m 49s	rem

<catboost.core.CatBoostClassifier at 0x7efe762a3e90>

In [121]:
n_classes = 11

# Score clf_2 on the train, validation and test splits (in that order).
roc_auc1, roc_auc2, roc_auc3 = (
    get_roc_auc(labels, clf_2.predict_proba(inputs))
    for inputs, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# One row per metric, one column per split. Only the first element of each
# get_roc_auc result (the score dict) is tabulated.
columns_name = ['roc_auc_micro'] + [
    'roc_auc_fire_type_{0}'.format(i+1) for i in range(n_classes)
]
pd.DataFrame(
    [roc_auc1[0].values(), roc_auc2[0].values(), roc_auc3[0].values()],
    index=['train-set', 'val-set', 'test-set'],
    columns=columns_name,
).T

Unnamed: 0,train-set,val-set,test-set
roc_auc_micro,0.966993,0.902515,0.915871
roc_auc_fire_type_1,0.993008,0.94379,0.951037
roc_auc_fire_type_2,0.970422,0.703352,0.758444
roc_auc_fire_type_3,0.938675,0.836075,0.810555
roc_auc_fire_type_4,0.973151,0.948142,0.941603
roc_auc_fire_type_5,0.950698,0.809844,0.770216
roc_auc_fire_type_6,0.947462,0.872507,0.870064
roc_auc_fire_type_7,0.999843,0.0,0.0
roc_auc_fire_type_8,0.970901,0.922775,0.860657
roc_auc_fire_type_9,0.93317,0.833135,0.80352


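#### NOTE: micro-averaged ROC AUC on the test set improved (0.899 → 0.916), although the validation-set micro AUC dipped slightly (0.906 → 0.903) and the per-class results are mixed.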

In [122]:
# (feature name, importance) pairs for the Part-2 model, highest importance
# first. This reads clf_2 (the model trained in this part) instead of the
# stale name `clf`, which is not defined anywhere in this section.
res = list(zip(clf_2.feature_names_, clf_2.feature_importances_))
res.sort(key=lambda x: x[1], reverse=True)

In [123]:
# First 20 (feature name, importance) pairs of the ranking stored in `res`.
res[:20]

[('num_cities_radius_15', 8.934596212171213),
 ('num_cities_radius_10', 5.52296684714963),
 ('federal_subject', 5.057482937923906),
 ('latitude', 5.048828128634322),
 ('district', 4.97617859090011),
 ('longitude', 4.1920675710475805),
 ('num_cities_radius_5', 3.196224645929514),
 ('weekofyear', 3.1494860877373494),
 ('latitude_y', 2.981973090240233),
 ('longitude_y', 2.3957993150057293),
 ('city_dst', 2.344293651671195),
 ('nature_forest_dst', 2.275533357062124),
 ('population', 2.0872948304467402),
 ('field_dst', 1.9270096486712753),
 ('rhum_2w', 1.615335906163404),
 ('forest_dst', 1.3982664627088222),
 ('rhum_1w', 1.3308413790306044),
 ('rhum_3w', 1.2345697498405572),
 ('num_forest_radius_1.0', 1.1538416467467187),
 ('is_spring', 1.0370118314616452)]