In [1]:
import numpy as np
import pandas as pd

import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

In [2]:
def scale(x_train, x_test):
    scaler = StandardScaler()
    scaler.fit(x_train)

    x_train = pd.DataFrame(scaler.transform(x_train), 
                           index=x_train.index, 
                           columns=x_train.columns)

    x_test = pd.DataFrame(scaler.transform(x_test),
                          index=x_test.index, 
                          columns=x_test.columns)
    return x_train, x_test

In [12]:
parms = joblib.load('../Project_data/results/xgb_gridsearch.joblib')
parms.best_params_.update({'n_estimators': 500})
parms.best_params_

{'learning_rate': 0.15, 'max_depth': 10, 'n_estimators': 500}

In [13]:
xgb_cls = XGBClassifier(**parms.best_params_)

## 1. 5-classes classification:

### 1.1. Original set

In [14]:
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc.csv', index_col='building_id')

In [15]:
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train, y_test = y.loc[x_train.index], y.loc[x_test.index]

In [16]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [17]:
fit_params = {'early_stopping_rounds': 10,
              'eval_metric': 'merror',
              'eval_set': [(x_test_norm, y_test.values.ravel())]}

xgb_cls.fit(x_train_norm, y_train.values.ravel(), **fit_params)



[0]	validation_0-merror:0.43857
[1]	validation_0-merror:0.43445
[2]	validation_0-merror:0.43233
[3]	validation_0-merror:0.43151
[4]	validation_0-merror:0.42985
[5]	validation_0-merror:0.42957
[6]	validation_0-merror:0.42960
[7]	validation_0-merror:0.42902
[8]	validation_0-merror:0.42832
[9]	validation_0-merror:0.42804
[10]	validation_0-merror:0.42778
[11]	validation_0-merror:0.42783
[12]	validation_0-merror:0.42730
[13]	validation_0-merror:0.42699
[14]	validation_0-merror:0.42657
[15]	validation_0-merror:0.42596
[16]	validation_0-merror:0.42568
[17]	validation_0-merror:0.42558
[18]	validation_0-merror:0.42529
[19]	validation_0-merror:0.42466
[20]	validation_0-merror:0.42451
[21]	validation_0-merror:0.42413
[22]	validation_0-merror:0.42348
[23]	validation_0-merror:0.42301
[24]	validation_0-merror:0.42285
[25]	validation_0-merror:0.42259
[26]	validation_0-merror:0.42240
[27]	validation_0-merror:0.42200
[28]	validation_0-merror:0.42196
[29]	validation_0-merror:0.42155
[30]	validation_0-me

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
prediction = xgb_cls.predict(x_test_norm)
pred_proba = xgb_cls.predict_proba(x_test_norm)

In [19]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.137692,0.332427,0.339567,0.164475,0.025838
1,Grade_2,0.009471,0.616169,0.234104,0.055375,0.084881
2,Grade_4,0.000198,0.000331,0.048742,0.737517,0.213213
3,Grade_3,0.00254,0.231842,0.466413,0.226172,0.073034
4,Grade_5,0.002778,0.013935,0.060684,0.166124,0.75648


In [20]:
preds.to_csv('../Project_data/results/xgb_preds_orig.csv')

### 1.2 Resampled set:

#### 1.2.1 Over-sampling with SMOTENC

In [21]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc.csv', index_col=0).reset_index(drop=True)

  mask |= (ar1 == a)


In [22]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [23]:
fit_params = {'early_stopping_rounds': 10,
              'eval_metric': 'merror',
              'eval_set': [(x_test_norm, y_test.values.ravel())]}

xgb_cls.fit(x_train_norm, y_train.values.ravel(), **fit_params)



[0]	validation_0-merror:0.45130
[1]	validation_0-merror:0.44918
[2]	validation_0-merror:0.44764
[3]	validation_0-merror:0.44666
[4]	validation_0-merror:0.44510
[5]	validation_0-merror:0.44395
[6]	validation_0-merror:0.44353
[7]	validation_0-merror:0.44333
[8]	validation_0-merror:0.44210
[9]	validation_0-merror:0.44173
[10]	validation_0-merror:0.44126
[11]	validation_0-merror:0.44105
[12]	validation_0-merror:0.44055
[13]	validation_0-merror:0.44031
[14]	validation_0-merror:0.43993
[15]	validation_0-merror:0.43969
[16]	validation_0-merror:0.43926
[17]	validation_0-merror:0.43897
[18]	validation_0-merror:0.43853
[19]	validation_0-merror:0.43828
[20]	validation_0-merror:0.43804
[21]	validation_0-merror:0.43783
[22]	validation_0-merror:0.43747
[23]	validation_0-merror:0.43697
[24]	validation_0-merror:0.43683
[25]	validation_0-merror:0.43664
[26]	validation_0-merror:0.43592
[27]	validation_0-merror:0.43584
[28]	validation_0-merror:0.43571
[29]	validation_0-merror:0.43541
[30]	validation_0-me

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
prediction = xgb_cls.predict(x_test_norm)
pred_proba = xgb_cls.predict_proba(x_test_norm)

In [25]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.093764,0.397593,0.373671,0.118386,0.016586
1,Grade_2,0.009826,0.624836,0.252301,0.048011,0.065027
2,Grade_4,0.000228,0.000541,0.05833,0.76064,0.18026
3,Grade_3,0.001896,0.317677,0.448785,0.168983,0.062659
4,Grade_5,0.002568,0.015089,0.028857,0.136376,0.81711


In [26]:
preds.to_csv('../Project_data/results/xgb_preds_smotenc.csv')

### 1.2.2 Under-sampling: cleaning oversampled dataset

#### 1.2.2.1 Tomek

In [27]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)

In [28]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [29]:
fit_params = {'early_stopping_rounds': 10,
              'eval_metric': 'merror',
              'eval_set': [(x_test_norm, y_test.values.ravel())]}

xgb_cls.fit(x_train_norm, y_train.values.ravel(), **fit_params)



[0]	validation_0-merror:0.45185
[1]	validation_0-merror:0.44811
[2]	validation_0-merror:0.44779
[3]	validation_0-merror:0.44645
[4]	validation_0-merror:0.44502
[5]	validation_0-merror:0.44458
[6]	validation_0-merror:0.44415
[7]	validation_0-merror:0.44395
[8]	validation_0-merror:0.44291
[9]	validation_0-merror:0.44293
[10]	validation_0-merror:0.44220
[11]	validation_0-merror:0.44180
[12]	validation_0-merror:0.44122
[13]	validation_0-merror:0.44063
[14]	validation_0-merror:0.44023
[15]	validation_0-merror:0.44018
[16]	validation_0-merror:0.43992
[17]	validation_0-merror:0.43961
[18]	validation_0-merror:0.43914
[19]	validation_0-merror:0.43887
[20]	validation_0-merror:0.43838
[21]	validation_0-merror:0.43790
[22]	validation_0-merror:0.43758
[23]	validation_0-merror:0.43716
[24]	validation_0-merror:0.43699
[25]	validation_0-merror:0.43689
[26]	validation_0-merror:0.43689
[27]	validation_0-merror:0.43641
[28]	validation_0-merror:0.43627
[29]	validation_0-merror:0.43597
[30]	validation_0-me

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
prediction = xgb_cls.predict(x_test_norm)
pred_proba = xgb_cls.predict_proba(x_test_norm)

In [31]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.1146,0.500445,0.226638,0.141719,0.016598
1,Grade_2,0.00749,0.676385,0.244202,0.027006,0.044918
2,Grade_4,0.00022,0.000495,0.050688,0.790025,0.158572
3,Grade_2,0.001748,0.395648,0.347273,0.210898,0.044433
4,Grade_5,0.002579,0.012624,0.039986,0.145325,0.799486


In [32]:
preds.to_csv('../Project_data/results/xgb_preds_smotenc_tmk.csv')

#### 1.2.2.2 ENN

In [33]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)

In [34]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [35]:
fit_params = {'early_stopping_rounds': 10,
              'eval_metric': 'merror',
              'eval_set': [(x_test_norm, y_test.values.ravel())]}

xgb_cls.fit(x_train_norm, y_train.values.ravel(), **fit_params)



[0]	validation_0-merror:0.47747
[1]	validation_0-merror:0.47326
[2]	validation_0-merror:0.47242
[3]	validation_0-merror:0.47159
[4]	validation_0-merror:0.47005
[5]	validation_0-merror:0.46996
[6]	validation_0-merror:0.46952
[7]	validation_0-merror:0.46920
[8]	validation_0-merror:0.46857
[9]	validation_0-merror:0.46781
[10]	validation_0-merror:0.46751
[11]	validation_0-merror:0.46725
[12]	validation_0-merror:0.46688
[13]	validation_0-merror:0.46705
[14]	validation_0-merror:0.46653
[15]	validation_0-merror:0.46601
[16]	validation_0-merror:0.46575
[17]	validation_0-merror:0.46579
[18]	validation_0-merror:0.46566
[19]	validation_0-merror:0.46571
[20]	validation_0-merror:0.46534
[21]	validation_0-merror:0.46509
[22]	validation_0-merror:0.46480
[23]	validation_0-merror:0.46476
[24]	validation_0-merror:0.46461
[25]	validation_0-merror:0.46415
[26]	validation_0-merror:0.46417
[27]	validation_0-merror:0.46395
[28]	validation_0-merror:0.46385
[29]	validation_0-merror:0.46360
[30]	validation_0-me

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [36]:
prediction = xgb_cls.predict(x_test_norm)
pred_proba = xgb_cls.predict_proba(x_test_norm)

In [37]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.097386,0.610507,0.223499,0.061304,0.007304
1,Grade_2,0.010735,0.810007,0.166986,0.004813,0.007459
2,Grade_4,0.000234,0.00114,0.039925,0.764787,0.193914
3,Grade_2,0.005407,0.448133,0.402688,0.131944,0.011828
4,Grade_5,0.005354,0.049275,0.070716,0.274672,0.599983


In [38]:
preds.to_csv('../Project_data/results/xgb_preds_smotenc_enn.csv')

## 2. 3-classes classification:

In [39]:
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc_3lab.csv', index_col='building_id')

In [40]:
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train, y_test = y.loc[x_train.index], y.loc[x_test.index]

In [41]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [42]:
fit_params = {'early_stopping_rounds': 10,
              'eval_metric': 'merror',
              'eval_set': [(x_test_norm, y_test.values.ravel())]}

xgb_cls.fit(x_train_norm, y_train.values.ravel(), **fit_params)



[0]	validation_0-merror:0.30246
[1]	validation_0-merror:0.30088
[2]	validation_0-merror:0.29867
[3]	validation_0-merror:0.29808
[4]	validation_0-merror:0.29702
[5]	validation_0-merror:0.29662
[6]	validation_0-merror:0.29647
[7]	validation_0-merror:0.29612
[8]	validation_0-merror:0.29590
[9]	validation_0-merror:0.29537
[10]	validation_0-merror:0.29486
[11]	validation_0-merror:0.29433
[12]	validation_0-merror:0.29416
[13]	validation_0-merror:0.29368
[14]	validation_0-merror:0.29354
[15]	validation_0-merror:0.29360
[16]	validation_0-merror:0.29349
[17]	validation_0-merror:0.29320
[18]	validation_0-merror:0.29289
[19]	validation_0-merror:0.29255
[20]	validation_0-merror:0.29229
[21]	validation_0-merror:0.29225
[22]	validation_0-merror:0.29201
[23]	validation_0-merror:0.29177
[24]	validation_0-merror:0.29132
[25]	validation_0-merror:0.29104
[26]	validation_0-merror:0.29106
[27]	validation_0-merror:0.29101
[28]	validation_0-merror:0.29082
[29]	validation_0-merror:0.29067
[30]	validation_0-me

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [43]:
prediction = xgb_cls.predict(x_test_norm)
pred_proba = xgb_cls.predict_proba(x_test_norm)

In [44]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3
0,G2,0.486216,0.487257,0.026527
1,G1,0.620202,0.26828,0.111518
2,G2,0.000165,0.790298,0.209537
3,G2,0.230765,0.705437,0.063797
4,G3,0.018785,0.218215,0.763


In [45]:
preds.to_csv('../Project_data/results/xgb_preds_3cls.csv')