In [1]:
import numpy as np
import pandas as pd

import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

In [2]:
def scale(x_train, x_test, scaler=None):
    """Fit a scaler on the training features and apply it to both sets.

    The scaler is fitted on ``x_train`` only, so no statistics from
    ``x_test`` are used for the transformation.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; used to fit the scaler.
    x_test : pandas.DataFrame
        Features to transform with the scaler fitted on ``x_train``. It must
        contain every column of ``x_train``; its columns are re-ordered to
        match ``x_train`` before transforming.
    scaler : object, optional
        Any object exposing ``fit(X)`` and ``transform(X)``. It is (re)fitted
        on ``x_train``. Defaults to a fresh ``StandardScaler()``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_test_scaled)``, each keeping the index of its
        input and the column labels/order of ``x_train``.

    Raises
    ------
    KeyError
        If ``x_test`` lacks a column that is present in ``x_train``.
    """
    if scaler is None:
        scaler = StandardScaler()

    # Training frames reloaded from CSV may not share the column order of the
    # hold-out frame; align explicitly so every column is scaled with its own
    # statistics instead of relying on positional agreement.
    x_test = x_test[x_train.columns]

    scaler.fit(x_train)

    x_train = pd.DataFrame(scaler.transform(x_train),
                           index=x_train.index,
                           columns=x_train.columns)

    x_test = pd.DataFrame(scaler.transform(x_test),
                          index=x_test.index,
                          columns=x_test.columns)
    return x_train, x_test

In [3]:
# Restore the persisted grid-search object and show the hyper-parameters it selected.
grid_search_rf = joblib.load(
    '../Project_data/results/rf_gridsearch.joblib'
)
grid_search_rf.best_params_

{'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__max_features': 12,
 'randomforestclassifier__n_estimators': 200}

In [4]:
# Grid-search keys are namespaced as '<pipeline step>__<parameter>'.
# Keep only the random-forest step's entries and strip exactly that prefix, so
# nested parameter names (which themselves contain '__') stay intact and
# parameters belonging to other pipeline steps are never passed to the forest.
rf_prefix = 'randomforestclassifier__'
parms = {key[len(rf_prefix):]: value
         for key, value in grid_search_rf.best_params_.items()
         if key.startswith(rf_prefix)}
parms

{'max_depth': 20, 'max_features': 12, 'n_estimators': 200}

In [5]:
# Build the classifier from the tuned hyper-parameters. A fixed seed makes every
# refit below (and therefore every saved prediction file) reproducible; it is
# only a default, so a `random_state` present in `parms` still takes precedence.
rf_cls = RandomForestClassifier(**{'random_state': 42, **parms})

## 1. 5-class classification:

### 1.1. Original set

In [6]:
# Pre-processed features and 5-grade labels, both indexed by building_id.
x = pd.read_csv(
    '../Project_data/processed_data/x_post_preproc.csv',
    index_col='building_id',
)
y = pd.read_csv(
    '../Project_data/processed_data/y_post_preproc.csv',
    index_col='building_id',
)

In [7]:
# 80/20 split of the features with a fixed seed; the labels are then selected
# by index so they stay aligned with their feature rows.
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train = y.loc[x_train.index]
y_test = y.loc[x_test.index]

In [8]:
# Scale both sets using statistics fitted on the training split.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [9]:
# Fit on the scaled training features; the single-column label frame is flattened to 1-D.
rf_cls.fit(X=x_train_norm, y=np.ravel(y_train.values))

RandomForestClassifier(max_depth=20, max_features=12, n_estimators=200)

In [12]:
# Evaluate the forest only once: for a single-output forest, predict() is the
# class with the highest averaged probability, so the hard labels are derived
# from predict_proba() instead of running inference a second time.
pred_proba = rf_cls.predict_proba(x_test_norm)
prediction = rf_cls.classes_.take(np.argmax(pred_proba, axis=1), axis=0)

In [13]:
# Column 0 holds the predicted label; the remaining columns hold the class probabilities.
stacked = np.column_stack((prediction, pred_proba))
preds = pd.DataFrame(stacked)
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.134774,0.396289,0.324884,0.101862,0.04219
1,Grade_2,0.014628,0.582772,0.315123,0.035472,0.052004
2,Grade_4,5.1e-05,0.000242,0.051585,0.787921,0.160202
3,Grade_3,0.00768,0.364599,0.37176,0.166813,0.089148
4,Grade_5,0.003049,0.014803,0.059563,0.178041,0.744544


In [14]:
# Persist the predictions obtained with the original (non-resampled) training set.
preds.to_csv(path_or_buf='../Project_data/results/rf_preds_orig.csv')

### 1.2 Resampled set:

#### 1.2.1 Over-sampling with SMOTENC

In [15]:
# Load the SMOTENC training set; the stored index is dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc.csv', index_col=0)
    .reset_index(drop=True)
)

  mask |= (ar1 == a)


In [16]:
# Re-fit the scaler on the SMOTENC training set; x_test is still the hold-out split made earlier.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [17]:
# Refit the same estimator on the SMOTENC training set (labels flattened to 1-D).
rf_cls.fit(X=x_train_norm, y=np.ravel(y_train.values))

RandomForestClassifier(max_depth=20, max_features=12, n_estimators=200)

In [18]:
# Evaluate the forest only once: for a single-output forest, predict() is the
# class with the highest averaged probability, so the hard labels are derived
# from predict_proba() instead of running inference a second time.
pred_proba = rf_cls.predict_proba(x_test_norm)
prediction = rf_cls.classes_.take(np.argmax(pred_proba, axis=1), axis=0)

In [19]:
# Column 0 holds the predicted label; the remaining columns hold the class probabilities.
stacked = np.column_stack((prediction, pred_proba))
preds = pd.DataFrame(stacked)
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.17799,0.405769,0.274928,0.101591,0.039722
1,Grade_2,0.028066,0.6066,0.261088,0.053062,0.051185
2,Grade_4,0.000179,0.001563,0.068742,0.747326,0.182191
3,Grade_3,0.009645,0.261916,0.484374,0.176699,0.067365
4,Grade_5,0.008231,0.019447,0.063114,0.214929,0.69428


In [20]:
# Persist the predictions of the model trained on the SMOTENC set.
preds.to_csv(path_or_buf='../Project_data/results/rf_preds_smotenc.csv')

### 1.2.2 Under-sampling: cleaning the over-sampled dataset

#### 1.2.2.1 Tomek

In [21]:
# Load the SMOTENC + Tomek training set; the stored index is dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_tmk.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_tmk.csv', index_col=0)
    .reset_index(drop=True)
)

In [22]:
# Re-fit the scaler on the Tomek-cleaned training set; x_test is still the hold-out split made earlier.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [23]:
# Refit the same estimator on the Tomek-cleaned training set (labels flattened to 1-D).
rf_cls.fit(X=x_train_norm, y=np.ravel(y_train.values))

RandomForestClassifier(max_depth=20, max_features=12, n_estimators=200)

In [24]:
# Evaluate the forest only once: for a single-output forest, predict() is the
# class with the highest averaged probability, so the hard labels are derived
# from predict_proba() instead of running inference a second time.
pred_proba = rf_cls.predict_proba(x_test_norm)
prediction = rf_cls.classes_.take(np.argmax(pred_proba, axis=1), axis=0)

In [25]:
# Column 0 holds the predicted label; the remaining columns hold the class probabilities.
stacked = np.column_stack((prediction, pred_proba))
preds = pd.DataFrame(stacked)
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.158595,0.423571,0.278708,0.103554,0.035572
1,Grade_2,0.030266,0.658823,0.220038,0.045214,0.045658
2,Grade_4,0.000168,0.001396,0.046254,0.794892,0.15729
3,Grade_3,0.015337,0.241567,0.474387,0.206801,0.061908
4,Grade_5,0.004102,0.017287,0.069656,0.213652,0.695303


In [26]:
# Persist the predictions of the model trained on the SMOTENC + Tomek set.
preds.to_csv(path_or_buf='../Project_data/results/rf_preds_smotenc_tmk.csv')

#### 1.2.2.2 ENN

In [27]:
# Load the SMOTENC + ENN training set; the stored index is dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_enn.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_enn.csv', index_col=0)
    .reset_index(drop=True)
)

In [28]:
# Re-fit the scaler on the ENN-cleaned training set; x_test is still the hold-out split made earlier.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [29]:
# Refit the same estimator on the ENN-cleaned training set (labels flattened to 1-D).
rf_cls.fit(X=x_train_norm, y=np.ravel(y_train.values))

RandomForestClassifier(max_depth=20, max_features=12, n_estimators=200)

In [30]:
# Evaluate the forest only once: for a single-output forest, predict() is the
# class with the highest averaged probability, so the hard labels are derived
# from predict_proba() instead of running inference a second time.
pred_proba = rf_cls.predict_proba(x_test_norm)
prediction = rf_cls.classes_.take(np.argmax(pred_proba, axis=1), axis=0)

In [31]:
# Column 0 holds the predicted label; the remaining columns hold the class probabilities.
stacked = np.column_stack((prediction, pred_proba))
preds = pd.DataFrame(stacked)
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.195507,0.466018,0.285477,0.044221,0.008777
1,Grade_2,0.032746,0.775158,0.164029,0.017068,0.011
2,Grade_4,0.000248,0.002646,0.063797,0.776477,0.156831
3,Grade_3,0.01178,0.287457,0.494197,0.160647,0.04592
4,Grade_5,0.003725,0.06063,0.103358,0.231486,0.6008


In [32]:
# Persist the predictions of the model trained on the SMOTENC + ENN set.
preds.to_csv(path_or_buf='../Project_data/results/rf_preds_smotenc_enn.csv')

## 2. 3-class classification:

In [33]:
# Same pre-processed features as before, paired with the 3-label target file.
x = pd.read_csv(
    '../Project_data/processed_data/x_post_preproc.csv',
    index_col='building_id',
)
y = pd.read_csv(
    '../Project_data/processed_data/y_post_preproc_3lab.csv',
    index_col='building_id',
)

In [34]:
# 80/20 split of the features with a fixed seed; the labels are then selected
# by index so they stay aligned with their feature rows.
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train = y.loc[x_train.index]
y_test = y.loc[x_test.index]

In [35]:
# Scale both sets using statistics fitted on the training split.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [36]:
# Refit the same estimator on the 3-label target (labels flattened to 1-D).
rf_cls.fit(X=x_train_norm, y=np.ravel(y_train.values))

RandomForestClassifier(max_depth=20, max_features=12, n_estimators=200)

In [37]:
# Evaluate the forest only once: for a single-output forest, predict() is the
# class with the highest averaged probability, so the hard labels are derived
# from predict_proba() instead of running inference a second time.
pred_proba = rf_cls.predict_proba(x_test_norm)
prediction = rf_cls.classes_.take(np.argmax(pred_proba, axis=1), axis=0)

In [38]:
# Column 0 holds the predicted label; the remaining columns hold the class probabilities.
stacked = np.column_stack((prediction, pred_proba))
preds = pd.DataFrame(stacked)
preds.head()

Unnamed: 0,0,1,2,3
0,G1,0.564684,0.389445,0.045871
1,G1,0.596692,0.338251,0.065057
2,G2,0.001285,0.855451,0.143264
3,G2,0.392052,0.517239,0.090708
4,G3,0.018338,0.230183,0.751479


In [39]:
# Persist the predictions of the 3-label model.
preds.to_csv(path_or_buf='../Project_data/results/rf_preds_3cls.csv')