Exercise: Random Forest

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

In [3]:
# Load the bank-loan dataset (relative path, resolved against the working
# directory) and preview its first five rows.
df_bank_load = pd.read_csv(filepath_or_buffer='bankloan.csv')
df_bank_load.head(n=5)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


In [4]:
# Keep only the target (`default`) and the predictors used for modelling.
# `.copy()` detaches the result from `df_bank_load`, so later edits to this
# frame never touch the raw data.
data_classification = df_bank_load.loc[
    :, ['default', 'age', 'employ', 'debtinc', 'creddebt', 'othdebt']
].copy()
data_classification.head(n=5)

Unnamed: 0,default,age,employ,debtinc,creddebt,othdebt
0,1,41,17,9.3,11.359392,5.008608
1,0,27,10,17.3,1.362202,4.000798
2,0,40,15,5.5,0.856075,2.168925
3,0,41,15,2.9,2.65872,0.82128
4,1,24,2,17.3,1.787436,3.056564


In [5]:
# Split the modelling frame into the feature matrix X and the target vector y.
y = data_classification.loc[:, 'default']
X = data_classification.loc[:, ['age', 'employ', 'debtinc', 'creddebt', 'othdebt']]

In [6]:
# 80/20 hold-out split. Stratifying on y keeps the class proportions the same
# in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020, stratify=y)

In [10]:
# Random forest of 20 shallow trees (depth <= 3), each split considering 4 of
# the features; seeded for reproducibility. `fit` returns the fitted estimator,
# so the chained call binds the trained model to RFCModel.
RFCModel = RandomForestClassifier(
    n_estimators=20, max_depth=3, max_features=4, random_state=2020
).fit(X_train, y_train)

In [11]:
# Predict on the held-out rows and print per-class precision / recall / F1.
y_pred = RFCModel.predict(X=X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.95      0.86       103
           1       0.69      0.30      0.42        37

    accuracy                           0.78       140
   macro avg       0.74      0.62      0.64       140
weighted avg       0.76      0.78      0.74       140



Pickle (Dump) the Model into a File

In [16]:
# Method 1: serialize the fitted model with pickle.
# The file is opened in binary-write mode and closed automatically by `with`.
with open('RFCMODEL_PICKLE.pkl', 'wb') as model_file:
    pickle.dump(RFCModel, model_file)

---------------------------------

Exercise: Apply several preprocessing methods

In [17]:
# Load the adult dataset (relative path, resolved against the working
# directory) and preview its first five rows.
df_adult = pd.read_csv(filepath_or_buffer='adult.csv')
df_adult.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [18]:
# The raw file marks missing values with a literal "?"; turn every such cell
# into a real NaN so pandas / scikit-learn treat it as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and a
# single vectorised `replace` covers all columns without a Python-level loop.
df_adult = df_adult.replace("?", np.nan)

In [21]:
# Columns that hold category labels and therefore need integer encoding.
categorical_columns = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
    'income',
]
# One independent encoder per column, keyed by column name, so each fitted
# mapping stays available after the transformation.
label_encoders = dict((name, LabelEncoder()) for name in categorical_columns)

In [22]:
# Fit each column's encoder on that column and overwrite the column with the
# resulting integer codes.
# NOTE(review): this mutates df_adult in place, so re-running the cell
# re-encodes already-encoded values.
for col in categorical_columns:
    column_encoder = label_encoders[col]
    df_adult[col] = column_encoder.fit_transform(df_adult[col])

In [23]:
# Target is `income`; every remaining column is used as a feature.
y = df_adult.loc[:, 'income']
X = df_adult.drop(columns=['income'])

In [24]:
# Standardize every feature column with a StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test rows influence the scaling statistics; consider
# fitting on the training split only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [25]:
# Display the scaled feature matrix as a quick sanity check.
X_scaled

array([[ 3.76961234,  2.92262342, -1.06799736, ..., 10.59350656,
        -0.03542945,  0.25137765],
       [ 3.18311167, -0.23790601, -0.53916866, ..., 10.59350656,
        -1.81720429,  0.25137765],
       [ 2.01011032,  2.92262342, -0.03521956, ..., 10.59350656,
        -0.03542945,  0.25137765],
       ...,
       [ 0.10398314, -0.23790601, -0.33543266, ..., -0.21665953,
        -0.03542945,  0.25137765],
       [ 1.42360965, -0.23790601, -0.35877741, ..., -0.21665953,
        -0.03542945,  0.25137765],
       [-1.21564337, -0.23790601,  0.11095988, ..., -0.21665953,
        -1.65522476,  0.25137765]])

In [26]:
# 70/30 hold-out split of the scaled features. Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=10,
    stratify=y,
)

Decision Tree

In [27]:
# Baseline: a single decision tree limited to depth 3, seeded for repeatability.
tree_model = DecisionTreeClassifier(random_state=10, max_depth=3)
tree_model.fit(X=X_train, y=y_train)

In [28]:
# Decision-tree predictions for the held-out rows.
y_pred_DTC = tree_model.predict(X=X_test)

In [29]:
# Per-class precision / recall / F1 of the decision tree on the test split.
print(classification_report(y_test, y_pred_DTC))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90      7417
           1       0.78      0.47      0.59      2352

    accuracy                           0.84      9769
   macro avg       0.82      0.72      0.75      9769
weighted avg       0.83      0.84      0.83      9769



In [30]:
# Show how much each feature contributes and the learned split rules.
# The tree was trained on the scaled matrix, so the thresholds printed by
# export_text are in scaled units rather than the original feature units.
dt_feature_names = list(X.columns)
dt_rules = export_text(tree_model, feature_names=dt_feature_names)
print("Feature Importances:", tree_model.feature_importances_)
print("Decision Tree Structure:")
print(dt_rules)

Feature Importances: [0.         0.         0.00081193 0.         0.19519866 0.
 0.         0.52309743 0.         0.         0.28089197 0.
 0.         0.        ]
Decision Tree Structure:
|--- relationship <= -0.59
|   |--- education.num <= 0.94
|   |   |--- capital.gain <= 0.54
|   |   |   |--- class: 0
|   |   |--- capital.gain >  0.54
|   |   |   |--- class: 1
|   |--- education.num >  0.94
|   |   |--- capital.gain <= 0.54
|   |   |   |--- class: 1
|   |   |--- capital.gain >  0.54
|   |   |   |--- class: 1
|--- relationship >  -0.59
|   |--- capital.gain <= 0.81
|   |   |--- relationship <= 1.90
|   |   |   |--- class: 0
|   |   |--- relationship >  1.90
|   |   |   |--- class: 0
|   |--- capital.gain >  0.81
|   |   |--- fnlwgt <= -1.55
|   |   |   |--- class: 0
|   |   |--- fnlwgt >  -1.55
|   |   |   |--- class: 1



AdaBoost

In [33]:
# AdaBoost over depth-3 decision trees: 200 boosting rounds with learning
# rate 0.1, seeded for repeatability.
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=200,
    learning_rate=0.1,
    random_state=10,
)
adaboost_model.fit(X_train, y_train)



In [37]:
# AdaBoost predictions for the held-out rows.
y_pred_AB = adaboost_model.predict(X=X_test)

In [38]:
# Per-class precision / recall / F1 of AdaBoost on the test split.
print(classification_report(y_test, y_pred_AB))

              precision    recall  f1-score   support

           0       0.89      0.94      0.92      7417
           1       0.78      0.64      0.71      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769



In [39]:
# Feature importances of the AdaBoost ensemble, one value per input column.
adaboost_importances = adaboost_model.feature_importances_
print("Feature Importances:", adaboost_importances)

Feature Importances: [0.13649839 0.05289973 0.11565144 0.03750863 0.07447146 0.05988547
 0.10700641 0.07250793 0.02290797 0.02374846 0.08614763 0.07708701
 0.10172005 0.03195942]


Gradient Boosting

In [40]:
# Gradient boosting with 200 depth-3 trees and learning rate 0.1, seeded for
# repeatability.
gbc_model = GradientBoostingClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=3, random_state=10
)
gbc_model.fit(X=X_train, y=y_train)

In [41]:
# Gradient-boosting predictions for the held-out rows.
y_pred_GBC = gbc_model.predict(X=X_test)

In [42]:
# Test-split metrics followed by the per-feature importances of the
# gradient-boosting model.
gbc_report = classification_report(y_test, y_pred_GBC)
print(gbc_report)
print("Feature Importances:", gbc_model.feature_importances_)



              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7417
           1       0.79      0.62      0.70      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.78      0.81      9769
weighted avg       0.86      0.87      0.86      9769

Feature Importances: [0.06478963 0.00816031 0.01027994 0.00089552 0.19696864 0.03060697
 0.02440901 0.31788885 0.00078505 0.00591646 0.22634285 0.06730587
 0.04317188 0.00247903]


XGBoost

In [43]:
# XGBoost classifier with 200 depth-3 trees and learning rate 0.1, evaluated
# with log-loss and seeded for repeatability.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, so supplying it only produces a warning.
xgb_model = XGBClassifier(
    max_depth=3,
    n_estimators=200,
    learning_rate=0.1,
    random_state=10,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [44]:
# XGBoost predictions for the held-out rows.
y_pred_XGB = xgb_model.predict(X_test)

In [45]:
# Test-split metrics followed by the per-feature importances of the XGBoost
# model.
xgb_report = classification_report(y_test, y_pred_XGB)
print(xgb_report)
print("Feature Importances:", xgb_model.feature_importances_)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7417
           1       0.79      0.61      0.69      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.78      0.80      9769
weighted avg       0.86      0.87      0.86      9769

Feature Importances: [0.06370445 0.01584629 0.00837297 0.00323457 0.16518496 0.13573861
 0.03523317 0.32974172 0.00762307 0.04651454 0.09142463 0.04732087
 0.04195013 0.00811004]


Hyperparameter Tuning for AdaBoost

In [50]:
# Search space for tuning AdaBoost: number of boosting rounds, learning rate,
# and (via the `estimator__` prefix) the depth of the base decision tree.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    estimator__max_depth=[2, 3, 4],
)

In [52]:
# Exhaustive 5-fold cross-validated search over `param_grid`, ranking the
# candidates by F1 score.
# The AdaBoost estimator is seeded (same seed as the other models in this
# notebook) so that repeated runs of the search give the same result.
grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=10,
    ),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on the whole training split.
best_adaboost_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best Parameters: {'estimator__max_depth': 2, 'learning_rate': 0.2, 'n_estimators': 300}


In [None]:
# Evaluate the tuned AdaBoost model on the held-out test split.
# classification_report compares true labels with predicted labels, so the
# model must first produce predictions; it cannot be given the estimator itself.
y_pred_best_AB = best_adaboost_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_best_AB))