# Random Forest

In [45]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report


In [46]:
# Load the data
train = pd.read_csv('data/train_rose.csv')
test = pd.read_csv('data/test.csv')

In [47]:
X_train = train.drop('CARAVAN', axis=1)
y_train = train['CARAVAN']
X_test = test.drop('CARAVAN', axis=1)
y_test = test['CARAVAN']

In [48]:
# Define a param grid for the randomForest

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }

In [49]:
cv = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid, n_iter=100, cv=3, verbose=10, random_state=42, n_jobs=-1)
cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3; 1/100] START max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=600
[CV 2/3; 1/100] START max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=600
[CV 3/3; 1/100] START max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=600
[CV 1/3; 2/100] START max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1000
[CV 2/3; 2/100] START max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1000
[CV 3/3; 2/100] START max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1000
[CV 1/3; 3/100] START max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600
[CV 2/3; 3/100] START max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600
[CV 3/3; 1/100] END max_dept

In [50]:
# Test the best model on the test set
best_model = cv.best_estimator_
y_pred = best_model.predict(X_test)

In [53]:
# Confusion matrix and classification report
# Tabulate the confusion matrix and write that columns are the predicted values and rows are the true values
conf = confusion_matrix(y_test, y_pred)
print('\n\nConfusion matrix\n')
print(tabulate(conf, headers=['Predicted 0', 'Predicted 1'], showindex=['True 0', 'True 1'], tablefmt='pretty'))

# Print the classification report
print('\n\nClassification report\n')
print(classification_report(y_test, y_pred))





Confusion matrix

+--------+-------------+-------------+
|        | Predicted 0 | Predicted 1 |
+--------+-------------+-------------+
| True 0 |    3618     |     144     |
| True 1 |     211     |     27      |
+--------+-------------+-------------+


Classification report

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3762
           1       0.16      0.11      0.13       238

    accuracy                           0.91      4000
   macro avg       0.55      0.54      0.54      4000
weighted avg       0.90      0.91      0.90      4000



In [52]:
#Print the best parameters
cv.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 50}

In [55]:
# Get the feature importances
feature_importances = best_model.feature_importances_
# Print the feature importances sorted by importance
print('\n\nFeature importances\n')
print(tabulate(sorted(zip(X_train.columns, feature_importances), key=lambda x: x[1], reverse=True), headers=['Feature', 'Importance'], tablefmt='pretty'))



Feature importances

+------------+------------------------+
|  Feature   |       Importance       |
+------------+------------------------+
|  PPERSAUT  |  0.10401701539903879   |
|   PBRAND   |  0.07627526004755099   |
|  PWAPART   |  0.038675459533560705  |
|  MKOOPKLA  |  0.02937235273746822   |
|  MINKM30   |  0.026702529097447456  |
|  MINKGEM   |  0.026065226282519796  |
|   MHHUUR   |  0.02571056266253889   |
|  MBERARBG  |  0.024719920517648777  |
|   MAUT1    |  0.024698223233668732  |
|   MRELGE   |  0.024418504247675173  |
|  MINK4575  |  0.02368482736759701   |
|   MGODPR   |  0.023463184616985076  |
|  MBERMIDD  |  0.02328950333020787   |
|  MINK3045  |  0.023029317627660716  |
|    MSKC    |  0.02267656801430249   |
|  MOPLMIDD  |  0.02228639246226223   |
|  MFGEKIND  |  0.021168190999645897  |
|  MZFONDS   |  0.02047551916319861   |
|  MBERARBO  |  0.020308002930281677  |
|  MOPLHOOG  |  0.020100557251212542  |
|  MBERHOOG  |  0.019759553175919063  |
|    MSKA    |  0

In [58]:
# Fit a decision tree using only the first 5 most important features
from sklearn.tree import DecisionTreeClassifier
# Get the 5 most important features
top5_features = [x[0] for x in sorted(zip(X_train.columns, feature_importances), key=lambda x: x[1], reverse=True)[:10]]
# Fit the model
tree = DecisionTreeClassifier()
tree.fit(X_train[top5_features], y_train)
# Get the predictions
y_pred_tree = tree.predict(X_test[top5_features])
# Confusion matrix and classification report
# Tabulate the confusion matrix and write that columns are the predicted values and rows are the true values
conf_tree = confusion_matrix(y_test, y_pred_tree)
print('\n\nConfusion matrix for the decision tree\n')
print(tabulate(conf_tree, headers=['Predicted 0', 'Predicted 1'], showindex=['True 0', 'True 1'], tablefmt='pretty'))
print('\n\nClassification report for the decision tree\n')
print(classification_report(y_test, y_pred_tree))



Confusion matrix for the decision tree

+--------+-------------+-------------+
|        | Predicted 0 | Predicted 1 |
+--------+-------------+-------------+
| True 0 |    3456     |     306     |
| True 1 |     197     |     41      |
+--------+-------------+-------------+


Classification report for the decision tree

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3762
           1       0.12      0.17      0.14       238

    accuracy                           0.87      4000
   macro avg       0.53      0.55      0.54      4000
weighted avg       0.90      0.87      0.89      4000

