In [2]:
import time
import pandas as pd
import numpy as np
from sklearn import preprocessing
from time import strptime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [4]:
# Read the campaign data from the working directory and preview the first five rows.
DATA_FILE = 'bank-additional-full.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


To ensure consistency and reproducibility across all team members, we applied the same train-test split approach using a fixed random seed. This helps control for variability in results due to random sampling.

Additionally, we one-hot encoded the categorical features and converted both the training and test sets into numerical arrays to prepare them for modelling. This transformation ensures compatibility with machine learning algorithms, which typically require numeric input features.

In [5]:
from sklearn.model_selection import train_test_split

# Encode the target as 0/1. The .loc assignments only touch rows that still hold
# the string labels, so re-running this cell leaves an already-encoded column alone.
df.loc[(df.y == 'yes'), 'y'] = 1
df.loc[(df.y == 'no'), 'y'] = 0
df['y'] = df['y'].astype(int)

# Features are every column except the last one (the target `y`).
# NOTE(review): this keeps the 'Unnamed: 0' column shown in df.head() as a feature;
# it looks like a row index written out with the CSV -- confirm that is intended.
bank_data = df.iloc[:, :-1]
bank_y = df['y'].astype(int)

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    bank_data, bank_y, test_size=0.2, random_state=7, stratify=bank_y
)

# Keep the un-encoded test rows; .copy() makes this an independent frame, not an alias.
X_test_copy = X_test.copy()

# One-hot encode the categorical columns. The test frame is re-indexed to the
# training columns so both matrices share the same features in the same order:
# a category absent from the test split becomes an all-zero column, and a category
# that never occurs in the training split is dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

feature_list = list(X_train.columns)

# An explicit float dtype guarantees a numeric array regardless of how pandas
# stores the dummy columns (bool in recent versions).
X_train = X_train.to_numpy(dtype=float)
X_test = X_test.to_numpy(dtype=float)

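First, a simple baseline model is fitted to establish a reference point for accuracy. The baseline is a random forest regressor with 1,000 trees trained on the 0/1 target, so its continuous predictions are treated as scores for the positive class. This benchmark helps gauge performance and sets a target for improvement with more advanced models.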

In [6]:
# Baseline forest: 1000 trees, every other hyperparameter left at its default,
# with a fixed seed so repeated fits build the same trees.
rf_b = RandomForestRegressor(n_estimators=1000, random_state=42)

# Time the fit with perf_counter: it is monotonic, so the measured interval cannot
# be distorted by system clock adjustments the way time.time() differences can.
pre_temp = time.perf_counter()
rf_b.fit(X_train, Y_train)
post_temp = time.perf_counter()

print("Time elapsed (real): ", post_temp - pre_temp)

Time elapsed (real):  163.2210657596588


In [8]:
from sklearn import metrics
from sklearn.metrics import classification_report
import numpy as np

def evaluate(rf, X_test, Y_test, threshold=0.5):
    """Print ranking and classification metrics for a fitted model and return its accuracy.

    The values returned by ``rf.predict(X_test)`` are used as scores: directly for
    the ROC AUC, and compared against ``threshold`` to obtain 0/1 labels for the
    classification report and the accuracy.

    Parameters
    ----------
    rf : fitted estimator exposing ``predict``.
    X_test : feature matrix to score.
    Y_test : true binary labels (0/1) for ``X_test``.
    threshold : float, default 0.5
        Scores strictly greater than this value are labelled 1, all others 0.

    Returns
    -------
    float
        Accuracy of the thresholded predictions, as a fraction.
    """
    predictions = rf.predict(X_test)

    # Ranking quality: AUC, and the Gini coefficient derived from it (2 * AUC - 1)
    auc = metrics.roc_auc_score(Y_test, predictions)

    print('Model Performance:')
    print(f'Area under ROC: {auc*100:.2f}%')
    print(f'Gini Coefficient / Somers D: {(2*auc-1)*100:.2f}%')

    # Turn the scores into hard 0/1 labels at the requested cut-off
    predictions_discrete = np.where(predictions > threshold, 1, 0)
    # ':g' prints the default cut-off as "50", keeping the original message unchanged
    print(f'Classification Report (using {threshold*100:g}% probability cut-off):')
    print(classification_report(Y_test, predictions_discrete))

    accuracy = metrics.accuracy_score(Y_test, predictions_discrete)
    print(f'Accuracy: {accuracy*100:.2f}%')

    return accuracy

# Score the baseline forest on the held-out split and keep the returned accuracy
accuracy = evaluate(rf=rf_b, X_test=X_test, Y_test=Y_test)


Model Performance:
Area under ROC: 94.57%
Gini Coefficient / Somers D: 89.14%
Classification Report (using 50% probability cut-off):
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7310
           1       0.64      0.53      0.58       928

    accuracy                           0.91      8238
   macro avg       0.79      0.75      0.77      8238
weighted avg       0.91      0.91      0.91      8238

Accuracy: 91.37%


Scikit-learn's Random Forest model exposes 18 constructor parameters (listed below), most of which are hyperparameters that can be fine-tuned to optimize performance; a few, such as `n_jobs`, `verbose` and `random_state`, only control how the model is run. While brute-force experimentation could be used to test each parameter individually, the sheer number of possible combinations—especially with continuous inputs—quickly becomes impractical, often exceeding a million iterations even with reasonable step sizes.

However, not all hyperparameters have a significant impact on model performance. According to documentation and existing literature, **the number of trees in the forest (`n_estimators`)** and **the number of features considered for each split (`max_features`)** are particularly influential in determining model accuracy and efficiency.

Below is the full list of the baseline model's parameters and their current values:

In [10]:
# Collect every constructor parameter of the baseline forest with its current value,
# then pretty-print the mapping under a short heading.
baseline_params = rf_b.get_params()
print('Parameters in algorithm:\n')
pprint(baseline_params)

Parameters in algorithm:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
