In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [2]:
# Read the training data set from a CSV file in the working directory
train_csv_path = "bank1.csv"
bank1 = pd.read_csv(train_csv_path)

In [3]:
# Define roles for bank1.csv: binary target `y` and feature frame `X`
# Target is 1 where the 'y' column equals 'yes' and 0 for every other value
y = np.where(bank1['y'].eq('yes'), 1, 0)

# Features are all columns except those at positions 0 and 9
# NOTE(review): positional drop; presumably an index column and the target 'y' - confirm against the CSV header
X = bank1.drop(columns=bank1.columns[[0, 9]])

In [4]:
# Define the preprocessing pipeline
# Split the feature names by dtype: object/category columns versus everything else
# (numeric_features is not referenced in the cells visible here)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns

# One-hot encode the categorical columns into a dense array; with
# handle_unknown='ignore' a category not seen during fit is encoded as all zeros.
# remainder='passthrough' keeps every other column unchanged.
categorical_step = (
    'cat',
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[categorical_step],
    remainder='passthrough',
)

In [5]:
# Combine the preprocessing step and the random forest classifier into a single pipeline

# Hyperparameter grid; the 'forest__' prefix routes each entry to the
# pipeline step named 'forest'.
param_grid = {'forest__n_estimators': [10,100,500],'forest__max_features': [4,5,6]}

# random_state is fixed because a random forest draws bootstrap samples and
# candidate features at random: without a seed, the cross-validation scores
# and the selected model change from one run of the notebook to the next.
modelo_forest = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('forest',RandomForestClassifier(random_state=1))
])

In [7]:
%%time
# Setup 10-fold stratified cross-validation
random_seed = 1
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=random_seed)

# Define score metric. It can be either 'accuracy' or 'roc_auc'

scoring = 'accuracy'

# Define grid

model_forest_grid = GridSearchCV(modelo_forest,param_grid,cv=kf,scoring=scoring,n_jobs=-1)

#Estimate best model

model_forest_grid.fit(X,y)



CPU times: user 2.91 s, sys: 577 ms, total: 3.48 s
Wall time: 47.9 s


In [8]:
# Tabulate the cross-validation results, one row per hyperparameter combination
resultados = pd.DataFrame(model_forest_grid.cv_results_)

# Display only the tuned parameters with the mean and std of the test-fold scores
summary_columns = [
    'param_forest__n_estimators',
    'param_forest__max_features',
    'mean_test_score',
    'std_test_score',
]
resultados[summary_columns]

Unnamed: 0,param_forest__n_estimators,param_forest__max_features,mean_test_score,std_test_score
0,10,4,0.871793,0.001429
1,100,4,0.87171,0.002156
2,500,4,0.871848,0.002339
3,10,5,0.870383,0.003401
4,100,5,0.872899,0.003231
5,500,5,0.872788,0.002831
6,10,6,0.869691,0.00236
7,100,6,0.873673,0.002567
8,500,6,0.873618,0.002071


In [9]:
# Read the second data set, which the cells below use to evaluate the tuned model
eval_csv_path = "bank2.csv"
bank2 = pd.read_csv(eval_csv_path)

In [10]:
# Build the target and features for bank2.csv with the same rules applied to bank1
# Target is 1 where the 'y' column equals 'yes' and 0 for every other value
newy = np.where(bank2['y'].eq('yes'), 1, 0)

# Features are all columns except those at positions 0 and 9
# NOTE(review): positional drop; assumes bank2 has the same column layout as bank1 - confirm
newX = bank2.drop(columns=bank2.columns[[0, 9]])

In [12]:
# Class predictions for the bank2 features from the fitted grid search
y_pred = model_forest_grid.predict(newX)

# Per-class probabilities; column 1 is kept, which corresponds to label 1
# ('yes') given the 0/1 encoding of the training target
class_probs = model_forest_grid.predict_proba(newX)
probs_pred = class_probs[:, 1]


In [13]:
# Evaluate on the bank2 data: accuracy and the confusion matrix use the class
# predictions, while AUC uses the predicted probabilities.
test_accuracy = accuracy_score(newy, y_pred)
print("Accuracy:", test_accuracy)

test_confusion = confusion_matrix(newy, y_pred)
print("Confusion Matrix:\n", test_confusion)

test_auc = roc_auc_score(newy, probs_pred)
print("AUC:", test_auc)

Accuracy: 0.8747097202255889
Confusion Matrix:
 [[7644  341]
 [ 792  266]]
AUC: 0.6854104399435141
