In [None]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [None]:
# Read bank1.csv (relative path) into a DataFrame; later cells fit and cross-validate on it
bank1 = pd.read_csv(filepath_or_buffer="bank1.csv")

In [None]:
# Define roles for bank1.csv
# Target: 1 where column 'y' equals 'yes', 0 for every other value
y = np.where(bank1['y'].eq('yes'), 1, 0)
# Predictors: all columns except the ones at positions 0 and 9
# NOTE(review): presumably position 0 is a row-index column and position 9 is 'y' itself —
# confirm against the CSV, otherwise the target would leak into X
X = bank1.drop(columns=bank1.columns[[0, 9]])

In [None]:
# Preprocessing: split the predictors by dtype and one-hot encode the non-numeric ones
categorical_dtypes = ['object', 'category']
categorical_features = X.select_dtypes(include=categorical_dtypes).columns
numeric_features = X.select_dtypes(exclude=categorical_dtypes).columns

# Dense one-hot encoding; handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Only the categorical columns are transformed; remainder='passthrough' forwards
# every other column unchanged
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

In [None]:
# Combine the preprocessing step and the decision tree classifier into a single pipeline
modelo_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # random_state is fixed so repeated fits build the same tree: scikit-learn permutes
    # the candidate features at each split, so equally good splits could otherwise be
    # resolved differently from run to run (and change the cross-validation scores).
    # The value 1 matches the seed used for the cross-validation splitter.
    ('tree', DecisionTreeClassifier(max_depth=5, random_state=1))
])

In [None]:
# 10-fold stratified cross-validation with shuffling; the seed fixes the fold assignment
random_seed = 1
kf = StratifiedKFold(random_state=random_seed, shuffle=True, n_splits=10)
# Single-metric run: try both scoring = 'roc_auc' and scoring = 'accuracy'
scores = cross_val_score(estimator=modelo_tree, X=X, y=y, scoring='roc_auc', cv=kf)
# Average score across the folds (shown as the cell output)
scores.mean()

In [None]:
# 10-fold stratified cross-validation, rebuilt here with the same seed
random_seed = 1
kf = StratifiedKFold(random_state=random_seed, shuffle=True, n_splits=10)

# Evaluate accuracy and ROC AUC in one pass; per-fold scores are stored in `results`
# under 'test_accuracy' and 'test_roc_auc'
scoring = ['accuracy', 'roc_auc']
results = cross_validate(estimator=modelo_tree, X=X, y=y, scoring=scoring, cv=kf)

In [None]:
# Report each metric averaged over the cross-validation folds
for label, key in (("Mean Accuracy:", 'test_accuracy'), ("Mean AUC:", 'test_roc_auc')):
    print(label, np.mean(results[key]))

In [None]:
# Fit the whole pipeline (preprocessing + tree) on all of the bank1 predictors and target
modelo_tree.fit(X=X, y=y)

In [None]:
# Read bank2.csv (relative path); the fitted pipeline is scored on it in the cells below
bank2 = pd.read_csv(filepath_or_buffer="bank2.csv")

In [None]:
# Target for bank2: 1 where column 'y' equals 'yes', 0 for every other value
newy = np.where(bank2['y'].eq('yes'), 1, 0)
# Predictors for bank2: drop the columns at positions 0 and 9, as was done for bank1
# NOTE(review): assumes bank2.csv has the same column layout as bank1.csv — confirm
newX = bank2.drop(columns=bank2.columns[[0, 9]])

In [None]:
# Predicted class labels for the bank2 rows
y_pred = modelo_tree.predict(X=newX)
# Per-class probabilities; column index 1 is kept as the score for the class coded 1 ('yes'),
# which holds when both classes occur in the training target
class_probs = modelo_tree.predict_proba(X=newX)
probs_pred = class_probs[:, 1]

In [None]:
# Evaluation on bank2: accuracy and confusion matrix use the predicted labels,
# AUC uses the predicted probabilities of the class coded 1
print("Accuracy:", accuracy_score(y_true=newy, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=newy, y_pred=y_pred))
print("AUC:", roc_auc_score(y_true=newy, y_score=probs_pred))