In [4]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [2]:
# Read the data set that the model is later fitted on.
# The path is relative to the notebook's working directory.
bank1 = pd.read_csv(filepath_or_buffer="bank1.csv")

In [3]:
# Define roles for bank1.csv.
# Target: 1 where the 'y' column equals 'yes', 0 otherwise.
y = np.where(bank1['y'] == 'yes', 1, 0)
# Features: every column except those at positions 0 and 9.
# NOTE(review): presumably position 0 is a row-index column and position 9
# is the target 'y' -- confirm against the CSV layout.
X = bank1.drop(columns=bank1.columns[[0, 9]])

In [5]:
# Preprocessing: split the feature columns by dtype and treat each group separately.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns

# Categorical columns are one-hot encoded into a dense array
# (handle_unknown='ignore' avoids an error on categories not seen during fit);
# all remaining columns are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_features,
        ),
        (
            'num',
            StandardScaler(),
            numeric_features,
        ),
    ]
)

In [7]:
# Candidate values for the number of neighbours. The 'knn__' prefix routes
# the parameter to the pipeline step named 'knn'.
param_grid = dict(knn__n_neighbors=[1, 5, 10, 30])

# Chain preprocessing and the k-NN classifier into a single estimator, so the
# preprocessing is fitted together with the model on whatever data the
# pipeline is fitted on.
modelo_knn = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier()),
    ]
)

In [9]:
# 10-fold stratified cross-validation; rows are shuffled with a fixed seed
# so the fold assignment is reproducible.
random_seed = 1
kf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=random_seed,
)

# Metric used to compare the candidates: either 'accuracy' or 'roc_auc'.
scoring = 'accuracy'

# Grid search over param_grid, scoring every candidate on the folds above.
model_knn_grid = GridSearchCV(
    estimator=modelo_knn,
    param_grid=param_grid,
    cv=kf,
    scoring=scoring,
)

# Run the search on the full (X, y) data; the fitted search object is the
# cell's displayed value.
model_knn_grid.fit(X, y)

In [11]:
# Tabulate the cross-validation results and show, for each value of
# n_neighbors, the mean and standard deviation of the test score across folds.
resultados = pd.DataFrame(model_knn_grid.cv_results_)
resultados[['param_knn__n_neighbors', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_knn__n_neighbors,mean_test_score,std_test_score
0,1,0.833444,0.003541
1,5,0.873092,0.003565
2,10,0.881055,0.001452
3,30,0.882382,0.001007


In [12]:
# Read the second data set, used below to evaluate the fitted model.
bank2 = pd.read_csv(filepath_or_buffer="bank2.csv")

In [13]:
# Build the target and feature matrix for bank2 the same way as for bank1.
# Target: 1 where 'y' equals 'yes', 0 otherwise.
newy = np.where(bank2['y'] == 'yes', 1, 0)
# Features: every column except those at positions 0 and 9.
# NOTE(review): assumes bank2.csv has the same column layout as bank1.csv -- confirm.
newX = bank2.drop(columns=bank2.columns[[0, 9]])

In [15]:
# Score the bank2 rows with the fitted grid search.
# NOTE(review): relies on GridSearchCV's default refit behaviour, so that
# predictions come from the best configuration found -- see scikit-learn docs.
y_pred = model_knn_grid.predict(newX)              # hard class labels
probs_pred = model_knn_grid.predict_proba(newX)[:, 1]  # second probability column (class 1)


In [16]:
# Report performance on bank2: accuracy and confusion matrix from the hard
# labels, AUC from the predicted class-1 probabilities.
print("Accuracy:", accuracy_score(y_true=newy, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=newy, y_pred=y_pred))
print("AUC:", roc_auc_score(y_true=newy, y_score=probs_pred))

Accuracy: 0.8804600243282097
Confusion Matrix:
 [[7938   47]
 [1034   24]]
AUC: 0.6730451591062164
