In [17]:
# Import pandas 
import pandas as pd
import numpy as np
import warnings


# Load the dataset
df = pd.read_excel('C:\\Users\\a1787\\Documents\\00term4\\1(4880)IntroductiontoDataAnalysis\\assignment2\\Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')

# See the top-5 records in the data
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [25]:
# Separate features and target variable
X = df.copy()       # Create separate copy to prevent unwanted tampering of data.
del X['Personal Loan']     # Delete target variable.
del X['ID']
del X['ZIP Code']
del X['Age']
del X['Experience']

# Target variable
y = df['Personal Loan']
# print(X.describe())

# Replacing negative values with 0
X[X < 0] = 0

X['IncomeBin']   = pd.cut(x=df['Income'], bins=[0,116, 225])
X = pd.get_dummies(X, columns=['IncomeBin'])

print(X.describe())

            Income       Family        CCAvg    Education     Mortgage  \
count  5000.000000  5000.000000  5000.000000  5000.000000  5000.000000   
mean     73.774200     2.396400     1.937913     1.881000    56.498800   
std      46.033729     1.147663     1.747666     0.839869   101.713802   
min       8.000000     1.000000     0.000000     1.000000     0.000000   
25%      39.000000     1.000000     0.700000     1.000000     0.000000   
50%      64.000000     2.000000     1.500000     2.000000     0.000000   
75%      98.000000     3.000000     2.500000     3.000000   101.000000   
max     224.000000     4.000000    10.000000     3.000000   635.000000   

       Securities Account  CD Account       Online   CreditCard  
count         5000.000000  5000.00000  5000.000000  5000.000000  
mean             0.104400     0.06040     0.596800     0.294000  
std              0.305809     0.23825     0.490589     0.455637  
min              0.000000     0.00000     0.000000     0.000000  
25%

In [19]:
# Import train_test_split function from 
from sklearn.model_selection import train_test_split
 
# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [26]:
# Import KNN model
from sklearn.neighbors import KNeighborsClassifier
from   sklearn  import metrics
 
# Create a KNN classifier object
model = KNeighborsClassifier(n_neighbors=17)
 
# Train the model using the training dataset
model.fit(X_train,y_train)
 
# Predict the target variable for test dataset
predictions = model.predict(X_test)
 
# Import metrics module for performance evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

 
# 모델의 양성 클래스에 대한 예측된 확률 계산
predictions_prob = model.predict_proba(X_test)[:, 1]
RMSE = np.sqrt(metrics.mean_squared_error(y_test, predictions))

# roc_auc_score 함수에 실제 레이블과 예측된 확률 전달
auc = roc_auc_score(y_test, predictions_prob)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, predictions))
# Calculate model precision
print("Precision:",precision_score(y_test, predictions))
# Calculate model recall
print("Recall:",recall_score(y_test, predictions))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, predictions))
# Calculate model AUC
print('AUC:', auc)
# Calculate model RMSE
print(f'RMSE: {RMSE}')


Accuracy: 0.9168
Precision: 0.5416666666666666
Recall: 0.24074074074074073
F1-Score: 0.33333333333333337
AUC: 0.9094222287085685
RMSE: 0.2884441020371191


In [29]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time

# Measure time for Grid Search
start_time_grid = time.time()
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21], 'weights': ['uniform', 'distance']}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
end_time_grid = time.time()
time_grid_search = end_time_grid - start_time_grid

# Get the best hyperparameters
best_params_grid = grid_search.best_params_

# Train a new model with the best hyperparameters
model_grid = KNeighborsClassifier(n_neighbors=best_params_grid['n_neighbors'], weights=best_params_grid['weights'])
model_grid.fit(X_train, y_train)
predictions_grid = model_grid.predict(X_test)

# Model evaluation with Grid Search hyperparameter tuning
print("\nWith Grid Search Hyperparameter Tuning:")
print("Best Hyperparameters:", best_params_grid)
print("Accuracy:", accuracy_score(y_test, predictions_grid))
print("Precision:", precision_score(y_test, predictions_grid))
print("Recall:", recall_score(y_test, predictions_grid))
print("F1-Score:", f1_score(y_test, predictions_grid))
print("Area Under Curve:", auc)
print("Grid Search Time:", time_grid_search, "seconds")

# Measure time for Randomized Search
start_time_random = time.time()
param_dist = {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)
end_time_random = time.time()
time_random_search = end_time_random - start_time_random

# Get the best hyperparameters
best_params_random = random_search.best_params_

# Train a new model with the best hyperparameters
model_random = KNeighborsClassifier(n_neighbors=best_params_random['n_neighbors'], weights=best_params_random['weights'])
model_random.fit(X_train, y_train)
predictions_random = model_random.predict(X_test)



# Model evaluation with Randomized Search hyperparameter tuning
print("\nWith Randomized Search Hyperparameter Tuning:")
print("Best Hyperparameters:", best_params_random)
print("Accuracy:", accuracy_score(y_test, predictions_random))
print("Precision:", precision_score(y_test, predictions_random))
print("Recall:", recall_score(y_test, predictions_random))
print("F1-Score:", f1_score(y_test, predictions_random))
print("Area Under Curve:", auc)
print("Randomized Search Time:", time_random_search, "seconds")



With Grid Search Hyperparameter Tuning:
Best Hyperparameters: {'n_neighbors': 13, 'weights': 'distance'}
Accuracy: 0.9176
Precision: 0.5490196078431373
Recall: 0.25925925925925924
F1-Score: 0.3522012578616352
Area Under Curve: 0.9094222287085685
Grid Search Time: 2.1452815532684326 seconds

With Randomized Search Hyperparameter Tuning:
Best Hyperparameters: {'weights': 'distance', 'n_neighbors': 29}
Accuracy: 0.9232
Precision: 0.6428571428571429
Recall: 0.25
F1-Score: 0.36
Area Under Curve: 0.9094222287085685
Randomized Search Time: 1.087071418762207 seconds
