In [25]:
# Import pandas 
import pandas as pd
import warnings

# Load the dataset
df = pd.read_excel('C:\\Users\\a1787\\Documents\\00term4\\1(4880)IntroductiontoDataAnalysis\\assignment2\\Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')

# See the top-5 records in the data
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [32]:
# Separate features and target variable
X = df.copy()       # Create separate copy to prevent unwanted tampering of data.
del X['Personal Loan']     # Delete target variable.
del X['ID']
del X['ZIP Code']
del X['Age']
del X['Experience']

# Target variable
y = df['Personal Loan']
# print(X.describe())

# Replacing negative values with 0
X[X < 0] = 0

X['IncomeBin']   = pd.cut(x=df['Income'], bins=[0,116, 225])
X = pd.get_dummies(X, columns=['IncomeBin'])

print(X.describe())


            Income       Family        CCAvg    Education     Mortgage  Securities Account  CD Account       Online   CreditCard
count  5000.000000  5000.000000  5000.000000  5000.000000  5000.000000         5000.000000  5000.00000  5000.000000  5000.000000
mean     73.774200     2.396400     1.937913     1.881000    56.498800            0.104400     0.06040     0.596800     0.294000
std      46.033729     1.147663     1.747666     0.839869   101.713802            0.305809     0.23825     0.490589     0.455637
min       8.000000     1.000000     0.000000     1.000000     0.000000            0.000000     0.00000     0.000000     0.000000
25%      39.000000     1.000000     0.700000     1.000000     0.000000            0.000000     0.00000     0.000000     0.000000
50%      64.000000     2.000000     1.500000     2.000000     0.000000            0.000000     0.00000     1.000000     0.000000
75%      98.000000     3.000000     2.500000     3.000000   101.000000            0.000000     0.

In [33]:
# Import train_test_split function from 
from sklearn.model_selection import train_test_split
 
# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [34]:
# import logistic regression scikit-learn model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from   sklearn  import metrics
import numpy as np
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)
 
# instantiate the model
model = LogisticRegression(solver='lbfgs')
 
# fit the model with data
model.fit(X_train,y_train)
 
# Forecast the target variable for given test dataset
y_pred = model.predict(X_test)
#print(abs(predictions - target_test))

predictions_prob = model.predict_proba(X_test)[::,1]
auc = roc_auc_score(y_test, predictions_prob)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# Import metrics module for performance evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))
# Print auc value
print("Area Under Curve:",auc)
# Calculate model RMSE
print(f'RMSE: {RMSE}')

Accuracy: 0.9704
Precision: 0.9382716049382716
Recall: 0.7037037037037037
F1-Score: 0.8042328042328042
Area Under Curve: 0.9263394304987999
RMSE: 0.17204650534085253


In [35]:
import time
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Measure time for Grid Search
start_time_grid = time.time()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
end_time_grid = time.time()
time_grid_search = end_time_grid - start_time_grid

# Get the best hyperparameters
best_params_grid = grid_search.best_params_

# Train a new model with the best hyperparameters
model_grid = LogisticRegression(C=best_params_grid['C'], penalty=best_params_grid['penalty'], solver='liblinear')
model_grid.fit(X_train, y_train)
predictions_grid = model_grid.predict(X_test)

# Model evaluation with Grid Search hyperparameter tuning
print("\nWith Grid Search Hyperparameter Tuning:")
print("Best Hyperparameters:", best_params_grid)
print("Accuracy:", accuracy_score(y_test, predictions_grid))
print("Precision:", precision_score(y_test, predictions_grid))
print("Recall:", recall_score(y_test, predictions_grid))
print("F1-Score:", f1_score(y_test, predictions_grid))
print("Area Under Curve:", roc_auc_score(y_test, model_grid.predict_proba(X_test)[:, 1]))
print("Grid Search Time:", time_grid_search, "seconds")

# Measure time for Randomized Search
start_time_random = time.time()
param_dist = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
random_search = RandomizedSearchCV(LogisticRegression(solver='liblinear'), param_distributions=param_dist, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search.fit(X_train, y_train)
end_time_random = time.time()
time_random_search = end_time_random - start_time_random

# Get the best hyperparameters
best_params_random = random_search.best_params_

# Train a new model with the best hyperparameters
model_random = LogisticRegression(C=best_params_random['C'], penalty=best_params_random['penalty'], solver='liblinear')
model_random.fit(X_train, y_train)
predictions_random = model_random.predict(X_test)

# Model evaluation with Randomized Search hyperparameter tuning
print("\nWith Randomized Search Hyperparameter Tuning:")
print("Best Hyperparameters:", best_params_random)
print("Accuracy:", accuracy_score(y_test, predictions_random))
print("Precision:", precision_score(y_test, predictions_random))
print("Recall:", recall_score(y_test, predictions_random))
print("F1-Score:", f1_score(y_test, predictions_random))

# Get the predicted probabilities for the positive class
y_score_random = model_random.predict_proba(X_test)[:, 1]

# Calculate and print the Area Under Curve (AUC)
print("Area Under Curve:", roc_auc_score(y_test, y_score_random))
print("Randomized Search Time:", time_random_search, "seconds")



With Grid Search Hyperparameter Tuning:
Best Hyperparameters: {'C': 1000, 'penalty': 'l1'}
Accuracy: 0.9664
Precision: 0.9125
Recall: 0.6759259259259259
F1-Score: 0.7765957446808511
Area Under Curve: 0.9433823052474541
Grid Search Time: 1.8486275672912598 seconds

With Randomized Search Hyperparameter Tuning:
Best Hyperparameters: {'penalty': 'l1', 'C': 1000}
Accuracy: 0.9664
Precision: 0.9125
Recall: 0.6759259259259259
F1-Score: 0.7765957446808511
Area Under Curve: 0.943398521113057
Randomized Search Time: 1.4073362350463867 seconds
