In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
df1=pd.read_csv("/content/bank_data.csv")
df1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [7]:
df1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

Let us remove the columns that feels irrelevant in classifying

In [9]:
df2=df1.drop(["default","contact","poutcome"],axis=1)
df2.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,58,management,married,tertiary,2143,yes,no,5,may,261,1,-1,0,no
1,44,technician,single,secondary,29,yes,no,5,may,151,1,-1,0,no
2,33,entrepreneur,married,secondary,2,yes,yes,5,may,76,1,-1,0,no
3,47,blue-collar,married,unknown,1506,yes,no,5,may,92,1,-1,0,no
4,33,unknown,single,unknown,1,no,no,5,may,198,1,-1,0,no


In [10]:
df2.shape

(45211, 14)

In [15]:
df2.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
loan         0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
y            0
dtype: int64

No empty values , nice sign

Lets check unique values in our dataframe to apply encoding

In [17]:
df2.nunique()

age            77
job            12
marital         3
education       4
balance      7168
housing         2
loan            2
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
y               2
dtype: int64

It can be seen that features like job , marital , education , housing ,loan need encoding

In [19]:
# One-hot encoding
df3 = pd.get_dummies(df2, columns=["job","marital","education","housing","loan"], drop_first=False)

df3.head()

Unnamed: 0,age,balance,day,month,duration,campaign,pdays,previous,y,job_admin.,...,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,housing_no,housing_yes,loan_no,loan_yes
0,58,2143,5,may,261,1,-1,0,no,False,...,True,False,False,False,True,False,False,True,True,False
1,44,29,5,may,151,1,-1,0,no,False,...,False,True,False,True,False,False,False,True,True,False
2,33,2,5,may,76,1,-1,0,no,False,...,True,False,False,True,False,False,False,True,False,True
3,47,1506,5,may,92,1,-1,0,no,False,...,True,False,False,False,False,True,False,True,True,False
4,33,1,5,may,198,1,-1,0,no,False,...,False,True,False,False,False,True,True,False,True,False


In [20]:
df4=df3.drop(["day","month"],axis=1)
df4.head()

Unnamed: 0,age,balance,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,housing_no,housing_yes,loan_no,loan_yes
0,58,2143,261,1,-1,0,no,False,False,False,...,True,False,False,False,True,False,False,True,True,False
1,44,29,151,1,-1,0,no,False,False,False,...,False,True,False,True,False,False,False,True,True,False
2,33,2,76,1,-1,0,no,False,False,True,...,True,False,False,True,False,False,False,True,False,True
3,47,1506,92,1,-1,0,no,False,True,False,...,True,False,False,False,False,True,False,True,True,False
4,33,1,198,1,-1,0,no,False,False,False,...,False,True,False,False,False,True,True,False,True,False


In [37]:
df4['y'] = df4['y'].map({'yes': 1, 'no': 0})

In [38]:
X=df4.drop("y",axis=1)
y=df4["y"]

X = X.astype(int)
X.head()

Unnamed: 0,age,balance,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,housing_no,housing_yes,loan_no,loan_yes
0,58,2143,261,1,-1,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
1,44,29,151,1,-1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,1,0
2,33,2,76,1,-1,0,0,0,1,0,...,1,0,0,1,0,0,0,1,0,1
3,47,1506,92,1,-1,0,0,1,0,0,...,1,0,0,0,0,1,0,1,1,0
4,33,1,198,1,-1,0,0,0,0,0,...,0,1,0,0,0,1,1,0,1,0


In [39]:
print(X.shape)
print(y.shape)

(45211, 29)
(45211,)


In [40]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

**MODEL** **TRAINING**

In [41]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2 , random_state=30)

In [42]:
#import logistic regression
from sklearn.linear_model import LogisticRegression

#Fit Logistic Regression
lr=LogisticRegression()lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Lets check the accuracy of our model.

In [45]:
from os import access
predict=lr.predict(X_test)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
print("The mean squared error of model is",mean_squared_error(y_test,predict))
print("The Accuracy of model is",accuracy_score(y_test,predict))

The mean squared error of model is 0.10980869180581665
The Accuracy of model is 0.8901913081941834


In [44]:

from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_test,predict),columns=['Predicted NO','Predicted YES'], index=['Actual NO','Actual YES'])

Unnamed: 0,Predicted NO,Predicted YES
Actual NO,7831,185
Actual YES,808,219


Lets improve the performance of the model by scaling our input data.

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train , y_test = train_test_split(X,y,test_size=0.2 , random_state=30)

In [66]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)

In [67]:
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [68]:
X_train_scaled

array([[ 1.417799  , -0.08375482,  2.66871589, ..., -1.11945396,
         0.43823551, -0.43823551],
       [-1.2161168 , -0.55531186,  0.23265817, ...,  0.89329265,
         0.43823551, -0.43823551],
       [ 0.66525163, -0.23525646,  0.1238709 , ...,  0.89329265,
         0.43823551, -0.43823551],
       ...,
       [-0.08729574,  1.65195549, -0.19472038, ..., -1.11945396,
         0.43823551, -0.43823551],
       [-0.74577469, -0.39430038, -0.47834433, ...,  0.89329265,
         0.43823551, -0.43823551],
       [-0.18136417,  3.43947805, -0.94457547, ...,  0.89329265,
         0.43823551, -0.43823551]])

In [69]:
#import logistic regression
from sklearn.linear_model import LogisticRegression

#Fit Logistic Regression
lr_scaled=LogisticRegression()
lr_scaled.fit(X_train_scaled,y_train)

In [70]:
predict=lr_scaled.predict(X_test_scaled)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
print("The mean squared error of model is",mean_squared_error(y_test,predict))
print("The Accuracy of model is",accuracy_score(y_test,predict))

The mean squared error of model is 0.10726528806811898
The Accuracy of model is 0.892734711931881


Improved the performance by 0.2% using scaling.

Let us try using Neural Networks for more accuracy

In [71]:
#Lets save our model as tflite file
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [72]:
# Build the logistic regression model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a3029ea51b0>

In [74]:
# Predict on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)



In [76]:
from sklearn.metrics import f1_score
# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred_prob)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"F1 Score: {f1}")
print(f"Mean Squared Error: {mse}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.2420091324200913
Mean Squared Error: 0.08770127881212014
Accuracy: 0.8898595598805706


In [77]:
# Convert the model to a TensorFlow Lite model
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TensorFlow Lite model to a file
with open('logistic_regression_model.tflite', 'wb') as f:
    f.write(tflite_model)