In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# Load the dataset
url = "/content/drive/MyDrive/Datasets/bank-data/bank-full.csv"
bank_data = pd.read_csv(url, delimiter=';')

In [52]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [53]:
# Dropping unnecessary features or those which has too many unknown value, that can affect out model
columns_to_drop = ['contact', 'month', 'default','poutcome']

bank_data = bank_data.drop(columns=columns_to_drop, axis=1)

In [54]:
bank_data['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [55]:
# Converting datatype into object to int for job feather
le = LabelEncoder()
bank_data['job'] = le.fit_transform(bank_data['job'])

bank_data['job'].unique()


array([ 4,  9,  2,  1, 11,  5,  0,  7,  6, 10,  3,  8])

In [56]:
print(bank_data.dtypes)

age           int64
job           int64
marital      object
education    object
balance       int64
housing      object
loan         object
day           int64
duration      int64
campaign      int64
pdays         int64
previous      int64
y            object
dtype: object


In [57]:
bank_data.shape

(45211, 13)

In [58]:
# bank_data = bank_data[bank_data['poutcome'] != 'unknown']
bank_data = bank_data[bank_data['education'] != 'unknown']

In [59]:
bank_data.shape

(43354, 13)

In [60]:
# Converting datatype into object to int for all categorical feathers.
for column in bank_data.columns:
    if bank_data[column].dtype == 'O':
        bank_data[column] = le.fit_transform(bank_data[column])

In [61]:
print(bank_data.dtypes)

age          int64
job          int64
marital      int64
education    int64
balance      int64
housing      int64
loan         int64
day          int64
duration     int64
campaign     int64
pdays        int64
previous     int64
y            int64
dtype: object


In [62]:
# Separate the tranning feature from targeted one
X = bank_data.drop("y", axis=1)
y = bank_data["y"]

In [63]:
# Normalize the dataset
for column in bank_data.columns[:-1]:
    bank_data[column] = (bank_data[column] - bank_data[column].mean()) / bank_data[column].std()


In [64]:
bank_data.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,1.636744,-0.099344,-0.273461,1.279349,0.259143,0.885078,-0.442943,-1.300937,0.010854,-0.574192,-0.412042,-0.250792,0
1,0.305817,1.435348,1.367684,-0.223602,-0.436271,0.885078,-0.442943,-1.300937,-0.415456,-0.574192,-0.412042,-0.250792,0
2,-0.739911,-0.713221,-0.273461,-0.223602,-0.445153,0.885078,2.257573,-1.300937,-0.706122,-0.574192,-0.412042,-0.250792,0
5,-0.549778,-0.099344,-0.273461,1.279349,-0.369822,0.885078,-0.442943,-1.300937,-0.461963,-0.574192,-0.412042,-0.250792,0
6,-1.215242,-0.099344,1.367684,1.279349,-0.298767,0.885078,2.257573,-1.300937,-0.15967,-0.574192,-0.412042,-0.250792,0


In [65]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [66]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34683 entries, 15635 to 16571
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        34683 non-null  int64
 1   job        34683 non-null  int64
 2   marital    34683 non-null  int64
 3   education  34683 non-null  int64
 4   balance    34683 non-null  int64
 5   housing    34683 non-null  int64
 6   loan       34683 non-null  int64
 7   day        34683 non-null  int64
 8   duration   34683 non-null  int64
 9   campaign   34683 non-null  int64
 10  pdays      34683 non-null  int64
 11  previous   34683 non-null  int64
dtypes: int64(12)
memory usage: 3.4 MB


In [67]:
# Define sigmoid and predict function..and i use matrix multiplication here
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def predict(X, weights):
    return sigmoid(np.dot(X, weights))

In [68]:
def logistic_regression(X, y, learning_rate, epochs):
    X = np.c_[np.ones(X.shape[0]), X]
    weights = np.zeros(X.shape[1])

    for epoch in range(epochs):
        predictions = predict(X, weights)
        errors = y - predictions

        gradient = np.dot(X.T, errors)
        weights += learning_rate * gradient

    return weights



In [69]:
# Tranning.....
learning_rate = 0.001
epochs = 10000
weights = logistic_regression(X_train, y_train, learning_rate, epochs)





  return 1 / (1 + np.exp(-z))


In [70]:
# Making prediction on the test data
X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]
y_pred_prob = predict(X_test_bias, weights)
y_pred = (y_pred_prob >= 0.5).astype(int)

  return 1 / (1 + np.exp(-z))


In [71]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.855380002306539
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92      7638
           1       0.31      0.18      0.22      1033

    accuracy                           0.86      8671
   macro avg       0.60      0.56      0.57      8671
weighted avg       0.83      0.86      0.84      8671

Confusion Matrix:
 [[7235  403]
 [ 851  182]]
