In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [187]:
df = pd.read_csv("dataset/bank-full.csv", delimiter = ";")

In [188]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [190]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [191]:
df['y'].unique()

array(['no', 'yes'], dtype=object)

In [192]:
df[df.duplicated()==True]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y


In [193]:
# # Label encoding
# unique = df['Species'].unique()
# to_int = {value: i for i , value in enumerate(unique)}
# df['Species'] = df['Species'].map(to_int)

In [194]:
for col in df.columns:
    if df[col].dtype == 'object' or col == 'y':
        unique = df[col].unique()
        to_int = {value: i for i , value in enumerate(unique)}
        df[col] = df[col].map(to_int)

In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  int64
 2   marital    45211 non-null  int64
 3   education  45211 non-null  int64
 4   default    45211 non-null  int64
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  int64
 7   loan       45211 non-null  int64
 8   contact    45211 non-null  int64
 9   day        45211 non-null  int64
 10  month      45211 non-null  int64
 11  duration   45211 non-null  int64
 12  campaign   45211 non-null  int64
 13  pdays      45211 non-null  int64
 14  previous   45211 non-null  int64
 15  poutcome   45211 non-null  int64
 16  y          45211 non-null  int64
dtypes: int64(17)
memory usage: 5.9 MB


In [196]:
# Features and target
X = df.iloc[:, :16]
y = df.iloc[:, 16:]

In [197]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,0,0,0,0,2143,0,0,0,5,0,261,1,-1,0,0
1,44,1,1,1,0,29,0,0,0,5,0,151,1,-1,0,0
2,33,2,0,1,0,2,0,1,0,5,0,76,1,-1,0,0
3,47,3,0,2,0,1506,0,0,0,5,0,92,1,-1,0,0
4,33,4,1,2,0,1,1,0,0,5,0,198,1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,1,0,0,0,825,1,0,1,17,5,977,3,-1,0,0
45207,71,5,2,3,0,1729,1,0,1,17,5,456,2,-1,0,0
45208,72,5,0,1,0,5715,1,0,1,17,5,1127,5,184,3,3
45209,57,3,0,1,0,668,1,0,2,17,5,508,4,-1,0,0


In [198]:
y

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
45206,1
45207,1
45208,1
45209,0


In [199]:
col_list = list(df.columns)
print(col_list)

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [200]:
# Normalize features
X = (X - X.mean(axis=0)) / X.std(axis=0)

In [201]:
def calculate_vif(df):
    vif_dict = {}
    features = df.columns
    
    for feature in features:
        # y = current feature to predict
        y = df[feature].values
        
        # X = all other features
        X = df.loc[:, df.columns != feature].values
        
        # Add intercept term (column of ones) for regression
        X = np.column_stack((np.ones(X.shape[0]), X))
        
        # Perform linear regression using Normal Equation: beta = (X'X)^-1 X'y
        beta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
        
        # Predict y_hat
        y_hat = X.dot(beta)
        
        # Compute R^2: 1 - (SS_res / SS_tot)
        ss_res = np.sum((y - y_hat) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        r_squared = 1 - ss_res / ss_tot
        
        # Calculate VIF
        vif = 1 / (1 - r_squared)
        vif_dict[feature] = vif
        
    return vif_dict

In [202]:
vif_values = calculate_vif(df)
vif_values

{'age': 1.1165785637347103,
 'job': 1.0862803650004025,
 'marital': 1.0314160113463304,
 'education': 1.1586976973572416,
 'default': 1.0123955767225707,
 'balance': 1.031893604155388,
 'housing': 1.1631903948463436,
 'loan': 1.019956234248778,
 'contact': 1.329983182750005,
 'day': 1.0388164986344597,
 'month': 1.3079771778503297,
 'duration': 1.2067969026436467,
 'campaign': 1.0551964809941567,
 'pdays': 2.2056666922731742,
 'previous': 1.354381132567045,
 'poutcome': 2.3735529227525953,
 'y': 1.345878662231827}

In [203]:
X = X.values
y = y.values

In [204]:
for x in col_list:
  if df[x].dtypes=='object' or x =='y':
    continue

  Q1 = df[x].quantile(0.25)
  Q3 = df[x].quantile(0.75)

  IQR = Q3-Q1   # Inter Quartile Range.
  lower_bound = Q1 - 1.5*IQR
  upper_bound = Q3 + 1.5*IQR
  df = df[(df[x]>=lower_bound) & (df[x]<=upper_bound)]

In [205]:
def train_test_split(X, y, test_size=0.3):
    split_index = int((1 - test_size) * len(X))
    return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [206]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


In [207]:
def compute_loss(y_true, y_pred):
    # Avoid log(0) by clipping
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

In [208]:
def train_logistic_regression(X, y, lr=0.01, epochs=1000):
    m, n = X.shape
    weights = np.zeros((n, 1))
    bias = 0
    losses = []

    for _ in range(epochs):
        z = np.dot(X, weights) + bias
        y_pred = sigmoid(z)

        loss = -np.mean(y * np.log(y_pred + 1e-8) + (1 - y) * np.log(1 - y_pred + 1e-8))
        losses.append(loss)

        dw = np.dot(X.T, (y_pred - y)) / m
        db = np.sum(y_pred - y) / m

        weights -= lr * dw
        bias -= lr * db

    return weights, bias, losses


In [209]:
# Train
weights, bias, losses = train_logistic_regression(X_train, y_train)

In [210]:
def predict(X, weights, bias):
    z = np.dot(X, weights) + bias
    return (sigmoid(z) >= 0.5).astype(int)


In [211]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def precision(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def recall(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p + r) if (p + r) > 0 else 0

In [212]:
y_pred = predict(X_test, weights, bias)

In [213]:
print("Accuracy:", accuracy(y_test, y_pred))
print("Precision:", precision(y_test, y_pred))
print("Recall:", recall(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.764376290179888
Precision: 0.5973826020015397
Recall: 0.22499275152218035
F1 Score: 0.3268744734625106
