In [1]:
!pip install imblearn



In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load dataset
data = pd.read_csv("Bank-term-deposit.csv")

In [4]:
data

Unnamed: 0,Age,Job,Marital,Education,Default,housing,Loan,Contact,Month,day_of_week,...,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [5]:
### Data Exploration
print(data.info())
print(data['y'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           41188 non-null  int64  
 1   Job           41188 non-null  object 
 2   Marital       41188 non-null  object 
 3   Education     41188 non-null  object 
 4   Default       41188 non-null  object 
 5   housing       41188 non-null  object 
 6   Loan          41188 non-null  object 
 7   Contact       41188 non-null  object 
 8   Month         41188 non-null  object 
 9   day_of_week   41188 non-null  object 
 10  duration      41188 non-null  int64  
 11  campaign      41188 non-null  int64  
 12  pdays         41188 non-null  int64  
 13  previous      41188 non-null  int64  
 14  poutcome      41188 non-null  object 
 15  empvarrate    41188 non-null  float64
 16  conspriceidx  41188 non-null  float64
 17  consconfidx   41188 non-null  float64
 18  euribor3m     41188 non-nu

In [6]:
# Convert categorical variables manually
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

In [7]:
data

Unnamed: 0,Age,Job,Marital,Education,Default,housing,Loan,Contact,Month,day_of_week,...,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y
0,56,3,1,0,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,1,1,5,0,0,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,5,1,6,0,2,0,0,7,0,...,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,9,1,5,0,0,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1


In [8]:
# Define features (X) and target (y)
X = data.drop('y', axis=1)
y = data['y']

In [9]:
### Handle Imbalance in Dataset
# Check class distribution
print("Class distribution before balancing:")
print(y.value_counts())

Class distribution before balancing:
y
0    36548
1     4640
Name: count, dtype: int64


In [30]:
(4640/41188)*100

11.265417111780131

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [32]:
y_train.value_counts()

y
0    29238
1     3712
Name: count, dtype: int64

In [39]:
# random forest, decision tree, k-means, hyperparameter tuning 

In [13]:
# # Undersampling
# class_0 = X_train[y_train == 0]
# class_1 = X_train[y_train == 1]

# class_0_downsampled = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)
# X_train_undersampled = pd.concat([class_0_downsampled, class_1])
# y_train_undersampled = [0] * len(class_0_downsampled) + [1] * len(class_1)

# Undersampling using sklearn
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

In [33]:
y_train_undersampled.value_counts()

y
0    3712
1    3712
Name: count, dtype: int64

In [14]:
# # Oversampling
# class_1_upsampled = resample(class_1, replace=True, n_samples=len(class_0), random_state=42)
# X_train_oversampled = pd.concat([class_0, class_1_upsampled])
# y_train_oversampled = [0] * len(class_0) + [1] * len(class_1_upsampled)

oversampler = RandomOverSampler(random_state=42)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

In [34]:
y_train_oversampled.value_counts()

y
0    29238
1    29238
Name: count, dtype: int64

In [15]:
# SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [35]:
y_train_smote.value_counts()

y
0    29238
1    29238
Name: count, dtype: int64

In [18]:
### K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5) #value of k

In [20]:
# Perform Stratified K-Fold Cross-Validation for KNN
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []

In [21]:
for train_index, val_index in skf.split(X_train_smote, y_train_smote): #3 folds, loop 3 times
    X_train_fold, X_val_fold = X_train_smote.iloc[train_index], X_train_smote.iloc[val_index]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_index], y_train_smote.iloc[val_index]

    knn.fit(X_train_fold, y_train_fold) #model train
    y_pred = knn.predict(X_val_fold)

    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred))
    recalls.append(recall_score(y_val_fold, y_pred))
    f1s.append(f1_score(y_val_fold, y_pred))

In [22]:
print("\nKNN Results:")
print(f"Accuracy: {np.mean(accuracies):.4f} +/- {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} +/- {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} +/- {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f} +/- {np.std(f1s):.4f}")


KNN Results:
Accuracy: 0.9162 +/- 0.0018
Precision: 0.8650 +/- 0.0032
Recall: 0.9863 +/- 0.0010
F1 Score: 0.9217 +/- 0.0015


In [23]:
### Support Vector Machine (SVM)
svm = SVC(kernel='rbf', probability=True)

# Perform Stratified K-Fold Cross-Validation for SVM
accuracies, precisions, recalls, f1s = [], [], [], []

In [24]:
for train_index, val_index in skf.split(X_train_smote, y_train_smote):
    X_train_fold, X_val_fold = X_train_smote.iloc[train_index], X_train_smote.iloc[val_index]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_index], y_train_smote.iloc[val_index]

    svm.fit(X_train_fold, y_train_fold)
    y_pred = svm.predict(X_val_fold)

    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred))
    recalls.append(recall_score(y_val_fold, y_pred))
    f1s.append(f1_score(y_val_fold, y_pred))

In [25]:
print("\nSVM Results:")
print(f"Accuracy: {np.mean(accuracies):.4f} +/- {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} +/- {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} +/- {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f} +/- {np.std(f1s):.4f}")


SVM Results:
Accuracy: 0.8439 +/- 0.0016
Precision: 0.8412 +/- 0.0013
Recall: 0.8479 +/- 0.0024
F1 Score: 0.8445 +/- 0.0017


In [26]:
### Naive Bayes
nb = GaussianNB()

# Perform Stratified K-Fold Cross-Validation for Naive Bayes
accuracies, precisions, recalls, f1s = [], [], [], []

In [27]:
for train_index, val_index in skf.split(X_train_smote, y_train_smote):
    X_train_fold, X_val_fold = X_train_smote.iloc[train_index], X_train_smote.iloc[val_index]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_index], y_train_smote.iloc[val_index]

    nb.fit(X_train_fold, y_train_fold)
    y_pred = nb.predict(X_val_fold)

    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred))
    recalls.append(recall_score(y_val_fold, y_pred))
    f1s.append(f1_score(y_val_fold, y_pred))

In [28]:
print("\nNaive Bayes Results:")
print(f"Accuracy: {np.mean(accuracies):.4f} +/- {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} +/- {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} +/- {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f} +/- {np.std(f1s):.4f}")


Naive Bayes Results:
Accuracy: 0.7890 +/- 0.0046
Precision: 0.7868 +/- 0.0030
Recall: 0.7929 +/- 0.0073
F1 Score: 0.7898 +/- 0.0051


In [29]:
### Final Prediction and Evaluation on Test Set using SMOTE-balanced data
final_model = SVC(kernel='rbf', probability=True)  # Example: SVM as the final model
final_model.fit(X_train_smote, y_train_smote)
y_pred = final_model.predict(X_test)

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.98      0.84      0.90      7310
           1       0.40      0.88      0.55       928

    accuracy                           0.84      8238
   macro avg       0.69      0.86      0.73      8238
weighted avg       0.92      0.84      0.86      8238



In [36]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[6114 1196]
 [ 115  813]]


In [37]:
# 29000 -- 0 
# 3000 -- 1

# SMOTE

# 29000 -- 0
# 29000 -- 1 <not reliable> 