In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier  # Importing XGBClassifier
import lightgbm as lgb  # Importing LightGBM

import warnings

warnings.filterwarnings('ignore')

In [48]:
df = pd.read_csv("../dataset/updated_bank.csv");df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,y,contact_cellular,contact_telephone,contact_unknown,duration_contact_cellular,duration_contact_telephone,duration_contact_unknown,above_median_duration
0,30,unemployed,married,primary,no,1787.0,no,no,19,oct,79,1,no,True,False,False,79,0,0,0
1,30,management,married,tertiary,no,1476.0,yes,yes,3,jun,199,4,no,False,False,True,0,0,199,1
2,59,blue-collar,married,secondary,no,0.0,yes,no,5,may,226,1,no,False,False,True,0,0,226,1
3,39,technician,married,secondary,no,147.0,yes,no,6,may,151,2,no,True,False,False,151,0,0,0
4,41,entrepreneur,married,tertiary,no,221.0,yes,no,14,may,57,2,no,False,False,True,0,0,57,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,33,services,married,secondary,no,288.0,yes,no,17,apr,306,3,no,True,False,False,306,0,0,1
2399,42,admin.,married,unknown,no,642.0,yes,yes,16,may,509,2,no,False,False,True,0,0,509,1
2400,36,technician,divorced,secondary,no,566.0,yes,no,20,may,129,2,no,False,False,True,0,0,129,0
2401,49,blue-collar,married,secondary,no,322.0,no,no,14,aug,356,2,no,True,False,False,356,0,0,1


In [49]:
X = df.drop("y", axis=1);
y = df["y"]

label_enc = LabelEncoder()

for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])
    
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [50]:
# Model 1: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


conf_matrix_random_forest = confusion_matrix(y_test, y_pred_rf);conf_matrix_random_forest

array([[683,   4],
       [ 33,   1]], dtype=int64)

In [51]:
report_random_forest = classification_report(y_test, y_pred_rf)

print(report_random_forest)

              precision    recall  f1-score   support

          no       0.95      0.99      0.97       687
         yes       0.20      0.03      0.05        34

    accuracy                           0.95       721
   macro avg       0.58      0.51      0.51       721
weighted avg       0.92      0.95      0.93       721



In [52]:
# Model 2: Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

conf_matrix_gradient_boosting = confusion_matrix(y_test, y_pred_gb);conf_matrix_gradient_boosting

array([[680,   7],
       [ 31,   3]], dtype=int64)

In [53]:
report_gradient_boosting = classification_report(y_test, y_pred_gb)

print(report_gradient_boosting)

              precision    recall  f1-score   support

          no       0.96      0.99      0.97       687
         yes       0.30      0.09      0.14        34

    accuracy                           0.95       721
   macro avg       0.63      0.54      0.55       721
weighted avg       0.93      0.95      0.93       721



In [54]:
# Model 3: Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

conf_matrix_decision_tree = confusion_matrix(y_test, y_pred_dt)
report_decision_tree = classification_report(y_test, y_pred_dt)

print("Decision Tree Classifier Report:")
print(report_decision_tree)


Decision Tree Classifier Report:
              precision    recall  f1-score   support

          no       0.96      0.95      0.96       687
         yes       0.22      0.26      0.24        34

    accuracy                           0.92       721
   macro avg       0.59      0.61      0.60       721
weighted avg       0.93      0.92      0.92       721



 # Ensemble

In [55]:
# Create a Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model)
    ],
    voting='hard'  # Use 'soft' for probabilistic voting if desired
)

# Fit the Voting Classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the Voting Classifier
conf_matrix_voting = confusion_matrix(y_test, y_pred_voting)
report_voting = classification_report(y_test, y_pred_voting)

print("Voting Classifier Report:")
print(report_voting)


Voting Classifier Report:
              precision    recall  f1-score   support

          no       0.96      0.99      0.97       687
         yes       0.25      0.06      0.10        34

    accuracy                           0.95       721
   macro avg       0.60      0.53      0.53       721
weighted avg       0.92      0.95      0.93       721



In [56]:

# Create a Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model)
    ],
    final_estimator=LogisticRegression()  # Meta-learner
)

# Fit the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate the Stacking Classifier
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
report_stacking = classification_report(y_test, y_pred_stacking)

print("Stacking Classifier Report:")
print(report_stacking)


Stacking Classifier Report:
              precision    recall  f1-score   support

          no       0.96      0.99      0.97       687
         yes       0.25      0.06      0.10        34

    accuracy                           0.95       721
   macro avg       0.60      0.53      0.53       721
weighted avg       0.92      0.95      0.93       721



In [19]:
# Load the data
df = pd.read_csv("../dataset/updated_bank.csv")
X = df.drop("y", axis=1)
y = df["y"]

# Label Encoding for categorical features
label_enc = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle class imbalance using SMOTE combined with ENN
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

# Initialize models with class weights
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
gb_model = GradientBoostingClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
log_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Voting Classifier as an alternative ensemble method
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft'  # Using soft voting to improve performance
)

# Fit the Voting Classifier
voting_clf.fit(X_train_res, y_train_res)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the Voting Classifier
print("Voting Classifier Report:")
print(classification_report(y_test, y_pred_voting))




Voting Classifier Report:
              precision    recall  f1-score   support

          no       0.97      0.88      0.93       687
         yes       0.18      0.53      0.27        34

    accuracy                           0.87       721
   macro avg       0.58      0.71      0.60       721
weighted avg       0.94      0.87      0.89       721



In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Convert boolean columns to integer (0 and 1)
X['contact_cellular'] = X['contact_cellular'].astype(int)
X['contact_telephone'] = X['contact_telephone'].astype(int)
X['contact_unknown'] = X['contact_unknown'].astype(int)

# Label Encoding for categorical features in X (features)
label_enc = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])

# Label encode the target variable y, if it's categorical (e.g., 'yes', 'no')
if y.dtype == 'object':
    y = label_enc.fit_transform(y)

# Split the dataset after ensuring it's numeric
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle class imbalance using SMOTE combined with ENN
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

# Initialize the XGBoost model with scale_pos_weight to handle class imbalance
pos_class_weight = (len(y_train_res) - sum(y_train_res)) / sum(y_train_res)
xgb_model = XGBClassifier(random_state=42, scale_pos_weight=pos_class_weight)

# Fit the XGBoost model
xgb_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))


XGBoost Classifier Report:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       687
           1       0.17      0.44      0.24        34

    accuracy                           0.87       721
   macro avg       0.57      0.67      0.59       721
weighted avg       0.93      0.87      0.90       721



In [16]:
import lightgbm as lgb
from sklearn.metrics import classification_report

# Initialize the LightGBM model with class_weight='balanced'
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')

# Fit the LightGBM model
lgb_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_lgb = lgb_model.predict(X_test)

# Evaluate the model
print("LightGBM Classifier Report:")
print(classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Number of positive: 1595, number of negative: 1287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001988 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3256
[LightGBM] [Info] Number of data points in the train set: 2882, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Classifier Report:
              precision    recall  f1-score   support

          no       0.97      0.92      0.95       687
         yes       0.20      0.38      0.26        34

    accuracy                           0.90       721
   macro avg       0.58      0.65      0.60       721
weighted avg       0.93      0.90      0.91       721



In [23]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Initialize the CatBoost model with auto_class_weights
cat_model = CatBoostClassifier(random_state=42, auto_class_weights='Balanced', verbose=0)

# Fit the CatBoost model
cat_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_cat = cat_model.predict(X_test)

# Evaluate the model
print("CatBoost Classifier Report:")
print(classification_report(y_test, y_pred_cat))


CatBoost Classifier Report:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       687
           1       0.19      0.50      0.27        34

    accuracy                           0.87       721
   macro avg       0.58      0.70      0.60       721
weighted avg       0.94      0.87      0.90       721



# Hyperparameter Tuning

In [58]:
# Define parameter grid for each model
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'dt__max_depth': [None, 10, 20],
    'log__C': [0.01, 0.1, 1, 10],
}

# Create the Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft'
)

# Set up GridSearchCV
grid_search = GridSearchCV(voting_clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate the best estimator
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Print classification report for the best model
print("Best Voting Classifier Report:")
print(classification_report(y_test, y_pred_best))


Best parameters found:  {'dt__max_depth': None, 'log__C': 0.01, 'rf__max_depth': None, 'rf__n_estimators': 50}
Best cross-validation score:  0.9749884947865377
Best Voting Classifier Report:
              precision    recall  f1-score   support

          no       0.97      0.89      0.93       687
         yes       0.19      0.53      0.28        34

    accuracy                           0.87       721
   macro avg       0.58      0.71      0.60       721
weighted avg       0.94      0.87      0.90       721



## Conclusion
### The model performs well in predicting the no class but struggles significantly with the yes class. The low precision and recall for yes suggest that improvements are needed to enhance the model’s ability to identify the minority class effectively.

In [59]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Oversampling with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Define a more comprehensive parameter grid
param_grid = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'dt__max_depth': [None, 10, 20, 30],
    'log__C': [0.001, 0.01, 0.1, 1, 10],
    'gb__n_estimators': [100, 200],
}

# Voting classifier with weighted estimators
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft',
    weights=[1, 2, 1, 1]  # Adjust weights based on prior performance
)

# Set up GridSearchCV with a broader grid
grid_search = GridSearchCV(voting_clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

# Check results
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Best parameters found:  {'dt__max_depth': 20, 'gb__n_estimators': 200, 'log__C': 0.001, 'rf__max_depth': 20, 'rf__n_estimators': 100}
Best cross-validation score:  0.9712791032992678
