## Comparative Study of Machine Learning Models for Predicting Term Deposit Subscriptions

The bank dataset is a collection of customer data used to predict term deposit subscriptions in marketing campaigns, featuring demographics, financial details, and campaign-related attributes. Random Forest, SVM, Logistic Regression, and an ensemble of Random Forest and Logistic Regression were applied to the dataset. By employing hyperparameter tuning, class imbalance handling, and feature importance analysis, the models' performance was evaluated to balance predictive accuracy, interpretability, and suitability for financial applications, offering insights into customer behavior and improved decision-making strategies.

In [None]:
# Importing the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, classification_report, 
                        confusion_matrix, roc_auc_score, roc_curve, 
                        average_precision_score, precision_recall_curve)
from math import log
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


**EXPLORING THE DATASET**

In [None]:
# Read the semicolon-separated bank CSV and preview the first rows
data = pd.read_csv('bank.csv', sep=';')
data.head()

In [None]:
# Print a summary of the dataset (columns, dtypes and non-null counts)
data.info()

In [None]:
# Column groups used for exploration; the target column 'y' is in neither list
numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
]
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'poutcome',
]

In [None]:
# Share of each class in the target variable (normalize=True gives proportions)
data['y'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of customers in each target class
sns.countplot(data=data, x='y')
plt.xlabel('Subscribed')
plt.ylabel('Number of customers')
plt.title('Variation in target variable classes')
plt.show()

In [None]:
# Pairwise plots of the dataset's features, coloured by the target class 'y'
sns.pairplot(data, hue='y')
plt.show()

In [None]:
# One count plot per categorical feature, split by subscription outcome.
# All subplots are stacked in a single column of one tall figure.

plt.figure(figsize=(30, 100))
for i, col in enumerate(categorical_features):
    plt.subplot(len(categorical_features), 1, i + 1)
    sns.countplot(x=col, hue='y', data=data)
    plt.xlabel(col, fontsize=48)
    plt.ylabel('Frequency', fontsize=48)
    plt.title(f'Subscribers by {col} ', fontsize=50)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)

plt.tight_layout()
plt.show()

**GENERAL DATA PREPROCESSING**

In [None]:
#Converting month abbreviations to their calendar number (jan=1 ... dec=12)
month_map = {'jan':1, 'feb':2, 'mar':3, 'apr': 4, 'may': 5, 'jun':6, 'jul': 7, 'aug':8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Look each month up in month_map directly, so the numbers written in the
# dictionary are what ends up in the column (not the position of its keys).
data['month_int'] = data['month'].map(month_map)

# A label missing from month_map maps to NaN; stop here rather than carry a
# meaningless month value into the models.
unknown_months = data.loc[data['month_int'].isna(), 'month'].unique()
if len(unknown_months) > 0:
    raise ValueError(f"Unrecognised month labels: {list(unknown_months)}")
data['month_int'] = data['month_int'].astype(int)

data['month_int'].value_counts()

In [None]:
# Remove the text 'month' column in place, then re-check the column summary
data.drop(columns=['month'], inplace=True)
data.info()

In [None]:
# Correlation heatmap computed from the numeric columns only
correlation_matrix = data.select_dtypes(include=[np.number]).corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Matrix Heatmap")
plt.show()

### RANDOM FOREST MODEL

**PREPROCESSING FOR RANDOM FOREST**

In [None]:
# Work on a copy so the random forest preprocessing leaves `data` untouched
rf_data = data.copy()
rf_data.head()

In [None]:
# One-hot encode the multi-level categorical columns.
# drop_first=True omits the first level of each encoded column.

rf_data_encoded = pd.get_dummies(
    rf_data,
    columns=['job', 'marital', 'education', 'contact', 'poutcome'],
    drop_first=True,
)
rf_data_encoded.head()

In [None]:
# Encode the yes/no columns (including the target) as 1/0

binary_columns = ['default', 'housing', 'loan', 'y']
rf_data_encoded[binary_columns] = rf_data_encoded[binary_columns].apply(
    lambda column: column.map({'yes': 1, 'no': 0})
)
rf_data_encoded.head()

In [None]:
# Split the encoded random forest dataset into target (y) and features (X)

y_rf = rf_data_encoded['y']
X_rf = rf_data_encoded.drop(columns='y')

In [None]:
# 70/30 train/test split, stratified on the target, with a fixed seed

X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf,
    y_rf,
    test_size=0.3,
    stratify=y_rf,
    random_state=20,
)

In [None]:
# Baseline random forest with balanced class weights, fitted on the training split

rf_model = RandomForestClassifier(class_weight='balanced', random_state=20)
rf_model.fit(X_train_rf, y_train_rf)

In [None]:
# Predict the test split with the baseline random forest and report per-class metrics
y_predict_rf = rf_model.predict(X_test_rf)
print(classification_report(y_test_rf, y_predict_rf))

In [None]:
# Confusion matrix for the baseline random forest (rows: actual, columns: predicted)
rf_conf_matrix = confusion_matrix(y_test_rf, y_predict_rf)
sns.heatmap(
    rf_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# Predicted probability of the positive class (second column of predict_proba)
y_rf_prob = rf_model.predict_proba(X_test_rf)[:, 1]

In [None]:
# ROC-AUC score and ROC curve for the baseline random forest

roc_auc_rf = roc_auc_score(y_test_rf, y_rf_prob)
print(f"RF ROC-AUC Score: {roc_auc_rf}")

# Curve points; the thresholds returned by roc_curve are not needed
fpr, tpr, _ = roc_curve(y_test_rf, y_rf_prob)
plt.plot(fpr, tpr, label=f"RF ROC Curve (AUC = {roc_auc_rf:.2f})")
plt.title('Random forest ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Feature importances of the baseline random forest, most important first
importances_rf = rf_model.feature_importances_
feature_names_rf = X_rf.columns
importance_rf_df = pd.DataFrame(
    {'Feature_rf': feature_names_rf, 'Importance_rf': importances_rf}
).sort_values(by='Importance_rf', ascending=False)
importance_rf_df

In [None]:
# Horizontal bar chart of the baseline random forest feature importances
sns.barplot(data=importance_rf_df, x='Importance_rf', y='Feature_rf')
plt.title('Random Forest Feature Importance')
plt.show()

**Applying Hyperparameter Tuning to Random Forest**

In [None]:
# Hyperparameter grid explored for the random forest (3 x 3 x 3 x 3 = 81 candidates)
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    }

# Exhaustive 3-fold grid search over param_grid_rf, starting from rf_model.
# n_jobs=-1 runs the candidate fits in parallel on all available cores, in
# line with the other grid searches in this notebook; with n_jobs=1 the
# 243 fits ran one after another.
# NOTE(review): candidates are ranked by accuracy although the models are
# built with class_weight='balanced' - confirm accuracy is the intended
# selection metric for this target.
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train_rf, y_train_rf)

print(f"Best Parameters: {grid_search_rf.best_params_}")
print(f"Best Score: {grid_search_rf.best_score_}")


In [None]:
# Predict the test split with the grid-searched random forest and report per-class metrics
y_rftune_predict = grid_search_rf.predict(X_test_rf)
print(classification_report(y_test_rf, y_rftune_predict))

In [None]:
# Confusion matrix for the tuned random forest (rows: actual, columns: predicted)
rftuned_conf_matrix = confusion_matrix(y_test_rf, y_rftune_predict)
sns.heatmap(
    rftuned_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('CV Random Forest Confusion Matrix')
plt.show()

In [None]:
# Positive-class probabilities from the grid-searched random forest
y_rftuned_prob = grid_search_rf.predict_proba(X_test_rf)[:, 1]

In [None]:
# ROC-AUC score and ROC curve for the tuned random forest

roc_auc_rftuned = roc_auc_score(y_test_rf, y_rftuned_prob)
print(f"CV RF ROC-AUC Score: {roc_auc_rftuned}")

# Curve points; the thresholds returned by roc_curve are not needed
fpr_rftune, tpr_rftune, _ = roc_curve(y_test_rf, y_rftuned_prob)
plt.plot(fpr_rftune, tpr_rftune, label=f"CV RF ROC Curve (AUC = {roc_auc_rftuned:.2f})")
plt.title('CV Random forest ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Feature importances of the tuned random forest, most important first

# The refitted winner of the grid search
best_estimator_rf = grid_search_rf.best_estimator_

# Pair each importance with its column name and sort in one step
importances_cv_rf = best_estimator_rf.feature_importances_
feature_names_cv_rf = X_rf.columns
importance_cv_rf_df = pd.DataFrame(
    {'Feature_cv_rf': feature_names_cv_rf, 'Importance_cv_rf': importances_cv_rf}
).sort_values(by='Importance_cv_rf', ascending=False)
importance_cv_rf_df

In [None]:
# Horizontal bar chart of the tuned random forest feature importances
sns.barplot(data=importance_cv_rf_df, x='Importance_cv_rf', y='Feature_cv_rf')
plt.title('CV Random Forest Feature Importance')
plt.show()

### LOGISTIC REGRESSION MODEL

**PREPROCESSING FOR LOGISTIC REGRESSION**

In [None]:
# Work on a copy so the logistic regression steps leave `data` untouched
lr_data = data.copy()
lr_data.head()

In [None]:
# Split the logistic regression dataset into features (X) and target (y)

# Every column except the target
X_lr = lr_data.drop(columns='y')

# Target as integers: 1 where the label is 'yes', 0 for anything else
y_lr = lr_data['y'].eq('yes').astype('int64')

In [None]:
# Columns to scale (numeric) and to one-hot encode (categorical) for logistic regression
lr_numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'month_int',
]
lr_categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'poutcome',
]

In [None]:
# Scaling numerical features and one-hot encoding categorical features

lr_numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' is set so categories unseen during fit do not raise
lr_categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

lr_preprocessor = ColumnTransformer(
    transformers=[
        ('num', lr_numerical_transformer, lr_numerical_features),
        ('cat', lr_categorical_transformer, lr_categorical_features),
    ]
)

In [None]:
# 70/30 train/test split of the logistic regression data, stratified on the target

X_lr_train, X_lr_test, y_lr_train, y_lr_test = train_test_split(
    X_lr,
    y_lr,
    test_size=0.3,
    stratify=y_lr,
    random_state=20,
)

In [None]:
# Logistic regression with balanced class weights and the liblinear solver

logistic_model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=20,
)

In [None]:
# Chain preprocessing and the classifier so both are fitted together

lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', lr_preprocessor),
        ('classifier', logistic_model),
    ]
)

In [None]:
# Fit the preprocessing + logistic regression pipeline on the training split
lr_pipeline.fit(X_lr_train, y_lr_train)

In [None]:
# Evaluate the logistic regression pipeline on the test split

# Hard class predictions and their per-class report
y_lr_predict = lr_pipeline.predict(X_lr_test)
print(classification_report(y_lr_test, y_lr_predict))

# Positive-class probabilities, used later for ROC-AUC
y_lr_prob = lr_pipeline.predict_proba(X_lr_test)[:, 1]

In [None]:
# Confusion matrix for the logistic regression (rows: actual, columns: predicted)
lr_conf_matrix = confusion_matrix(y_lr_test, y_lr_predict)
sns.heatmap(
    lr_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# ROC-AUC score and ROC curve for the logistic regression
lr_roc_auc = roc_auc_score(y_lr_test, y_lr_prob)
print(f"LR ROC-AUC Score: {lr_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
fpr_lr, tpr_lr, _ = roc_curve(y_lr_test, y_lr_prob)
plt.plot(fpr_lr, tpr_lr, label=f"LR ROC Curve (AUC = {lr_roc_auc:.2f})")
plt.title('Logistic Regression ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Recover the feature names produced by the fitted preprocessor.
# One-hot column names come from the fitted 'cat' transformer.
lr_categorical_feature_new = (
    lr_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(lr_categorical_features)
)

# Numeric names first, then one-hot names: the same order as the
# transformers are listed in the ColumnTransformer
all_lr_feature_names = np.concatenate(
    [lr_numerical_features, lr_categorical_feature_new]
)

In [None]:
# Extract the coefficients and rank features by absolute coefficient

lr_coefficients = lr_pipeline.named_steps['classifier'].coef_[0]

lr_feature_importance = pd.DataFrame(
    {
        'Feature_lr': all_lr_feature_names,
        'Coefficient_lr': lr_coefficients,
        'Absolute Coefficient_lr': np.abs(lr_coefficients),
    }
).sort_values(by='Absolute Coefficient_lr', ascending=False)
lr_feature_importance


In [None]:
# Horizontal bar chart of absolute logistic regression coefficients
plt.figure(figsize=(12, 8))
sns.barplot(data=lr_feature_importance, x='Absolute Coefficient_lr', y='Feature_lr')
plt.title('Logistic Regression Feature Importance')
plt.show()

**Applying Hyperparameter Tuning to the Logistic Regression**

In [None]:
# Grid for the 'classifier' step of the pipeline (hence the classifier__ prefix)
lr_param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.1, 1, 10, 100],
}

In [None]:
# 5-fold grid search over lr_param_grid, run on all available cores

lr_grid_search = GridSearchCV(
    lr_pipeline,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lr_grid_search.fit(X_lr_train, y_lr_train)

In [None]:
# Best parameter combination and its mean cross-validated score

print(f"Best Parameters: {lr_grid_search.best_params_}")
print(f"Best Score: {lr_grid_search.best_score_}")

In [None]:
# Evaluate the best pipeline found by the grid search on the test split
lr_best_model = lr_grid_search.best_estimator_
y_lr_best_predict = lr_best_model.predict(X_lr_test)
# Positive-class probabilities, used later for ROC-AUC
y_lr_best_prob = lr_best_model.predict_proba(X_lr_test)[:, 1]

print(classification_report(y_lr_test, y_lr_best_predict))

In [None]:
# Confusion matrix for the tuned logistic regression (rows: actual, columns: predicted)
lrtune_cm = confusion_matrix(y_lr_test, y_lr_best_predict)
sns.heatmap(
    lrtune_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('CV Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# ROC-AUC score and ROC curve for the tuned logistic regression
lr_cv_roc_auc = roc_auc_score(y_lr_test, y_lr_best_prob)
print(f"CV LR ROC-AUC Score: {lr_cv_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
lr_cv_fpr, lr_cv_tpr, _ = roc_curve(y_lr_test, y_lr_best_prob)
plt.plot(lr_cv_fpr, lr_cv_tpr, label=f"CV LR ROC Curve (AUC = {lr_cv_roc_auc:.2f})")
plt.title('CV Logistic Regression ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Extract the tuned model's coefficients and rank features by absolute coefficient

cv_lr_coefficients = lr_best_model.named_steps['classifier'].coef_[0]

cv_lr_feature_importance = pd.DataFrame(
    {
        'Feature_cv_lr': all_lr_feature_names,
        'Coefficient_cv_lr': cv_lr_coefficients,
        'Absolute Coefficient_cv_lr': np.abs(cv_lr_coefficients),
    }
).sort_values(by='Absolute Coefficient_cv_lr', ascending=False)
cv_lr_feature_importance

In [None]:
# Horizontal bar chart of absolute coefficients for the tuned logistic regression
plt.figure(figsize=(12, 8))
sns.barplot(data=cv_lr_feature_importance, x='Absolute Coefficient_cv_lr', y='Feature_cv_lr')
plt.title('CV Logistic Regression Feature Importance')
plt.show()

### SVM MODEL

In [None]:
# Work on a copy so the SVM steps leave `data` untouched
svm_data = data.copy()
svm_data.head()

In [None]:
# Split the SVM dataset into features (X) and target (y)

# Every column except the target
X_svm = svm_data.drop(columns='y')

# Target as integers: 1 where the label is 'yes', 0 for anything else
y_svm = svm_data['y'].eq('yes').astype('int64')

In [None]:
# Columns to scale (numeric) and to one-hot encode (categorical) for the SVM
svm_numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'month_int',
]
svm_categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'poutcome',
]

In [None]:
# Scaling numerical features and one-hot encoding categorical features

svm_numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' is set so categories unseen during fit do not raise
svm_categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

svm_preprocessor = ColumnTransformer(
    transformers=[
        ('num', svm_numerical_transformer, svm_numerical_features),
        ('cat', svm_categorical_transformer, svm_categorical_features),
    ]
)

In [None]:
# Splitting the SVM dataset (70/30, fixed seed).
# Stratify on the SVM target itself: the split previously used y_lr, which
# made this cell depend on the logistic regression section having been run.

X_svm_train, X_svm_test, y_svm_train, y_svm_test = train_test_split(X_svm, y_svm, test_size=0.3, random_state=20, stratify=y_svm)

In [None]:
# Linear-kernel SVM with balanced class weights.
# probability=True is required for the predict_proba calls made later.

svm_model = SVC(
    kernel='linear',
    probability=True,
    class_weight='balanced',
    random_state=20,
)

In [None]:
# Chain preprocessing and the SVM so both are fitted together
svm_pipeline = Pipeline(steps=[('preprocessor', svm_preprocessor), ('svm', svm_model)])

In [None]:
# Fit the preprocessing + SVM pipeline on the training split
svm_pipeline.fit(X_svm_train, y_svm_train)

In [None]:
# Evaluate the SVM pipeline on the test split

# Hard class predictions and their per-class report
y_svm_predict = svm_pipeline.predict(X_svm_test)
print(classification_report(y_svm_test, y_svm_predict))

# Positive-class probabilities, used later for ROC-AUC
y_svm_prob = svm_pipeline.predict_proba(X_svm_test)[:, 1]

In [None]:
# Confusion matrix for the SVM (rows: actual, columns: predicted)
svm_conf_matrix = confusion_matrix(y_svm_test, y_svm_predict)
sns.heatmap(
    svm_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('SVM Confusion Matrix')
plt.show()

In [None]:

# ROC-AUC score and ROC curve for the SVM
svm_roc_auc = roc_auc_score(y_svm_test, y_svm_prob)
print(f"SVM ROC-AUC Score: {svm_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
fpr_svm, tpr_svm, _ = roc_curve(y_svm_test, y_svm_prob)
plt.plot(fpr_svm, tpr_svm, label=f"SVM ROC Curve (AUC = {svm_roc_auc:.2f})")
plt.title('SVM ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Recover the feature names produced by the fitted preprocessor.
# One-hot column names come from the fitted 'cat' transformer.
svm_categorical_feature_new = (
    svm_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(svm_categorical_features)
)

# Numeric names first, then one-hot names: the same order as the
# transformers are listed in the ColumnTransformer
all_svm_feature_names = np.concatenate(
    [svm_numerical_features, svm_categorical_feature_new]
)

In [None]:
# Extract the SVM coefficients and rank features by absolute coefficient

svm_coefficients = svm_pipeline.named_steps['svm'].coef_[0]

svm_feature_importance = pd.DataFrame(
    {
        'Feature_svm': all_svm_feature_names,
        'Coefficient_svm': svm_coefficients,
        'Absolute Coefficient_svm': np.abs(svm_coefficients),
    }
).sort_values(by='Absolute Coefficient_svm', ascending=False)
svm_feature_importance

In [None]:
# Horizontal bar chart of absolute SVM coefficients
plt.figure(figsize=(12, 8))
sns.barplot(data=svm_feature_importance, x='Absolute Coefficient_svm', y='Feature_svm')
plt.title('SVM Feature Importance')
plt.show()

**Applying Hyperparameter Tuning to the SVM Model**

In [None]:
# Grid for the 'svm' step of the pipeline: only the C parameter is tuned
svm_param_grid = {'svm__C': [0.1, 1, 10, 100]}

In [None]:
# Parameter grid for the stacked ensemble's grid search (defined here, used
# in the ensemble tuning section). Keys follow the
# <estimator name>__<pipeline step>__<parameter> convention.
stack_param_grid = {
    # Random forest step inside the 'rf' base pipeline
    'rf__rf__n_estimators': [100, 200, 300],
    'rf__rf__max_depth': [10, 20, 30],

    # Logistic regression step inside the 'lr' base pipeline
    'lr__lr__C': [0.1, 1, 10],

    # Meta-learner that combines the base models
    'final_estimator__C': [0.1, 1, 10],
}

In [None]:
# 3-fold grid search over svm_param_grid, run on all available cores
svm_grid_search = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
svm_grid_search.fit(X_svm_train, y_svm_train)

In [None]:
# Best parameter combination and its mean cross-validated score

print(f"SVM Best Parameters: {svm_grid_search.best_params_}")
print(f"SVM Best Score: {svm_grid_search.best_score_}")

In [None]:
# Evaluate the best pipeline found by the SVM grid search on the test split

svm_best_model = svm_grid_search.best_estimator_
y_svm_best_pred = svm_best_model.predict(X_svm_test)
# Positive-class probabilities, used later for ROC-AUC
y_svm_best_prob = svm_best_model.predict_proba(X_svm_test)[:, 1]

print(classification_report(y_svm_test, y_svm_best_pred))

In [None]:
# Confusion matrix for the tuned SVM (rows: actual, columns: predicted)
svm_cv_cm = confusion_matrix(y_svm_test, y_svm_best_pred)
sns.heatmap(
    svm_cv_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('CV SVM Confusion Matrix')
plt.show()

In [None]:
# ROC-AUC score and ROC curve for the tuned SVM
svm_cv_roc_auc = roc_auc_score(y_svm_test, y_svm_best_prob)
print(f"CV SVM ROC-AUC Score: {svm_cv_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
svm_cv_fpr, svm_cv_tpr, _ = roc_curve(y_svm_test, y_svm_best_prob)
plt.plot(svm_cv_fpr, svm_cv_tpr, label=f"CV SVM ROC Curve (AUC = {svm_cv_roc_auc:.2f})")
plt.title('CV SVM ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Extract the tuned SVM's coefficients and rank features by absolute coefficient

cv_svm_coefficients = svm_best_model.named_steps['svm'].coef_[0]

cv_svm_feature_importance = pd.DataFrame(
    {
        'Feature_cv_svm': all_svm_feature_names,
        'Coefficient_cv_svm': cv_svm_coefficients,
        'Absolute Coefficient_cv_svm': np.abs(cv_svm_coefficients),
    }
).sort_values(by='Absolute Coefficient_cv_svm', ascending=False)
cv_svm_feature_importance

In [None]:
# Horizontal bar chart of absolute coefficients for the tuned SVM
plt.figure(figsize=(12, 8))
sns.barplot(data=cv_svm_feature_importance, x='Absolute Coefficient_cv_svm', y='Feature_cv_svm')
plt.title('CV SVM Feature Importance')
plt.show()

### STACKED RANDOM FOREST AND LOGISTIC REGRESSION ENSEMBLE

In [None]:
# Work on a copy so the ensemble steps leave `data` untouched
en_data = data.copy()
en_data.head()

In [None]:
# Split the ensemble dataset into features (X) and target (y)

# Every column except the target
X_en = en_data.drop(columns='y')

# Target as integers: 1 where the label is 'yes', 0 for anything else
y_en = en_data['y'].eq('yes').astype('int64')

In [None]:
# Columns to scale (numeric) and to one-hot encode (categorical) for the ensemble
en_numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'month_int',
]
en_categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'poutcome',
]

In [None]:
# Scaling numerical features and one-hot encoding categorical features

en_numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' is set so categories unseen during fit do not raise
en_categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

en_preprocessor = ColumnTransformer(
    transformers=[
        ('num_en', en_numerical_transformer, en_numerical_features),
        ('cat_en', en_categorical_transformer, en_categorical_features),
    ]
)

In [None]:
# Pipelines for the two base models of the stacked ensemble.
# Each pipeline applies the ensemble preprocessor before its own classifier.

# NOTE(review): unlike the standalone random forest, this one is built without
# class_weight='balanced' - confirm that is intentional.
random_forest_model = Pipeline(steps=[
    ('en_preprocessor', en_preprocessor),
     ('rf', RandomForestClassifier(n_estimators =100, random_state=20))
     ])

# Build a dedicated logistic regression here rather than reusing the
# `logistic_model` object from the logistic regression section, so this
# section can run without that section. The settings are the same as that
# model's (balanced class weights, liblinear solver, seed 20).
logistic_regression_model = Pipeline(steps=[
    ('en_preprocessor', en_preprocessor),
     ('lr', LogisticRegression(class_weight='balanced', solver='liblinear', random_state=20))
    ])

In [None]:
# Stack the two base pipelines; a balanced logistic regression is the meta-learner

stacking_clf = StackingClassifier(
    estimators=[('rf', random_forest_model), ('lr', logistic_regression_model)],
    final_estimator=LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        random_state=20,
    ),
)


In [None]:
# 70/30 train/test split of the ensemble data, stratified on the target
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(
    X_en,
    y_en,
    test_size=0.3,
    stratify=y_en,
    random_state=20,
)

# Fit the stacked model on the training split
stacking_clf.fit(X_train_en, y_train_en)

In [None]:
# Predict the test split with the stacked ensemble and report per-class metrics
y_stack_predict = stacking_clf.predict(X_test_en)

print(classification_report(y_test_en, y_stack_predict))

In [None]:
# Confusion matrix for the stacked ensemble (rows: actual, columns: predicted)
stack_conf_matrix = confusion_matrix(y_test_en, y_stack_predict)
sns.heatmap(
    stack_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Stacked Ensemble Confusion Matrix')
plt.show()

In [None]:
# Positive-class probabilities from the stacked ensemble
y_en_stack_prob = stacking_clf.predict_proba(X_test_en)[:, 1]

# ROC-AUC score and ROC curve for the stacked ensemble
en_roc_auc = roc_auc_score(y_test_en, y_en_stack_prob)
print(f"Stacked_en_ROC-AUC Score: {en_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
en_fpr, en_tpr, _ = roc_curve(y_test_en, y_en_stack_prob)
plt.plot(en_fpr, en_tpr, label=f"ROC Curve (AUC = {en_roc_auc:.2f})")
plt.title('Stacked Ensembled ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

**Applying Hyperparameter Tuning to the Ensemble Model**

In [None]:
# Train/test split for the tuned ensemble; the arguments are the same as for
# the untuned ensemble's split (same data, test size, seed and stratification)
X_train_ent, X_test_ent, y_train_ent, y_test_ent = train_test_split(
    X_en,
    y_en,
    test_size=0.3,
    stratify=y_en,
    random_state=20,
)

In [None]:
# 3-fold grid search over stack_param_grid for the stacked ensemble
stack_grid_search = GridSearchCV(
    estimator=stacking_clf,
    param_grid=stack_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Fit on the training split
stack_grid_search.fit(X_train_ent, y_train_ent)

# Best parameter combination and its mean cross-validated score
print(f"Best Parameters: {stack_grid_search.best_params_}")
print(f"Best Score: {stack_grid_search.best_score_}")

In [None]:
# Evaluate the best stacked model found by the grid search on the test split
stack_best_model = stack_grid_search.best_estimator_
stack_best_predict = stack_best_model.predict(X_test_ent)
# Positive-class probabilities, used later for ROC-AUC
stack_best_prob = stack_best_model.predict_proba(X_test_ent)[:, 1]


print(classification_report(y_test_ent, stack_best_predict))

In [None]:
# Confusion matrix for the tuned stacked ensemble (rows: actual, columns: predicted)
stackgrid_conf_matrix = confusion_matrix(y_test_ent, stack_best_predict)
sns.heatmap(
    stackgrid_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('CV Stacked Ensemble Confusion Matrix')
plt.show()

In [None]:

# ROC-AUC score and ROC curve for the tuned stacked ensemble
ent_roc_auc = roc_auc_score(y_test_ent, stack_best_prob)
print(f"ROC-AUC Score: {ent_roc_auc}")

# Curve points; the thresholds returned by roc_curve are not needed
ent_fpr, ent_tpr, _ = roc_curve(y_test_ent, stack_best_prob)
plt.plot(ent_fpr, ent_tpr, label=f"ROC Curve (AUC = {ent_roc_auc:.2f})")
plt.title('CV Stacked Ensembled ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()