# Model Evaluation and Hyperparameter Tuning Project

In this project, we evaluate multiple machine learning models on a classification task and employ techniques like resampling to handle class imbalance. We compare the models based on accuracy, precision, recall, and F1 score.

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier  # Importing XGBClassifier
import lightgbm as lgb  # Importing LightGBM
from lightgbm import LGBMClassifier
import warnings

warnings.filterwarnings('ignore')
df = pd.read_csv("../dataset/updated_bank.csv");

## Data Preparation

The data is split into training and test sets. To handle class imbalance, we use **SMOTE combined with Edited Nearest Neighbors (ENN)**, an oversampling technique that creates synthetic samples and removes overlapping samples.

In [2]:
X = df.drop("y", axis=1);
y = df["y"]

label_enc = LabelEncoder()

for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [3]:
# Handle class imbalance using SMOTE combined with ENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)
print(f"Resampled training set has {len(y_train_res)} samples with {sum(y_train_res == 1)} positives and {sum(y_train_res == 0)} negatives.")

Resampled training set has 2882 samples with 0 positives and 0 negatives.


## Model Training

We initialize six models and train them on the resampled training data. The models include Random Forest, Gradient Boosting, Decision Tree, XGBoost, LightGBM, and CatBoost.

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Initialize models
rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()
dt_model = DecisionTreeClassifier()
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
lgbm_model = LGBMClassifier()
catboost_model = CatBoostClassifier(verbose=0)

# Fit models on resampled training set
rf_model.fit(X_train_res, y_train_res)
gb_model.fit(X_train_res, y_train_res)
dt_model.fit(X_train_res, y_train_res)
xgb_model.fit(X_train_res, y_train_res)
lgbm_model.fit(X_train_res, y_train_res)
catboost_model.fit(X_train_res, y_train_res)

# Dictionary of models
models = {
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'Decision Tree': dt_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model,
    'CatBoost': catboost_model
}

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['no' 'yes']

## Model Evaluation

We define a function to evaluate each model on the test set, calculating accuracy, precision, recall, and F1 score for the positive class. The results will help us compare each model's effectiveness after addressing class imbalance with SMOTE-ENN.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model_performance(models, X_test, y_test):
    # Initialize a dictionary to store metrics for each model
    performance_data = {
        'Model': [],
        'Accuracy': [],
        'Precision (Yes)': [],
        'Recall (Yes)': [],
        'F1 Score (Yes)': []
    }

    for model_name, model in models.items():
        # Generate predictions
        y_pred = model.predict(X_test)

        # Calculate metrics with integer label for positive class
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=1)
        recall = recall_score(y_test, y_pred, pos_label=1)
        f1 = f1_score(y_test, y_pred, pos_label=1)

        # Append the metrics to the performance data
        performance_data['Model'].append(model_name)
        performance_data['Accuracy'].append(accuracy)
        performance_data['Precision (Yes)'].append(precision)
        performance_data['Recall (Yes)'].append(recall)
        performance_data['F1 Score (Yes)'].append(f1)

    # Convert the dictionary to a DataFrame for easy viewing
    performance_df = pd.DataFrame(performance_data)
    return performance_df

# Evaluate and display performance summary
performance_summary = evaluate_model_performance(models, X_test, y_test)
performance_summary

## Conclusions

After handling class imbalance with SMOTE-ENN, the models have become more capable of detecting the positive class. The following conclusions can be drawn:

- **Accuracy** slightly decreased due to resampling but still remains relatively high.
- **Recall** improved significantly across models, particularly in Gradient Boosting and LightGBM.
- **Precision** is lower due to the trade-off with recall, which is expected with SMOTE-ENN.

**LightGBM** and **CatBoost** offer the best balance of precision, recall, and F1 score.

Further tuning of thresholds or ensembling may enhance precision while maintaining recall.

In [None]:
# Model 1: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


conf_matrix_random_forest = confusion_matrix(y_test, y_pred_rf);conf_matrix_random_forest

In [None]:
report_random_forest = classification_report(y_test, y_pred_rf)

print(report_random_forest)

In [None]:
# Model 2: Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

conf_matrix_gradient_boosting = confusion_matrix(y_test, y_pred_gb);conf_matrix_gradient_boosting

In [None]:
report_gradient_boosting = classification_report(y_test, y_pred_gb)

print(report_gradient_boosting)

In [None]:
# Model 3: Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

conf_matrix_decision_tree = confusion_matrix(y_test, y_pred_dt)
report_decision_tree = classification_report(y_test, y_pred_dt)

print("Decision Tree Classifier Report:")
print(report_decision_tree)


 # Ensemble

In [None]:
# Create a Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model)
    ],
    voting='hard'  # Use 'soft' for probabilistic voting if desired
)

# Fit the Voting Classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the Voting Classifier
conf_matrix_voting = confusion_matrix(y_test, y_pred_voting)
report_voting = classification_report(y_test, y_pred_voting)

print("Voting Classifier Report:")
print(report_voting)


In [None]:

# Create a Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model)
    ],
    final_estimator=LogisticRegression()  # Meta-learner
)

# Fit the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate the Stacking Classifier
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
report_stacking = classification_report(y_test, y_pred_stacking)

print("Stacking Classifier Report:")
print(report_stacking)


In [None]:
# Load the data
df = pd.read_csv("../dataset/updated_bank.csv")
X = df.drop("y", axis=1)
y = df["y"]

# Label Encoding for categorical features
label_enc = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle class imbalance using SMOTE combined with ENN
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

# Initialize models with class weights
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
gb_model = GradientBoostingClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
log_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Voting Classifier as an alternative ensemble method
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft'  # Using soft voting to improve performance
)

# Fit the Voting Classifier
voting_clf.fit(X_train_res, y_train_res)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the Voting Classifier
print("Voting Classifier Report:")
print(classification_report(y_test, y_pred_voting))




In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Convert boolean columns to integer (0 and 1)
X['contact_cellular'] = X['contact_cellular'].astype(int)
X['contact_telephone'] = X['contact_telephone'].astype(int)
X['contact_unknown'] = X['contact_unknown'].astype(int)

# Label Encoding for categorical features in X (features)
label_enc = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_enc.fit_transform(X[column])

# Label encode the target variable y, if it's categorical (e.g., 'yes', 'no')
if y.dtype == 'object':
    y = label_enc.fit_transform(y)

# Split the dataset after ensuring it's numeric
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle class imbalance using SMOTE combined with ENN
smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

# Initialize the XGBoost model with scale_pos_weight to handle class imbalance
pos_class_weight = (len(y_train_res) - sum(y_train_res)) / sum(y_train_res)
xgb_model = XGBClassifier(random_state=42, scale_pos_weight=pos_class_weight)

# Fit the XGBoost model
xgb_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))


In [None]:
import lightgbm as lgb
from sklearn.metrics import classification_report

# Initialize the LightGBM model with class_weight='balanced'
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')

# Fit the LightGBM model
lgb_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_lgb = lgb_model.predict(X_test)

# Evaluate the model
print("LightGBM Classifier Report:")
print(classification_report(y_test, y_pred_lgb))


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Initialize the CatBoost model with auto_class_weights
cat_model = CatBoostClassifier(random_state=42, auto_class_weights='Balanced', verbose=0)

# Fit the CatBoost model
cat_model.fit(X_train_res, y_train_res)

# Make predictions
y_pred_cat = cat_model.predict(X_test)

# Evaluate the model
print("CatBoost Classifier Report:")
print(classification_report(y_test, y_pred_cat))


# Hyperparameter Tuning

In [None]:
# Define parameter grid for each model
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'dt__max_depth': [None, 10, 20],
    'log__C': [0.01, 0.1, 1, 10],
}

# Create the Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft'
)

# Set up GridSearchCV
grid_search = GridSearchCV(voting_clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate the best estimator
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Print classification report for the best model
print("Best Voting Classifier Report:")
print(classification_report(y_test, y_pred_best))


## Conclusion
### The model performs well in predicting the no class but struggles significantly with the yes class. The low precision and recall for yes suggest that improvements are needed to enhance the model’s ability to identify the minority class effectively.

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Oversampling with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Define a more comprehensive parameter grid
param_grid = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'dt__max_depth': [None, 10, 20, 30],
    'log__C': [0.001, 0.01, 0.1, 1, 10],
    'gb__n_estimators': [100, 200],
}

# Voting classifier with weighted estimators
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('log', log_model)
    ],
    voting='soft',
    weights=[1, 2, 1, 1]  # Adjust weights based on prior performance
)

# Set up GridSearchCV with a broader grid
grid_search = GridSearchCV(voting_clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

# Check results
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


# Test Set


In [None]:


# Initialize models
rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()
dt_model = DecisionTreeClassifier()
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
lgbm_model = LGBMClassifier()
catboost_model = CatBoostClassifier(verbose=0)

# Fit models (assuming you have X_train, y_train)
rf_model.fit(X_train_res, y_train_res)
gb_model.fit(X_train_res, y_train_res)
dt_model.fit(X_train_res, y_train_res)
xgb_model.fit(X_train_res,y_train_res)
lgbm_model.fit(X_train_res, y_train_res)
catboost_model.fit(X_train_res, y_train_res)

# Dictionary of models
models = {
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Decision Tree": dt_model,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model,
    "CatBoost": catboost_model
}

# Evaluation function
def evaluate_model_performance(models, X_test, y_test):
    # Initialize a dictionary to store metrics for each model
    performance_data = {
        "Model": [],
        "Accuracy": [],
        "Precision (Yes)": [],
        "Recall (Yes)": [],
        "F1 Score (Yes)": []
    }

    for model_name, model in models.items():
        # Generate predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=1)
        recall = recall_score(y_test, y_pred, pos_label=1)
        f1 = f1_score(y_test, y_pred, pos_label=1)

        # Append the metrics to the performance data
        performance_data["Model"].append(model_name)
        performance_data["Accuracy"].append(accuracy)
        performance_data["Precision (Yes)"].append(precision)
        performance_data["Recall (Yes)"].append(recall)
        performance_data["F1 Score (Yes)"].append(f1)

    # Convert the dictionary to a DataFrame for easy viewing
    performance_df = pd.DataFrame(performance_data)
    return performance_df

# Evaluate and display performance summary
performance_summary = evaluate_model_performance(models, X_test, y_test)
print(performance_summary)
