# ML Pipeline: Ensemble Voting Classifier for Binary Classification

## 🔍 Overview

This notebook implements a robust machine learning pipeline for binary classification using an ensemble of XGBoost, LightGBM, and CatBoost models. The pipeline includes data preprocessing, model training, and performance evaluation.

### ⚠️ Execution Time

The complete execution of this notebook may take up to **10-15 minutes** due to the training of multiple complex models.

### 🔑 Key Components:

1. **Data Loading**: Train and test data from CSV files
2. **Preprocessing**:
   - Handling of categorical and numerical features
   - Imputation of missing values
   - Feature scaling using RobustScaler
3. **Models**:
   - XGBoost
   - LightGBM
   - CatBoost
4. **Ensemble Method**: Soft Voting Classifier
5. **Evaluation Metrics**:
   - Accuracy, Precision, Recall, F1 Score
   - ROC AUC Score
   - Confusion Matrix

### 📊 Workflow:

1. Load and preprocess data
2. Train individual models (XGBoost, LightGBM, CatBoost)
3. Create and train Voting Classifier
4. Make predictions on test set
5. Evaluate model performance
6. Save trained model and preprocessing objects

### 💾 Outputs:

- Printed performance metrics
- Saved model files:
  - `voting_classifier_model.joblib`
  - `cat_imputer.joblib`
  - `num_imputer.joblib`
  - `scaler.joblib`

### 🛠 Usage:

1. Ensure all required libraries are installed
2. Run all cells sequentially
3. Review the printed performance metrics
4. Use the saved model files for future predictions


In [2]:
# Requirements: install the third-party packages imported by the cells below.
!pip install scikit-learn catboost xgboost lightgbm joblib



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import joblib

import warnings
warnings.filterwarnings('ignore')

# Read the feature and target tables for the train and test splits and discard
# the 'ID' column from each one as it is loaded.
# NOTE(review): after 'ID' is dropped, features and targets are paired purely by
# row position - assumes the X and Y files list rows in the same order; confirm.
X_train = pd.read_csv('Train_60/X_Train_Data_Input.csv').drop(columns='ID')
Y_train = pd.read_csv('Train_60/Y_Train_Data_Target.csv').drop(columns='ID')
X_test = pd.read_csv('Test_20/X_Test_Data_Input.csv').drop(columns='ID')
Y_test = pd.read_csv('Test_20/Y_Test_Data_Target.csv').drop(columns='ID')

# Feature groups. Each group is imputed with its own strategy; only the
# numerical group is scaled.
categorical_columns = [
    'Column0', 'Column1', 'Column2', 'Column3', 'Column4',
    'Column10', 'Column11', 'Column12', 'Column13',
    'Column16', 'Column17', 'Column18', 'Column19', 'Column20', 'Column21',
]
numerical_columns = [
    'Column5', 'Column6', 'Column7', 'Column8', 'Column9',
    'Column14', 'Column15',
]

# Preprocessing objects: mode imputation for the categorical group, median
# imputation followed by robust scaling for the numerical group.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')
scaler = RobustScaler()

# Training split: fit every transformer here and transform in the same step.
train_cat_values = cat_imputer.fit_transform(X_train[categorical_columns])
train_num_imputed = num_imputer.fit_transform(X_train[numerical_columns])
train_num_values = scaler.fit_transform(train_num_imputed)
X_train_cat = pd.DataFrame(train_cat_values, columns=categorical_columns)
X_train_num = pd.DataFrame(train_num_values, columns=numerical_columns)
# Final feature order is categorical columns first, then numerical columns.
X_train_preprocessed = pd.concat([X_train_cat, X_train_num], axis=1)

# Test split: reuse the transformers fitted above (transform only, no refit).
test_cat_values = cat_imputer.transform(X_test[categorical_columns])
test_num_imputed = num_imputer.transform(X_test[numerical_columns])
test_num_values = scaler.transform(test_num_imputed)
X_test_cat = pd.DataFrame(test_cat_values, columns=categorical_columns)
X_test_num = pd.DataFrame(test_num_values, columns=numerical_columns)
X_test_preprocessed = pd.concat([X_test_cat, X_test_num], axis=1)

# Base models for the ensemble. The hyperparameters are kept in one dict per
# model and unpacked into the constructor; the original author describes these
# values as the best parameters (the tuning run is not part of this notebook).
xgb_params = {
    'n_estimators': 762,
    'learning_rate': 0.06750561675312018,
    'max_depth': 9,
    'min_child_weight': 6,
    'subsample': 0.6271839566577789,
    'colsample_bytree': 0.7746518755079177,
    'gamma': 4.109529387719718,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)

lgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.015263583672650079,
    'num_leaves': 417,
    'max_depth': 8,
    'subsample': 0.9977381407918132,
    'colsample_bytree': 0.8023937905666044,
}
lgb_model = LGBMClassifier(**lgb_params)

cat_params = {
    'iterations': 1079,
    'learning_rate': 0.030348621362567964,
    'depth': 8,
    'l2_leaf_reg': 2.54042633051254e-05,
    'bagging_temperature': 0.7404633768033837,
    'random_strength': 1.9367791836957116e-06,
    'verbose': 0,  # silence CatBoost's per-iteration training log
}
cat_model = CatBoostClassifier(**cat_params)

# Build the soft-voting ensemble over the three base models (soft voting
# averages the models' predicted class probabilities).
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model)
    ],
    voting='soft'
)

# The target frame holds a single column once 'ID' is dropped; hand the
# estimators a 1-D Series instead of a column-vector DataFrame.
y_train_1d = Y_train.squeeze(axis=1)

# VotingClassifier.fit trains its own clones of the estimators it was given,
# so the base models are NOT fitted separately beforehand: doing so would train
# every model twice and store a second fitted copy inside the saved ensemble.
voting_clf.fit(X_train_preprocessed, y_train_1d)

# Rebind the base-model names to the fitted clones so the individual models can
# still be used on their own, e.g. f1_score(Y_test, xgb_model.predict(X_test_preprocessed)).
xgb_model = voting_clf.named_estimators_['xgb']
lgb_model = voting_clf.named_estimators_['lgb']
cat_model = voting_clf.named_estimators_['cat']

# Score the test split: column 1 of predict_proba feeds the ROC AUC, the hard
# class predictions feed every other metric.
y_pred_proba = voting_clf.predict_proba(X_test_preprocessed)[:, 1]
y_pred = voting_clf.predict(X_test_preprocessed)

# Threshold-based metrics use the predicted classes.
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
conf_matrix = confusion_matrix(Y_test, y_pred)
# Ranking metric uses the predicted probabilities.
roc_auc = roc_auc_score(Y_test, y_pred_proba)

# Assemble the report line by line, then emit it in order.
report_lines = [
    "\nVoting Classifier Performance:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    f"ROC AUC Score: {roc_auc:.4f}",
    "Confusion Matrix:",
]
for report_line in report_lines:
    print(report_line)
print(conf_matrix)

# Persist the fitted ensemble together with the fitted preprocessing objects;
# the prediction cell loads these four files by name.
artifacts = {
    'voting_classifier_model.joblib': voting_clf,
    'cat_imputer.joblib': cat_imputer,
    'num_imputer.joblib': num_imputer,
    'scaler.joblib': scaler,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)
print("\nModel and preprocessing objects saved.")


  File "C:\Users\Abhay\anaconda3\envs\rag\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\Abhay\anaconda3\envs\rag\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Abhay\anaconda3\envs\rag\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Abhay\anaconda3\envs\rag\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Number of positive: 74033, number of negative: 711100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2525
[LightGBM] [Info] Number of data points in the train set: 785133, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.094294 -> initscore=-2.262302
[LightGBM] [Info] Start training from score -2.262302
[LightGBM] [Info] Number of positive: 74033, number of negative: 711100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2525
[LightGBM] [Info] Number of data points in the train set: 785133, number of used features: 22
[LightGBM] [In

# 🎯 Prediction Function for Validation Data

## Purpose:

This cell contains a function to load the trained model, make predictions on new data, and save the results.

## Key Components:

1. `predict_on_new_data(new_data)` function:

   - Loads the saved model and preprocessing objects
   - Preprocesses new data
   - Makes predictions and calculates probabilities
   - Returns the full results (the input data plus the prediction columns) and the predicted classes on their own

2. Output Generation:
   - Saves full predictions (including original data) to 'predictions_output.csv'
   - Saves only predicted classes to 'predicted_classes_only.csv'

## Usage:

1. Replace 'Test_20/X_Test_Data_Input.csv' with your validation data path
2. Run the cell to generate predictions
3. Check the output files in your working directory

Note: Ensure all required model files (joblib files) are in the same directory as this notebook.


In [8]:
# Function to load the model and make predictions on new data
import pandas as pd
import joblib

# Data to score. To run on validation data, change only this path.
validation_data_path = 'Test_20/X_Test_Data_Input.csv'
X_test = pd.read_csv(validation_data_path)

def predict_on_new_data(new_data):
    """Score new rows with the saved voting classifier.

    Loads the model and the fitted preprocessing objects from the four joblib
    files in the current working directory, applies the same preprocessing as
    at training time, and predicts a class and a probability for every row.

    Args:
        new_data: DataFrame containing at least the categorical and numerical
            feature columns listed below. Extra columns are ignored for
            prediction but kept in the returned frame. The caller's frame is
            not modified.

    Returns:
        A tuple ``(results, predictions)``: ``results`` is a copy of
        ``new_data`` with 'Predicted_Class' and 'Predicted_Probability' columns
        added; ``predictions`` is the output of ``model.predict``.

    Raises:
        KeyError: if ``new_data`` lacks any of the required feature columns
            (raised by the column selection).
        FileNotFoundError: if a saved joblib file is missing.
    """
    # NOTE: joblib.load unpickles arbitrary Python objects - only load artifact
    # files that were produced by the training cell or another trusted source.
    model = joblib.load('voting_classifier_model.joblib')
    cat_imputer = joblib.load('cat_imputer.joblib')
    num_imputer = joblib.load('num_imputer.joblib')
    scaler = joblib.load('scaler.joblib')

    # Must list the same columns, in the same order, as the training cell.
    categorical_columns = ['Column0', 'Column1', 'Column2', 'Column3', 'Column4', 'Column10',
                           'Column11', 'Column12', 'Column13', 'Column16', 'Column17', 'Column18',
                           'Column19', 'Column20', 'Column21']
    numerical_columns = ['Column5', 'Column6', 'Column7', 'Column8', 'Column9', 'Column14', 'Column15']

    # Apply the already-fitted transformers (transform only, never refit here).
    cat_values = cat_imputer.transform(new_data[categorical_columns])
    num_values = scaler.transform(num_imputer.transform(new_data[numerical_columns]))
    new_data_cat = pd.DataFrame(cat_values, columns=categorical_columns)
    new_data_num = pd.DataFrame(num_values, columns=numerical_columns)
    # Feature order is categorical columns first, then numerical columns.
    new_data_preprocessed = pd.concat([new_data_cat, new_data_num], axis=1)

    predictions = model.predict(new_data_preprocessed)
    # Column 1 of predict_proba: probability of the second class in model.classes_.
    probabilities = model.predict_proba(new_data_preprocessed)[:, 1]

    # Attach the outputs to a copy so the caller's DataFrame is left untouched.
    results = new_data.copy()
    results['Predicted_Class'] = predictions
    results['Predicted_Probability'] = probabilities

    return results, predictions

# Example usage: score the data loaded above.
new_data_with_predictions, predictions_only = predict_on_new_data(X_test)

# First output: the input rows together with the prediction columns.
new_data_with_predictions.to_csv('predictions_output.csv', index=False)

# Second output: a single-column file holding only the predicted classes.
predicted_classes_frame = pd.DataFrame({'Predicted_Class': predictions_only})
predicted_classes_frame.to_csv('predicted_classes_only.csv', index=False)

for saved_message in (
    "Full predictions saved to 'predictions_output.csv'",
    "Predicted classes only saved to 'predicted_classes_only.csv'",
):
    print(saved_message)

Full predictions saved to 'predictions_output.csv'
Predicted classes only saved to 'predicted_classes_only.csv'
