In [None]:
#Importing the neccessary libraries and reading the data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df['is_train'] = 1
test_df['is_train'] = 0

combined_df = pd.concat([train_df, test_df], axis=0)
#Mapping the categorical variables to numerical variables
combined_df['Gender'] = combined_df['Gender'].map({'Male': 0, 'Female': 1})
combined_df['Job Role'] = combined_df['Job Role'].map({'Education': 0, 'Media': 1, 'Healthcare': 2, 'Technology': 3, 'Finance': 4})
combined_df['Work-Life Balance'] = combined_df['Work-Life Balance'].map({'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3})
combined_df['Job Satisfaction'] = combined_df['Job Satisfaction'].map({'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3})
combined_df['Performance Rating'] = combined_df['Performance Rating'].map({'Low': 0, 'Below Average': 1, 'Average': 2, 'High': 3})
combined_df['Overtime'] = combined_df['Overtime'].map({'No': 0, 'Yes': 1})
combined_df['Education Level'] = combined_df['Education Level'].map({
    "High School": 0,
    "Associate Degree": 1,
    "Bachelor’s Degree": 2,
    "Master’s Degree": 3,
    "PhD": 4
})
combined_df['Marital Status'] = combined_df['Marital Status'].map({'Single': 0, 'Married': 1, 'Divorced': 2})
combined_df['Job Level'] = combined_df['Job Level'].map({'Entry': 0, 'Mid': 1, 'Senior': 2})
combined_df['Company Size'] = combined_df['Company Size'].map({'Small': 0, 'Medium': 1, 'Large': 2})
combined_df['Remote Work'] = combined_df['Remote Work'].map({'No': 0, 'Yes': 1})
combined_df['Leadership Opportunities'] = combined_df['Leadership Opportunities'].map({'No': 0, 'Yes': 1})
combined_df['Innovation Opportunities'] = combined_df['Innovation Opportunities'].map({'No': 0, 'Yes': 1})
combined_df['Company Reputation'] = combined_df['Company Reputation'].map({'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3})
combined_df['Employee Recognition'] = combined_df['Employee Recognition'].map({'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3})
combined_df['Attrition'] = combined_df['Attrition'].map({'Left': 0, 'Stayed': 1})


#Adding Derived Variables
combined_df['Tenure_Ratio'] = combined_df['Years at Company'] / (combined_df['Company Tenure'] + 1)
combined_df['Promotion_Rate'] = combined_df['Number of Promotions'] / (combined_df['Years at Company'] + 1)
combined_df['Distance_Category'] = pd.cut(combined_df['Distance from Home'], bins=[-1, 5, 15, 30, 1000],
                                          labels=[0, 1, 2, 3])
combined_df['Age_Group'] = pd.cut(combined_df['Age'], bins=[17, 30, 40, 50, 60],
                                  labels=[0, 1, 2, 3])
combined_df['Income_per_Level'] = combined_df['Monthly Income'] / (combined_df['Job Level'] + 1)
combined_df['Work_Life_Satisfaction'] = (combined_df['Work-Life Balance'] + combined_df['Job Satisfaction']) / 2
combined_df['Performance_Recognition_Score'] = (combined_df['Performance Rating'] + combined_df['Employee Recognition']) / 2
combined_df['Education_Income_ratio'] = combined_df['Monthly Income'] / (combined_df['Education Level'] + 1)
combined_df['Age_Joblevel']= combined_df['Age']/(combined_df['Job Level']+1)




# Drop Unnecessary Columns (optional)
if 'EmployeeId' in combined_df.columns:
    combined_df.drop(columns=['EmployeeId'], inplace=True)

# Split back
train_df = combined_df[combined_df['is_train'] == 1].drop(columns=['is_train'])
test_df = combined_df[combined_df['is_train'] == 0].drop(columns=['is_train','Attrition'])

X = train_df.drop(columns=['Attrition'])
y = train_df['Attrition'].astype(int)

#Scale and split the data to train and test splits(80:20)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)


X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


#Importing models and metrics

In [None]:
# Define all models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Function to evaluate and print metrics
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    print(f"\n📊 {name} Results")
    print("-" * 40)
    print("Accuracy:", accuracy_score(y_val, preds))
    print("Precision:", precision_score(y_val, preds))
    print("Recall:", recall_score(y_val, preds))
    print("F1 Score:", f1_score(y_val, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_val, preds))
    print("Classification Report:\n", classification_report(y_val, preds))

# Loop through models and evaluate
for name, model in models.items():
    evaluate_model(name, model, X_train, y_train, X_val, y_val)



📊 Logistic Regression Results
----------------------------------------
Accuracy: 0.7229865771812081
Precision: 0.7349148224804968
Recall: 0.7382056612825844
F1 Score: 0.7365565661400989
Confusion Matrix:
 [[4002 1665]
 [1637 4616]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.71      0.71      5667
           1       0.73      0.74      0.74      6253

    accuracy                           0.72     11920
   macro avg       0.72      0.72      0.72     11920
weighted avg       0.72      0.72      0.72     11920


📊 K-Nearest Neighbors Results
----------------------------------------
Accuracy: 0.6711409395973155
Precision: 0.6830378157853444
Recall: 0.6961458499920038
F1 Score: 0.689529542214478
Confusion Matrix:
 [[3647 2020]
 [1900 4353]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.64      0.65      5667
           1       0.68      0.70      0.69      62

In [None]:
combined_df

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Employee Recognition,Attrition,is_train,Tenure_Ratio,Promotion_Rate,Distance_Category,Age_Group,Income_per_Level,Work_Life_Satisfaction,Performance_Recognition_Score
0,8410,31,0,19,0,5390,3,1,2,2,...,1,1,1,0.211111,0.100000,2,1,2695.0,2.0,1.5
1,64756,59,1,4,1,5534,0,2,0,3,...,0,1,1,0.181818,0.600000,2,3,2767.0,1.0,0.0
2,30257,24,1,10,2,8159,2,2,0,0,...,0,1,1,0.133333,0.000000,1,0,4079.5,2.0,0.0
3,65791,36,1,7,0,3989,2,2,3,1,...,1,1,1,0.137255,0.125000,2,1,1994.5,2.0,2.0
4,65026,56,0,41,0,4821,1,3,2,0,...,1,1,1,0.594203,0.000000,3,3,1607.0,2.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,16243,56,1,42,2,7830,0,1,2,0,...,1,1,0,0.688525,0.000000,3,3,2610.0,0.5,1.5
14896,47175,30,1,15,0,3856,2,1,2,2,...,1,0,0,0.714286,0.125000,3,0,3856.0,1.5,1.5
14897,12409,52,0,5,0,5654,2,3,1,0,...,2,0,0,0.625000,0.000000,0,3,2827.0,2.5,1.5
14898,9554,18,0,4,0,5276,1,2,2,0,...,2,1,0,0.666667,0.000000,1,0,2638.0,1.5,2.0


In [None]:
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier,BaggingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
#Importing ensemble models

In [None]:
ensemble_models = {
    "AdaBoost": AdaBoostClassifier(n_estimators=150, random_state=42,learning_rate=1),
    "XGBoost": xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss',max_depth=3),
    "LightGBM": lgb.LGBMClassifier(n_estimators=50, learning_rate=0.1, random_state=42,num_leaves=31),
    "Bagging": BaggingClassifier(n_estimators=100, random_state=42),

    "Voting Ensemble (Soft)": VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(max_iter=1000)),
            ('rf', RandomForestClassifier(n_estimators=100)),
            ('gb', GradientBoostingClassifier(n_estimators=100))
        ],
        voting='soft',weights=[1,2,1]
    )
}
#Defining models

In [None]:
for name, model in ensemble_models.items():
    evaluate_model(name, model, X_train, y_train, X_val, y_val)
#Function to loop and evaluate the models


📊 AdaBoost Results
----------------------------------------
Accuracy: 0.7572147651006711
Precision: 0.7673937271135169
Recall: 0.7708300015992323
F1 Score: 0.7691080261688208
Confusion Matrix:
 [[4206 1461]
 [1433 4820]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.74      0.74      5667
           1       0.77      0.77      0.77      6253

    accuracy                           0.76     11920
   macro avg       0.76      0.76      0.76     11920
weighted avg       0.76      0.76      0.76     11920



Parameters: { "use_label_encoder" } are not used.




📊 XGBoost Results
----------------------------------------
Accuracy: 0.7561241610738255
Precision: 0.7690575747828884
Recall: 0.7647529185990725
F1 Score: 0.7668992061582872
Confusion Matrix:
 [[4231 1436]
 [1471 4782]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.75      0.74      5667
           1       0.77      0.76      0.77      6253

    accuracy                           0.76     11920
   macro avg       0.76      0.76      0.76     11920
weighted avg       0.76      0.76      0.76     11920





[LightGBM] [Info] Number of positive: 25007, number of negative: 22671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1938
[LightGBM] [Info] Number of data points in the train set: 47678, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524498 -> initscore=0.098069
[LightGBM] [Info] Start training from score 0.098069





📊 LightGBM Results
----------------------------------------
Accuracy: 0.7536073825503355
Precision: 0.7678513731825525
Recall: 0.7601151447305293
F1 Score: 0.7639636743550591
Confusion Matrix:
 [[4230 1437]
 [1500 4753]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.75      0.74      5667
           1       0.77      0.76      0.76      6253

    accuracy                           0.75     11920
   macro avg       0.75      0.75      0.75     11920
weighted avg       0.75      0.75      0.75     11920


📊 Bagging Results
----------------------------------------
Accuracy: 0.7327181208053691
Precision: 0.7511875511875512
Recall: 0.7334079641771949
F1 Score: 0.7421912930894967
Confusion Matrix:
 [[4148 1519]
 [1667 4586]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.73      0.72      5667
           1       0.75      0.73      0.74      6253

    accuracy      

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for each ensemble model
param_grids = {
    "AdaBoost": {
        'n_estimators': [50, 150, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    "XGBoost": {
        'n_estimators': [50, 120, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    "LightGBM": {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100]
    },

    "Voting Ensemble (Soft)": {

        'weights': [[1, 1, 1], [1, 2, 1], [2, 1, 1]]
    }
}

In [None]:
tuned_models = {}

for name, model in ensemble_models.items():
    if name in param_grids:
        print(f"\n Tuning {name}...")
        grid_search = GridSearchCV(model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        print(f"Best parameters for {name}: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy for {name}: {grid_search.best_score_}")

        tuned_models[name] = grid_search.best_estimator_
    else:
        print(f"\n skipping tuning for {name} as no parameter grid is defined.")
        tuned_models[name] = model # Use the default model if no grid is defined

#Using GridSearchCV to find the optimal hyperparameters for the ensemble models


🔬 Tuning AdaBoost...
Best parameters for AdaBoost: {'learning_rate': 1, 'n_estimators': 150}
Best cross-validation accuracy for AdaBoost: 0.7592180873552787

🔬 Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best cross-validation accuracy for XGBoost: 0.7577709146314163

🔬 Tuning LightGBM...




[LightGBM] [Info] Number of positive: 25007, number of negative: 22671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1579
[LightGBM] [Info] Number of data points in the train set: 47678, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524498 -> initscore=0.098069
[LightGBM] [Info] Start training from score 0.098069
Best parameters for LightGBM: {'learning_rate': 0.1, 'n_estimators': 50, 'num_leaves': 31}
Best cross-validation accuracy for LightGBM: 0.755316899843887

 skipping tuning for Bagging as no parameter grid is defined.

🔬 Tuning Voting Ensemble (Soft)...
Best parameters for Voting Ensemble (Soft): {'weights': [1, 2, 1]}
Best cross-validation accuracy for Voting Ensemble (Soft): 0.7522756824868541


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import VotingClassifier


# Define parameter grids for all models
param_grids_ml = {
    "Logistic Regression": {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'max_iter': [100, 150, 200]
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan','minkowski']
    },
    "Naive Bayes": {}, # GaussianNB generally doesn't have hyperparameters to tune
    "Decision Tree": {
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy','log_loss']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10, 15]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2]
    }

}

#Defining parameter grid for the ML models

In [None]:
tuned_models_ml = {}

for name, model in models.items():
    if name in param_grids_ml:
        print(f"\n🔬 Tuning {name}...")
        grid_search = GridSearchCV(model, param_grids_ml[name], cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        print(f"Best parameters for {name}: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy for {name}: {grid_search.best_score_}")

        tuned_models_ml[name] = grid_search.best_estimator_
    else:
        print(f"\n skipping tuning for {name} as no parameter grid is defined.")
        tuned_models_ml[name] = model # Use the default model if no grid is defined

#Finding optimal hyperparameter for ML models


🔬 Tuning Logistic Regression...


54 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Best parameters for Logistic Regression: {'C': 0.001, 'max_iter': 100, 'penalty': 'l2'}
Best cross-validation accuracy for Logistic Regression: 0.7324551885275105

🔬 Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best cross-validation accuracy for K-Nearest Neighbors: 0.6951424556558802

🔬 Tuning Naive Bayes...
Best parameters for Naive Bayes: {}
Best cross-validation accuracy for Naive Bayes: 0.7191996354490527

🔬 Tuning Decision Tree...
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}
Best cross-validation accuracy for Decision Tree: 0.7284910960704801

🔬 Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': 15, 'n_estimators': 200}
Best cross-validation accuracy for Random Forest: 0.746864348678545

🔬 Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.2, 'n_estimators': 100}
Best cross-valid