In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load the dataset
dataset = pd.read_csv('/kaggle/input/churnppp/BankChurners.csv')

# Step 2: Drop unnecessary columns (Naive Bayes, CLIENTNUM, and Attrition_Flag)
dataset = dataset.iloc[:, :-2]  # Drop the last two columns (Naive Bayes)
attrition_flag = dataset['Attrition_Flag'].apply(lambda x: 1 if x == 'Attrited Customer' else 0)  # Target column
dataset = dataset.drop(columns=['Attrition_Flag', 'CLIENTNUM'])

# Step 3: Convert categorical features to numeric using One-Hot Encoding
dataset = pd.get_dummies(dataset, drop_first=True)

# Step 4: Select features
selected_features = [
     'Total_Trans_Amt','Total_Amt_Chng_Q4_Q1','Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
     'Customer_Age','Total_Revolving_Bal','Total_Relationship_Count',
     'Avg_Open_To_Buy', 'Avg_Utilization_Ratio','Contacts_Count_12_mon',
     'Months_Inactive_12_mon','Marital_Status_Single'
]

X = dataset[selected_features]
y = attrition_flag

# Step 5: Split the data into training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Initialize individual models
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
lgbm_model = LGBMClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42, max_iter=500)

# Step 7: Create an ensemble model with VotingClassifier
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
        ('lr', lr_model)
    ],
    voting='soft'  # 'soft' uses probabilities for voting; 'hard' uses majority vote
)

# Step 8: Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Step 9: Make predictions on the test set
y_pred_ensemble = ensemble_model.predict(X_test)

# Step 10: Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred_ensemble)
precision = precision_score(y_test, y_pred_ensemble)
recall = recall_score(y_test, y_pred_ensemble)
f1 = f1_score(y_test, y_pred_ensemble)

# Print metrics
print("Ensemble Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


[LightGBM] [Info] Number of positive: 1131, number of negative: 5957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1719
[LightGBM] [Info] Number of data points in the train set: 7088, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159565 -> initscore=-1.661465
[LightGBM] [Info] Start training from score -1.661465
Ensemble Model Performance:
Accuracy: 0.9684
Precision: 0.9149
Recall: 0.8891
F1-Score: 0.9018
