In [4]:
import numpy as np 
import pylab as plt
import pandas as pd
#from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score


In [2]:
file_path = r"C:\Users\marcu\OneDrive\Desktop\NUS\NUS Y3S2\DSA3101\Project\Churn_Modelling.csv"
df = pd.read_csv(file_path)

#print(df.head())

df = df.drop(columns=['RowNumber','CustomerId','Surname'])
#print(df.head())

df= pd.get_dummies(df,columns=["Geography","Gender"],drop_first=True)
#print(df.head())

label_counts = df['Exited'].value_counts()
total_samples = len(df)
class_percentages = (label_counts / total_samples) * 100
print("\nClass Percentages:\n", class_percentages)


x = df.drop(columns='Exited')
y = df['Exited']


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=3101)


Class Percentages:
 Exited
0    79.63
1    20.37
Name: count, dtype: float64


# Logistic regression

In [6]:
log_reg = LogisticRegression(class_weight='balanced', solver='liblinear')

log_reg.fit(x_train, y_train)

cv_scores = cross_val_score(log_reg, x_train, y_train, cv=10, scoring='accuracy')
#print(cv_scores.mean())

y_pred = log_reg.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)

0.697
0.6842105263157895


In [8]:
log_reg = LogisticRegression(
    penalty='l1',  #lasso
    class_weight='balanced',  
    solver='liblinear',  
    C=1.0)

log_reg.fit(x_train, y_train)

cv_scores = cross_val_score(log_reg, x_train, y_train, cv=10, scoring='accuracy')
#print(cv_scores.mean())

y_pred = log_reg.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)

0.712
0.6842105263157895


In [9]:
log_reg = LogisticRegression(
    penalty='l2',  #ridge
    class_weight='balanced',  
    solver='liblinear',  
    C=1.0)

log_reg.fit(x_train, y_train)

cv_scores = cross_val_score(log_reg, x_train, y_train, cv=10, scoring='accuracy')
#print(cv_scores.mean())

y_pred = log_reg.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)

0.697
0.6842105263157895


# Random forest

In [14]:
rf = RandomForestClassifier(class_weight='balanced_subsample', random_state=3101)
model = rf.fit(x_train,y_train)

cv_scores = cross_val_score(model, x_train, y_train, cv=10, scoring='recall')
#print(cv_scores.mean())

accuracy = model.score(x_test, y_test)
print(accuracy)
recall = recall_score(y_test, model.predict(x_test))
print(recall)

0.8665
0.48621553884711777


In [12]:
rf = RandomForestClassifier(class_weight='balanced', random_state=3101)

param_grid = {
    'n_estimators': [50, 100, 200],  #number of trees
    'max_depth': [None, 10, 20],  #tree depth
    'min_samples_split': [2, 5, 10],  #min samples to split a node
    'min_samples_leaf': [1, 2, 5]  #min samples per leaf
}

grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='recall', n_jobs=-1)
grid_search.fit(x_train, y_train)

best_rf = grid_search.best_estimator_

cv_scores = cross_val_score(best_rf, x_train, y_train, cv=10, scoring='recall')
#print(cv_scores.mean())

y_pred = best_rf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)


KeyboardInterrupt: 

In [23]:
smote = SMOTE(sampling_strategy=0.5, random_state=3101)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)


rf = RandomForestClassifier(class_weight='balanced', random_state=3101)

param_grid = {
    'n_estimators': [50, 100, 200],  #number of trees
    'max_depth': [None, 10, 20],  #tree depth
    'min_samples_split': [2, 5, 10],  #min samples to split a node
    'min_samples_leaf': [1, 2, 5]  #min samples per leaf
}

grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_resampled, y_resampled)

best_rf = grid_search.best_estimator_

cv_scores = cross_val_score(best_rf, x_resampled, y_resampled, cv=10, scoring='accuracy')
print(cv_scores.mean())

y_pred = best_rf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8539377874367503
0.8525


In [None]:

import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, recall_score
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=3101)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

# Compute class imbalance ratio for scale_pos_weight
neg, pos = y_resampled.value_counts()
scale_pos_weight = neg / pos  # Helps XGBoost & LightGBM handle imbalance

# Define XGBoost Model
xgb_model = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=10,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,  # Handles imbalance
    random_state=3101,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Define LightGBM Model
lgb_model = lgb.LGBMClassifier(
    n_estimators=400,
    max_depth=10,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,  # Handles imbalance
    random_state=3101
)

# Train models
xgb_model.fit(x_resampled, y_resampled)
lgb_model.fit(x_resampled, y_resampled)

# Predictions
y_pred_xgb = xgb_model.predict(x_test)
y_pred_lgb = lgb_model.predict(x_test)

# Evaluate models
print("XGBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))

print("\nLightGBM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Recall:", recall_score(y_test, y_pred_lgb))



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\marcu\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 3181, number of negative: 6362
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 9543, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
XGBoost Results:
Accuracy: 0.8405
Recall: 0.6140350877192983

LightGBM Results:
Accuracy: 0.8235
Recall: 0.6641604010025063
