# Telco Customer Churn

## Data Collection & Exploration

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
churn = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
churn

In [None]:
churn.isna().sum()

In [None]:
churn.info()

In [None]:
churn.hist(bins=60, figsize=(20,15))

In [None]:
numerical_cols = churn.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_df = churn[numerical_cols]
numerical_df.head(5)

In [None]:
for col in numerical_cols:
    plt.figure(figsize=(8, 4))
    for label in churn['Churn'].unique():
        subset = churn[churn['Churn'] == label]
        plt.hist(subset[col], bins=30, alpha=0.5, label=label)
    plt.title(f'{col} distribution by Churn')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.legend()
    plt.show()

In [None]:
categorical_cols = churn.select_dtypes(include=['object']).columns.tolist()
categorical_df = churn[categorical_cols]
categorical_df.head()

In [None]:
unique_counts = categorical_df.nunique()
print(unique_counts)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Find categorical columns with exactly 2 unique values (excluding customerID)
binary_cols = [col for col in categorical_cols if unique_counts[col] == 2 and col != 'customerID']

le = LabelEncoder()
for col in binary_cols:
    churn[col] = le.fit_transform(churn[col])

churn[binary_cols].head()
churn.head()

In [None]:
churn.info()

In [None]:
cols_to_check = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod'
]

for col in cols_to_check:
    if col in churn.columns:
        print(f"{col}: {churn[col].unique()}")

In [None]:
# One-hot encode 'Contract' and 'PaymentMethod' with 1/0 values (not True/False)
contract_dummies = pd.get_dummies(churn['Contract'], prefix='Contract', dtype=int)
payment_dummies = pd.get_dummies(churn['PaymentMethod'], prefix='PaymentMethod', dtype=int)

# Concatenate the new columns to the original dataframe
churn = pd.concat([churn, contract_dummies, payment_dummies], axis=1)

# Show the new columns
churn.head()

In [None]:
# Select the columns of interest
cols_of_interest = [
    'PhoneService','MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# Create a copy to avoid modifying the original dataframe
corr_df = churn[cols_of_interest].replace({
    'No': 0, 'Yes': 1,
    'No phone service': -1, 'No internet service': -1,
    'DSL': 0, 'Fiber optic': 1, 'No': -1
})

# For InternetService, map 'DSL' to 0, 'Fiber optic' to 1, 'No' to -1
corr_df['InternetService'] = churn['InternetService'].map({'DSL': 0, 'Fiber optic': 1, 'No': -1})

# Compute correlation matrix
corr_matrix = corr_df.corr()

# Plot the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Service Features')
plt.show()

In [None]:
churn["StreamingUser"] = np.where(
    (churn["StreamingTV"] == "Yes") | (churn["StreamingMovies"] == "Yes"), 1, 0
)

In [None]:
churn.drop(["StreamingTV", "StreamingMovies"], axis=1, inplace=True)

In [None]:
churn

In [None]:
cols_to_check = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod'
]

for col in cols_to_check:
    if col in churn.columns:
        print(f"{col}: {churn[col].unique()}")

In [None]:
from scipy.stats import chi2_contingency

# Create contingency table
contingency_table = pd.crosstab(churn['MultipleLines'], churn['Churn'])

# Run Chi-Square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)
print(f"P-value: {p_value:.4f}")

In [None]:
pd.set_option('display.max_columns', None)
churn.head()

In [None]:
churn.drop(['Contract', 'PaymentMethod'], axis=1, inplace=True) 

In [None]:
churn

In [None]:
categorical_cols = churn.select_dtypes(include=['object']).columns.tolist()
categorical_df = churn[categorical_cols]
categorical_df.head()

In [None]:
# Convert 'TotalCharges' to numeric, coerce errors to NaN, then fill NaN with 0 and convert to int
churn['TotalCharges'] = pd.to_numeric(churn['TotalCharges'], errors='coerce').fillna(0).astype(int)

In [None]:
categorical_cols = churn.select_dtypes(include=['object']).columns.tolist()
categorical_df = churn[categorical_cols]
categorical_df.head()

In [None]:
# One-hot encode selected categorical columns with 1/0 values (not True/False)
cols_to_encode = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport'
]

encoded_dfs = [pd.get_dummies(churn[col], prefix=col, dtype=int) for col in cols_to_encode]
churn = pd.concat([churn] + encoded_dfs, axis=1)

# Show the new columns
churn.head()

In [None]:
churn.info()

In [None]:
churn.shape

In [None]:
churn.drop(['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport'], axis=1, inplace=True)

In [None]:
churn.shape

In [None]:
churn.drop(['customerID'],axis=1, inplace=True)

In [None]:
churn.info()

In [None]:
# Separate discrete (categorical/binary) and continuous (numerical) features
discrete_features = [col for col in churn.columns if churn[col].nunique() <= 10 and col != 'Churn']
continuous_features = [col for col in churn.columns if churn[col].nunique() > 10 and col != 'Churn']

# Visualize discrete features vs target
for col in discrete_features:
    plt.figure(figsize=(6, 3))
    sns.barplot(x=col, y='Churn', data=churn)
    plt.title(f'Churn Rate by {col}')
    plt.ylabel('Churn Rate')
    plt.xlabel(col)
    plt.show()

In [None]:
for col in continuous_features:
    plt.figure(figsize=(6, 4))
    sns.histplot(churn[col], bins=30, kde=True)
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Visualize continuous features vs target
for col in continuous_features:
    plt.figure(figsize=(6, 3))
    sns.boxplot(x='Churn', y=col, data=churn)
    plt.title(f'{col} Distribution by Churn')
    plt.xlabel('Churn')
    plt.ylabel(col)
    plt.show()

In [None]:
# Visualize before fixing outliers by churn
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
sns.boxplot(x='Churn', y='tenure', data=churn, ax=axes[0, 0])
axes[0, 0].set_title('Tenure by Churn Before Outlier Fix')
sns.boxplot(x='Churn', y='TotalCharges', data=churn, ax=axes[0, 1])
axes[0, 1].set_title('TotalCharges by Churn Before Outlier Fix')

# Fix outliers using IQR method within each churn group
def fix_outliers_iqr_by_group(df, col, group_col='Churn', k=1.5):
    df = df.copy()
    for group in df[group_col].unique():
        mask = df[group_col] == group
        Q1 = df.loc[mask, col].quantile(0.25)
        Q3 = df.loc[mask, col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - k * IQR
        upper = Q3 + k * IQR
        df.loc[mask, col] = np.clip(df.loc[mask, col], lower, upper)
    return df

churn = fix_outliers_iqr_by_group(churn, 'tenure')
churn = fix_outliers_iqr_by_group(churn, 'TotalCharges')

# Visualize after fixing outliers by churn
sns.boxplot(x='Churn', y='tenure', data=churn, ax=axes[1, 0])
axes[1, 0].set_title('Tenure by Churn After Outlier Fix')
sns.boxplot(x='Churn', y='TotalCharges', data=churn, ax=axes[1, 1])
axes[1, 1].set_title('TotalCharges by Churn After Outlier Fix')
plt.tight_layout()
plt.show()


In [None]:
x_churn = churn.drop('Churn', axis=1)
X_notstanderized_churn = churn.drop('Churn', axis=1)
y_churn = churn[['Churn']]

In [None]:
x_churn

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
cols_to_scale = ['TotalCharges', 'MonthlyCharges', 'tenure']
x_churn[cols_to_scale] = scaler.fit_transform(x_churn[cols_to_scale])
x_churn.head()

## Model Training

### 1)Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_churn, y_churn, test_size=0.2, random_state=100)

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train.values.ravel())

In [None]:
lr_y_train_pred = lr.predict(X_train)
lr_y_test_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on TRAIN data
train_acc_lr = accuracy_score(y_train, lr_y_train_pred)
# Accuracy on TEST data
test_acc_lr = accuracy_score(y_test, lr_y_test_pred)

print(f"Train Accuracy: {train_acc_lr:.3f}")
print(f"Test Accuracy: {test_acc_lr:.3f}")

In [None]:
lr_y_test_pred

#### Metrics

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, lr_y_test_pred)
print(cm)

In [None]:
from sklearn.metrics import classification_report
lr_report = classification_report(y_test, lr_y_test_pred, output_dict=True)
lr_report_df = pd.DataFrame(lr_report).transpose()
lr_report_df

In [None]:
from sklearn.metrics import roc_auc_score
y_proba = lr.predict_proba(X_test)[:, 1]  # Probabilities for class 1
auc = roc_auc_score(y_test, y_proba)
print(f"AUC-ROC: {auc:.3f}")

In [None]:
# Visualize predicted vs actual Churn for both train and test sets

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Train set
sns.countplot(x='Churn', hue=lr_y_train_pred, data=y_train.assign(Pred=lr_y_train_pred), ax=axes[0])
axes[0].set_title('Train Data: Actual vs Predicted Churn')
axes[0].set_xlabel('Actual Churn')
axes[0].set_ylabel('Count')
axes[0].legend(title='Predicted')

# Test set
sns.countplot(x='Churn', hue=lr_y_test_pred, data=y_test.assign(Pred=lr_y_test_pred), ax=axes[1])
axes[1].set_title('Test Data: Actual vs Predicted Churn')
axes[1].set_xlabel('Actual Churn')
axes[1].set_ylabel('Count')
axes[1].legend(title='Predicted')

plt.tight_layout()
plt.show()

#### using grid search cv

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

In [None]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization type
    'C': [0.001, 0.01, 0.1, 1, 10, 100],   # Inverse regularization strength
    'solver': ['liblinear', 'saga'],        # Solvers that support L1/L2
    'class_weight': [None, 'balanced']      # Handle class imbalance
}

In [None]:
lr_grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,                   # 5-fold cross-validation
    scoring='accuracy',      # Can also use 'f1', 'recall', 'precision'
    n_jobs=-1,              # Use all CPU cores
    verbose=1               # Shows progress
)

In [None]:
lr_grid_search.fit(X_train, y_train)

In [None]:
best_logreg = lr_grid_search.best_estimator_
best_params = lr_grid_search.best_params_

print("Best Parameters:", best_params)

In [118]:
y_pred = best_logreg.predict(X_test)

# Classification Report
print(classification_report(y_test, y_pred))

# Accuracy
print("Test Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1025
           1       0.62      0.51      0.56       384

    accuracy                           0.78      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.77      0.78      0.77      1409

Test Accuracy: 0.7821149751596878


In [121]:
lr_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.833945,0.886829,0.859574,1025.0
1,0.636364,0.528646,0.577525,384.0
accuracy,0.789212,0.789212,0.789212,0.789212
macro avg,0.735154,0.707738,0.71855,1409.0
weighted avg,0.780097,0.789212,0.782706,1409.0


### 2) Random Forest Classifier

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_notstanderized_churn, y_churn, test_size=0.2, random_state=100)

In [102]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=2, random_state=100, n_jobs=-1)
rf.fit(X_train, y_train.values.ravel())

In [103]:
y_rf_train_pred = rf.predict(X_train)  # Class labels (0 or 1)
y_rf_test_pred = rf.predict(X_test)

In [104]:
from sklearn.metrics import accuracy_score
# Accuracy on TRAIN data
train_acc_rf = accuracy_score(y_train, lr_y_train_pred)
# Accuracy on TEST data
test_acc_rf = accuracy_score(y_test, lr_y_test_pred)

print(f"Train Accuracy: {train_acc_rf:.3f}")
print(f"Test Accuracy: {test_acc_rf:.3f}")

Train Accuracy: 0.805
Test Accuracy: 0.789


In [105]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_rf_test_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[1016    9]
 [ 346   38]]


In [106]:
rf_report = classification_report(y_test, y_rf_test_pred, output_dict=True)
rf_report_df = pd.DataFrame(rf_report).transpose()
rf_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.745962,0.99122,0.851278,1025.0
1,0.808511,0.098958,0.176334,384.0
accuracy,0.748048,0.748048,0.748048,0.748048
macro avg,0.777236,0.545089,0.513806,1409.0
weighted avg,0.763008,0.748048,0.667333,1409.0


In [99]:
from sklearn.metrics import roc_auc_score
y_rf_proba = rf.predict_proba(X_test)[:, 1]  # Probability of class 1
auc_rf = roc_auc_score(y_test, y_rf_proba)
print(f"AUC-ROC: {auc_rf:.3f}")

AUC-ROC: 0.819


In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Train set
sns.countplot(x='Churn', hue=y_rf_train_pred, data=y_train.assign(Pred=y_rf_train_pred), ax=axes[0])
axes[0].set_title('Train Data: Actual vs Predicted Churn')
axes[0].set_xlabel('Actual Churn')
axes[0].set_ylabel('Count')
axes[0].legend(title='Predicted')

# Test set
sns.countplot(x='Churn', hue=y_rf_test_pred, data=y_test.assign(Pred=y_rf_test_pred), ax=axes[1])
axes[1].set_title('Test Data: Actual vs Predicted Churn')
axes[1].set_xlabel('Actual Churn')
axes[1].set_ylabel('Count')
axes[1].legend(title='Predicted')

plt.tight_layout()
plt.show()

#### using grid search cv on randomforest

In [107]:
param_grid = {
    'n_estimators': [100],          # Reduced from [50, 100, 200]
    'max_depth': [10, 20],          # Reduced from [None, 10, 20, 30]
    'min_samples_split': [5, 10],   # Focus on higher values to prevent overfitting
    'max_features': ['sqrt']        # Removed 'log2' option
}

In [108]:
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                           # Reduced from 5 folds
    scoring='accuracy',
    n_jobs=1,                       # Use 1 core to avoid memory issues
    verbose=2
)

In [109]:
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   0.5s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   1.9s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=10, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   1.9s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   0.5s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=5, n_estimators=100; total time=   2.0s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.4s


  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   1.9s


  return fit_method(estimator, *args, **kwargs)


In [110]:
best_rf = rf_grid_search.best_estimator_
best_params = rf_grid_search.best_params_

print("Best Parameters:", best_params)

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}


In [111]:
y_pred = best_rf.predict(X_test)

# Classification Report
print(classification_report(y_test, y_pred))

# Accuracy
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1025
           1       0.63      0.57      0.60       384

    accuracy                           0.79      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.79      0.79      1409

Test Accuracy: 0.7920511000709723
Confusion Matrix:
 [[897 128]
 [165 219]]


In [117]:
rf_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.745962,0.99122,0.851278,1025.0
1,0.808511,0.098958,0.176334,384.0
accuracy,0.748048,0.748048,0.748048,0.748048
macro avg,0.777236,0.545089,0.513806,1409.0
weighted avg,0.763008,0.748048,0.667333,1409.0


In [112]:
importances = best_rf.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print(feature_importance.head(10))

                           Feature  Importance
8                     TotalCharges    0.148271
4                           tenure    0.147590
9          Contract_Month-to-month    0.112991
7                   MonthlyCharges    0.095394
32                  TechSupport_No    0.055249
21     InternetService_Fiber optic    0.047453
23               OnlineSecurity_No    0.043086
14  PaymentMethod_Electronic check    0.035480
11               Contract_Two year    0.026541
26                 OnlineBackup_No    0.025883


### 3) XGBoost

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
# Train XGBoost model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=100)
xgb.fit(X_train, y_train.values.ravel())

In [None]:
# Predictions
y_xgb_train_pred = xgb.predict(X_train)
y_xgb_test_pred = xgb.predict(X_test)

In [None]:
# Predictions
y_xgb_train_pred = xgb.predict(X_train)
y_xgb_test_pred = xgb.predict(X_test)

In [None]:
# Accuracy

train_acc_xgb = accuracy_score(y_train, y_xgb_train_pred)
test_acc_xgb = accuracy_score(y_test, y_xgb_test_pred)
print(f"Train Accuracy: {train_acc_xgb:.3f}")
print(f"Test Accuracy: {test_acc_xgb:.3f}")

# Confusion Matrix
cm_xgb = confusion_matrix(y_test, y_xgb_test_pred)
print("Confusion Matrix:\n", cm_xgb)

# Classification Report
xgb_report = classification_report(y_test, y_xgb_test_pred, output_dict=True)
xgb_report_df = pd.DataFrame(xgb_report).transpose()
print(xgb_report_df)

# AUC-ROC
y_xgb_proba = xgb.predict_proba(X_test)[:, 1]
auc_xgb = roc_auc_score(y_test, y_xgb_proba)
print(f"AUC-ROC: {auc_xgb:.3f}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Train set: Actual vs Predicted
sns.countplot(x='Churn', hue=y_xgb_train_pred, data=y_train.assign(Pred=y_xgb_train_pred), ax=axes[0])
axes[0].set_title('Train Data: Actual vs Predicted Churn (LightGBM)')
axes[0].set_xlabel('Actual Churn')
axes[0].set_ylabel('Count')
axes[0].legend(title='Predicted')

# Test set: Actual vs Predicted
sns.countplot(x='Churn', hue=y_xgb_test_pred, data=y_test.assign(Pred=y_xgb_test_pred), ax=axes[1])
axes[1].set_title('Test Data: Actual vs Predicted Churn (LightGBM)')
axes[1].set_xlabel('Actual Churn')
axes[1].set_ylabel('Count')
axes[1].legend(title='Predicted')

plt.tight_layout()
plt.show()

### 4) LightGBM

In [None]:
import lightgbm as lgb
# Train LightGBM model
lgbm = lgb.LGBMClassifier(random_state=100)
lgbm.fit(X_train, y_train.values.ravel())

In [None]:
# Predictions
y_lgb_train_pred = lgbm.predict(X_train)
y_lgb_test_pred = lgbm.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
#Accuracy
train_acc_lgb = accuracy_score(y_train, y_lgb_train_pred)
test_acc_lgb = accuracy_score(y_test, y_lgb_test_pred)
print(f"Train Accuracy: {train_acc_lgb:.3f}")
print(f"Test Accuracy: {test_acc_lgb:.3f}")
# Confusion Matrix
cm_lgb = confusion_matrix(y_test, y_lgb_test_pred)
print("Confusion Matrix:\n", cm_lgb)

# Classification Report
lgb_report = classification_report(y_test, y_lgb_test_pred, output_dict=True)
lgb_report_df = pd.DataFrame(lgb_report).transpose()
print(lgb_report_df)

# AUC-ROC
y_lgb_proba = lgbm.predict_proba(X_test)[:, 1]
auc_lgb = roc_auc_score(y_test, y_lgb_proba)
print(f"AUC-ROC: {auc_lgb:.3f}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Train set: Actual vs Predicted
sns.countplot(x='Churn', hue=lr_y_train_pred, data=y_train.assign(Pred=lr_y_train_pred), ax=axes[0])
axes[0].set_title('Train Data: Actual vs Predicted Churn')
axes[0].set_xlabel('Actual Churn')
axes[0].set_ylabel('Count')
axes[0].legend(title='Predicted')

# Test set: Actual vs Predicted
sns.countplot(x='Churn', hue=lr_y_test_pred, data=y_test.assign(Pred=lr_y_test_pred), ax=axes[1])
axes[1].set_title('Test Data: Actual vs Predicted Churn')
axes[1].set_xlabel('Actual Churn')
axes[1].set_ylabel('Count')
axes[1].legend(title='Predicted')

plt.tight_layout()
plt.show()

### Comparision of all models 

In [None]:
metrics_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM'],
    'Test Accuracy': [test_acc_lr,test_acc_rf,test_acc_xgb,test_acc_lgb],
    'Train Accuracy': [train_acc_lr, train_acc_rf, train_acc_xgb, train_acc_lgb],
    'AUC-ROC': [auc, auc_rf, auc_xgb, auc_lgb],
    
})
metrics_df

In [None]:
# Prepare a DataFrame with model names and their classification report main metrics (accuracy, precision, recall, f1-score)
model_names = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']
precision = [
    lr_report['weighted avg']['precision'],
    rf_report['weighted avg']['precision'],
    xgb_report['weighted avg']['precision'],
    lgb_report['weighted avg']['precision']
]
recall = [
    lr_report['weighted avg']['recall'],
    rf_report['weighted avg']['recall'],
    xgb_report['weighted avg']['recall'],
    lgb_report['weighted avg']['recall']
]
f1_score = [
    lr_report['weighted avg']['f1-score'],
    rf_report['weighted avg']['f1-score'],
    xgb_report['weighted avg']['f1-score'],
    lgb_report['weighted avg']['f1-score']
]

classification_summary = pd.DataFrame({
    'Model': model_names,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1_score
})

classification_summary

In [None]:
import numpy as np

# Prepare input sample as a DataFrame with correct columns
input_sample = np.array([[1, 0, 0, 0, 1.369912, 1, 1, 1.358961, 2.047611, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1]]) 
# Predict #0(No)
lr_prediction = lr.predict(input_sample)  
print(f"Predicted Churn by LR:")
if lr_prediction==1:
    print("Yes")
else:
    print("No")

# Prepare input sample2 as a DataFrame with correct columns
input_sample2 = np.array([[1, 0, 0, 0, 66.0, 1, 1, 105.65, 6844.0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1]])

# Predict using Random Forest
rf_prediction = rf.predict(input_sample2)  
print(f"Predicted Churn by RF:")
if rf_prediction==1:
    print("Yes")
else:
    print("No")

# Predict using XGBoost
xgb_prediction = xgb.predict(input_sample2)  
print(f"Predicted Churn by XGB:")
if xgb_prediction==1:
    print("Yes")
else:
    print("No")

# Predict using LGB
lgb_prediction = lgbm.predict(input_sample2)  
print(f"Predicted Churn by LGB:")
if lgb_prediction==1:
    print("Yes")
else:
    print("No")    

In [None]:
x_churn

In [None]:
y_churn

In [None]:
X_notstanderized_churn