In [1]:
import pandas as pd

df = pd.read_csv("Churn_Modelling.csv")

df.shape
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
df["Exited"].value_counts(normalize=True)


Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64

In [4]:
df.isnull().sum()


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
df.dropna(inplace=True)


In [6]:
df.drop(columns=["CustomerId", "Surname"], inplace=True)


In [7]:
df[["Geography", "Gender"]].value_counts()


Geography  Gender
France     Male      2753
           Female    2261
Spain      Male      1388
Germany    Male      1316
           Female    1193
Spain      Female    1089
Name: count, dtype: int64

In [8]:
df = pd.get_dummies(
    df,
    columns=["Geography", "Gender"],
    drop_first=True
)


In [9]:
df["Tenure_Age_Ratio"] = (
    df["Tenure"] / (df["Age"] + 1)
)


In [10]:
from sklearn.preprocessing import StandardScaler

X = df.drop("Exited", axis=1)
y = df["Exited"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [13]:
pip install xgboost


Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/3a/d8/4d4ae25452577f2dfabc66b60e712e7c01f9fe6c389fa88c546c2f427c4d/xgboost-3.1.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB 108.9 kB/s eta 0:11:01
   ---------------------------------------- 0.1/72.0 MB 363.1 kB/s eta 0:03:19
   ---------------------------------------- 0.3/72.0 MB 1.8 MB/s eta 0:00:40
   ---------------------------------------- 0.9/72.0 MB 3.7 MB/s eta 0:00:19
    --------------------------------------- 1.8/72.0 MB 6.0 MB/s eta 0:00:12
   - -------------------------------------- 2.9/7

In [16]:
# ---------------------------------------
# STEP 4: MODEL BUILDING, TUNING & EVALUATION
# ---------------------------------------

import os
import pandas as pd
import joblib

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ---------------------------------------
# 4.1 Define Models (Recall-Optimized)
# ---------------------------------------

models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=42
    ),

    "Decision Tree": DecisionTreeClassifier(
        max_depth=6,
        class_weight="balanced",
        random_state=42
    ),

    "KNN": KNeighborsClassifier(
        n_neighbors=9,
        weights="distance"
    ),

    "Naive Bayes": GaussianNB(),

    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        random_state=42
    )
}

# ---------------------------------------
# 4.2 Train Models & Evaluate (Threshold Tuned)
# ---------------------------------------

results = []
trained_models = {}

THRESHOLD = 0.4  # tuned threshold to improve recall

for model_name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    trained_models[model_name] = model

    # Probabilities
    y_prob = model.predict_proba(X_test)[:, 1]

    # Threshold tuning
    y_pred = (y_prob >= THRESHOLD).astype(int)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([
        model_name,
        acc,
        auc,
        precision,
        recall,
        f1,
        mcc
    ])

# ---------------------------------------
# 4.3 Create Comparison Table
# ---------------------------------------

results_df = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy",
        "AUC",
        "Precision",
        "Recall",
        "F1 Score",
        "MCC"
    ]
)

print("\nModel Performance Comparison (Tuned for Recall):\n")
print(results_df)





Model Performance Comparison (Tuned for Recall):

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression    0.6125  0.775583   0.324427  0.835381  0.467354   
1        Decision Tree    0.7575  0.831398   0.444602  0.769042  0.563456   
2                  KNN    0.8255  0.773991   0.596667  0.439803  0.506365   
3          Naive Bayes    0.8035  0.787455   0.519126  0.466830  0.491591   
4        Random Forest    0.8570  0.847899   0.694534  0.530713  0.601671   
5              XGBoost    0.8050  0.834690   0.516505  0.653563  0.577007   

        MCC  
0  0.315146  
1  0.441377  
2  0.410240  
3  0.371032  
4  0.523361  
5  0.457843  


In [18]:
# ---------------------------------------
# 4.4 Save Models & Artifacts
# ---------------------------------------

import os
import joblib

os.makedirs("model", exist_ok=True)

joblib.dump(trained_models, "model/churn_models.pkl")
joblib.dump(scaler, "model/scaler.pkl")
joblib.dump(X.columns.tolist(), "model/feature_names.pkl")
joblib.dump(results_df, "model/model_metrics.pkl")
joblib.dump(THRESHOLD, "model/decision_threshold.pkl")



['model/decision_threshold.pkl']

ModuleNotFoundError: No module named 'streamlit'

In [21]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split

# ---------------------------------------
# 1. Load raw dataset
# ---------------------------------------
df = pd.read_csv("Churn_Modelling.csv")

# ---------------------------------------
# 2. Drop identifier columns
# ---------------------------------------
df.drop(columns=["CustomerId", "Surname"], inplace=True)

# ---------------------------------------
# 3. Feature Engineering
# ---------------------------------------
df["Balance_to_Salary_Ratio"] = df["Balance"] / (df["EstimatedSalary"] + 1)
df["Tenure_Age_Ratio"] = df["Tenure"] / (df["Age"] + 1)

# ---------------------------------------
# 4. Encode categorical variables
# ---------------------------------------
df = pd.get_dummies(df, columns=["Geography", "Gender"], drop_first=True)

# ---------------------------------------
# 5. Load feature names used during training
# ---------------------------------------
feature_names = joblib.load("model/feature_names.pkl")

# ---------------------------------------
# 6. Ensure column consistency
# ---------------------------------------
for col in feature_names:
    if col not in df.columns:
        df[col] = 0  # add missing columns

df = df[feature_names + ["Exited"]]

# ---------------------------------------
# 7. Create TEST dataset only (20%)
# ---------------------------------------
_, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Exited"],
    random_state=42
)

# ---------------------------------------
# 8. Save processed test CSV
# ---------------------------------------
test_df.to_csv("processed_test_data.csv", index=False)

print("✅ processed_test_data.csv created successfully!")
print("Upload this file to Streamlit app.")


✅ processed_test_data.csv created successfully!
Upload this file to Streamlit app.
