In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, average_precision_score,
                             classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Configuration: where the data lives and which column is the label.
# BASE_DIR can be overridden through the CHURN_DATA_DIR environment variable so
# the notebook runs on machines other than the author's; when the variable is
# not set, the original absolute path is used unchanged.
BASE_DIR = os.environ.get(
    "CHURN_DATA_DIR",
    r"/Users/binga17/Downloads/MSBA/3rd sem/Financial Analytics/Assignments",
)
FILE_NAME = "online_retail_customer_churn.xlsx"  # workbook inside BASE_DIR
TARGET_COL = "Target_Churn"                      # name of the label column

In [36]:
# Load the dataset, choosing the pandas reader from the file extension.
path = os.path.join(BASE_DIR, FILE_NAME)
lowered_path = path.lower()

# (accepted suffixes, reader) pairs, checked in order; the first match wins.
readers = (
    ((".xlsx", ".xls"), pd.read_excel),
    ((".csv",), pd.read_csv),
)
for suffixes, reader in readers:
    if lowered_path.endswith(suffixes):
        df = reader(path)
        break
else:
    # No suffix matched: refuse to guess the format.
    raise ValueError("Use .xlsx/.xls or .csv")

# Quick sanity check: dimensions and (at most) the first 20 column names.
print("Shape:", df.shape)
print("Columns:", list(df.columns[:20]))



Shape: (1000, 15)
Columns: ['Customer_ID', 'Age', 'Gender', 'Annual_Income', 'Total_Spend', 'Years_as_Customer', 'Num_of_Purchases', 'Average_Transaction_Amount', 'Num_of_Returns', 'Num_of_Support_Contacts', 'Satisfaction_Score', 'Last_Purchase_Days_Ago', 'Email_Opt_In', 'Promotion_Response', 'Target_Churn']


In [38]:
#making the target binary
def to01(v):
    """Map a raw target value to 1, 0, or NaN.

    Parameters
    ----------
    v : scalar
        Raw label. Text is compared after stripping whitespace and
        lower-casing.

    Returns
    -------
    int or float
        1 for "1"/"yes"/"true"/"churn"/"left", 0 for
        "0"/"no"/"false"/"stay"/"active". Any other value that parses as a
        number equal to exactly 0 or 1 (e.g. 1.0, "0.0") is mapped to that
        integer. Everything else, including missing values, yields np.nan.
    """
    if pd.isna(v):
        return np.nan

    s = str(v).strip().lower()
    if s in {"1", "yes", "true", "churn", "left"}:
        return 1
    if s in {"0", "no", "false", "stay", "active"}:
        return 0

    # Numeric fallback: a label column stored as float stringifies to
    # "1.0"/"0.0", which the sets above do not contain. Parse it instead of
    # discarding the row.
    try:
        num = float(s)
    except ValueError:
        return np.nan
    # Only exact 0/1 are valid labels; nan/inf/other numbers fall through.
    if num in (0.0, 1.0):
        return int(num)
    return np.nan

# Encode the target as 0/1, then keep only rows whose label could be encoded.
y_encoded = df[TARGET_COL].map(to01)
has_label = y_encoded.notna()

df = df.loc[has_label].copy()
y = y_encoded.loc[has_label].astype(int)

In [40]:
# Feature matrix: every column except the target.
X = df.drop(columns=[TARGET_COL])

# Drop identifier-style columns. Names are compared lower-cased, so the
# mixed-case "InvoiceNo" entry can never match on its own ("invoiceno" does).
id_like = {"customerid","customer_id","id","invoice","invoiceno","InvoiceNo"}
id_columns = [name for name in X.columns if name.lower() in id_like]
X = X.drop(columns=id_columns, errors="ignore")

# Split the remaining columns into numeric and non-numeric groups.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [name for name in X.columns if name not in num_cols]

# NOTE(review): the fill values below are computed on the full dataset, i.e.
# before the train/test split performed later in the notebook.

# Numeric gaps -> per-column median.
if num_cols:
    medians = X[num_cols].median()
    X[num_cols] = X[num_cols].fillna(medians)

# Categorical gaps -> most frequent value, or "Unknown" if the column has none.
for col in cat_cols:
    if not X[col].isna().any():
        continue
    modes = X[col].mode(dropna=True)
    fill_value = "Unknown" if modes.empty else modes.iloc[0]
    X[col] = X[col].fillna(fill_value)

# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
print("Prepared shape:", X.shape)

Prepared shape: (1000, 15)


In [42]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [44]:
# Candidate classifiers; both are built with class_weight="balanced".
log_clf = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
)
rf_clf = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
)

In [46]:
# Fit both candidates on the training partition only.
log_clf.fit(X=X_train, y=y_train)
rf_clf.fit(X=X_train, y=y_train)

In [48]:
#evaluate
def eval_model(name, model, X_eval=None, y_eval=None):
    """Score a fitted binary classifier and print a short report.

    Parameters
    ----------
    name : str
        Label used in the printed header and stored under the "model" key.
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_eval, y_eval : optional
        Features and true labels to evaluate on. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` are used, which is the original
        behaviour. They must be passed together.

    Returns
    -------
    dict
        Keys: "model", "accuracy", "precision", "recall", "f1", "roc_auc",
        "pr_auc".

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("Pass both X_eval and y_eval, or neither")
    if X_eval is None:
        # Default to the held-out partition defined earlier in the notebook.
        X_eval, y_eval = X_test, y_test

    y_pred = model.predict(X_eval)
    # Second column of predict_proba is used as the positive-class score.
    y_proba = model.predict_proba(X_eval)[:, 1]

    metrics = {
        "model": name,
        # Threshold-based metrics use the hard predictions ...
        "accuracy":  accuracy_score(y_eval, y_pred),
        "precision": precision_score(y_eval, y_pred, zero_division=0),
        "recall":    recall_score(y_eval, y_pred, zero_division=0),
        "f1":        f1_score(y_eval, y_pred, zero_division=0),
        # ... while the ranking metrics use the scores.
        "roc_auc":   roc_auc_score(y_eval, y_proba),
        "pr_auc":    average_precision_score(y_eval, y_proba),
    }

    print(f"\n=== {name} ===")
    print(pd.Series(metrics))
    print("\nClassification report:")
    print(classification_report(y_eval, y_pred, digits=3))
    return metrics

# Evaluate both fitted models on the held-out data.
res = [eval_model("Logistic Regression", log_clf),
       eval_model("Random Forest", rf_clf)]

# Rank the models: PR-AUC first, ties broken by ROC-AUC and then F1.
# (pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.)
res_df = pd.DataFrame(res).sort_values(["pr_auc","roc_auc","f1"], ascending=False)
print("\n>>> Model comparison (best at top):")
print(res_df)

# NOTE(review): in the recorded run both models have ROC-AUC below 0.5, so this
# line only ranks them relative to each other.
print(f"\n>>> Recommended model: {res_df.iloc[0]['model']} (based on PR-AUC, then ROC-AUC)")


=== Logistic Regression ===
model        Logistic Regression
accuracy                   0.485
precision               0.510204
recall                   0.47619
f1                      0.492611
roc_auc                 0.464561
pr_auc                  0.506212
dtype: object

Classification report:
              precision    recall  f1-score   support

           0      0.461     0.495     0.477        95
           1      0.510     0.476     0.493       105

    accuracy                          0.485       200
   macro avg      0.485     0.485     0.485       200
weighted avg      0.487     0.485     0.485       200


=== Random Forest ===
model        Random Forest
accuracy             0.515
precision         0.532258
recall            0.628571
f1                0.576419
roc_auc           0.488421
pr_auc            0.540946
dtype: object

Classification report:
              precision    recall  f1-score   support

           0      0.487     0.389     0.433        95
           1    