In [None]:
# ====== STEP 1: Install & Imports ======
!pip install -q scikit-learn imbalanced-learn pandas numpy matplotlib seaborn joblib

import pandas as pd, numpy as np, joblib, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix, RocCurveDisplay
from google.colab import files

# ====== STEP 2: Load Data (upload your CSV) ======
# Prefer a CSV that already sits in the working directory; otherwise fall back
# to the Colab upload widget and read the first file the user provides.
try:
    df = pd.read_csv("Telco-Customer-Churn.csv")
except FileNotFoundError:
    print("Upload your Telco churn CSV (e.g., Telco-Customer-Churn.csv)")
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back from the upload dialog: fail with a readable
        # message instead of an IndexError from indexing an empty list.
        raise FileNotFoundError("No CSV was uploaded; cannot load the churn data.") from None
    fname = next(iter(uploaded))  # first uploaded file name
    df = pd.read_csv(fname)

print("Rows, Cols:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is actually shown.
print(df.head())

# ====== STEP 3: Basic Cleaning ======
# Known issue: TotalCharges sometimes has spaces -> coerce to numeric.
# Entries that cannot be parsed become NaN.
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop obvious ID-like columns if present; names that are absent are skipped.
df = df.drop(columns=["customerID", "CustomerID", "CustomerId"], errors="ignore")

# The target column is mandatory: fail early with a clear message rather than
# on the first df["Churn"] access further down.
if "Churn" not in df.columns:
    raise KeyError("Expected a 'Churn' target column in the input data")

# Drop rows whose target is missing
df = df.dropna(subset=["Churn"])

# Convert target to binary {0,1}.
# Only text-like targets need the yes/no mapping. Comparing a dtype with
# `np.number` is always unequal, so use pandas' dtype predicate instead.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_text = df["Churn"].astype(str).str.strip().str.lower()
    # Labels other than yes/no keep their original value, so inputs such as
    # "1"/"0" still convert below.
    df["Churn"] = churn_text.map({"yes": 1, "no": 0}).fillna(df["Churn"])

# astype(int) raises ValueError if any label could not be interpreted.
y = df["Churn"].astype(int)
X = df.drop(columns=["Churn"])

# Split the feature columns by dtype: numeric ones first, everything else is
# treated as categorical (original column order is kept in both lists).
numeric_frame = X.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()
cat_cols = [name for name in X.columns if name not in num_cols]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)

# ====== STEP 4: Preprocess + Models as Pipelines ======
def _make_preprocessor():
    """Return a fresh, unfitted ColumnTransformer for the feature columns.

    Numeric columns (``num_cols``) are median-imputed and then standardized;
    categorical columns (``cat_cols``) are one-hot encoded, ignoring
    categories that were not seen during fit.
    """
    # Coercing TotalCharges to numeric can leave NaN values, and
    # LogisticRegression rejects NaN input, so impute before scaling.
    # Doing it inside the pipeline keeps the statistic fitted per CV fold.
    numeric_steps = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])
    return ColumnTransformer(
        transformers=[
            ("num", numeric_steps, num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ]
    )


preprocess = _make_preprocessor()

log_reg = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=None))
])

# The forest gets its own preprocessor instance so that fitting one pipeline
# directly never overwrites the fitted state used by the other.
rf = Pipeline(steps=[
    ("prep", _make_preprocessor()),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
])

# ====== STEP 5: Train/Validation Split ======
# Hold out 20% of the rows, stratified on the target, with a fixed seed so
# the split is reproducible.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ====== STEP 6: Hyperparameter Search (GridSearchCV) ======
# Both searches use the same protocol: 5-fold CV scored by ROC-AUC, run on
# all available cores.
search_options = {"scoring": "roc_auc", "cv": 5, "n_jobs": -1}

param_grid_logreg = dict(
    clf__C=[0.1, 1.0, 3.0],
    clf__penalty=["l2"],
    clf__solver=["lbfgs", "liblinear"],
)
param_grid_rf = dict(
    clf__n_estimators=[200, 400],
    clf__max_depth=[None, 8, 16],
    clf__min_samples_split=[2, 5],
)

# Logistic regression search first, then the random forest search.
gs_logreg = GridSearchCV(log_reg, param_grid_logreg, **search_options)
gs_logreg.fit(X_train, y_train)

gs_rf = GridSearchCV(rf, param_grid_rf, **search_options)
gs_rf.fit(X_train, y_train)

# ====== STEP 7: Pick Best Model ======
# Logistic regression is listed first, so it is kept unless the random forest
# has a strictly higher cross-validated ROC-AUC.
candidates = [
    ("LogisticRegression", gs_logreg),
    ("RandomForest", gs_rf),
]
best_name, best_search = max(candidates, key=lambda entry: entry[1].best_score_)
best_model = best_search.best_estimator_
best_auc = best_search.best_score_

print(f"Best CV Model: {best_name} | CV ROC-AUC: {best_auc:.4f}")

# ====== STEP 8: Evaluate on Test ======
# Positive-class probabilities, turned into hard labels at a 0.5 threshold.
probs = best_model.predict_proba(X_test)[:, 1]
at_or_above_threshold = probs >= 0.5
preds = at_or_above_threshold.astype(int)

# Threshold-based metrics use the hard labels; ROC-AUC uses the raw scores.
acc = accuracy_score(y_test, preds)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test,
    preds,
    average="binary",
    zero_division=0,
)
auc = roc_auc_score(y_test, probs)

report_lines = [
    f"Test Accuracy: {acc:.4f}",
    f"Test Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}",
    f"Test ROC-AUC: {auc:.4f}",
]
print("\n".join(report_lines))

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, preds)
cm_axes = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("Actual")
plt.show()

# ROC curve computed from the fitted pipeline on the held-out rows.
roc_display = RocCurveDisplay.from_estimator(best_model, X_test, y_test)
roc_display.ax_.set_title("ROC Curve")
plt.show()

# ====== STEP 9: Save Pipeline ======
# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
model_path = "churn_pipeline.joblib"
joblib.dump(best_model, model_path)
print(f"Saved: {model_path}")

# ====== STEP 10: Example Inference ======
# Score the first held-out row; the double brackets keep it a one-row
# DataFrame, which is what the pipeline expects as input.
sample = X_test.iloc[[0]].copy()
print("Sample input row:\n", sample)

sample_scores = best_model.predict_proba(sample)
churn_probability = sample_scores[0, 1]  # column 1 = positive (churn) class
print("Predicted churn probability:", churn_probability)



Upload your Telco churn CSV (e.g., Telco-Customer-Churn.csv)
