In [None]:
# suppress warnings to keep the notebook clean (like HW1)
import warnings
warnings.filterwarnings("ignore")

# data / utils
import pandas as pd
import numpy as np

# modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibrationDisplay
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, classification_report, brier_score_loss
)

# neighbors for the recommendation section
from sklearn.neighbors import NearestNeighbors

# Homework 2
## Data

In [None]:
# === DATA LOADING ===
# Update these if your instructor posted different paths.
CHURN_URL = "https://raw.githubusercontent.com/ywen2021/CPSC392/main/Data/streaming.csv"
NEW_CUSTOMERS_URL = "https://raw.githubusercontent.com/ywen2021/CPSC392/refs/heads/main/Data/streamingNEW.csv"
# Optional: a favorites dataset. If not provided, we'll still do KNN on demographics/usage.
FAVORITES_URL = "https://raw.githubusercontent.com/ywen2021/CPSC392/refs/heads/main/Data/streamingFILMS.csv"

# Ordered plan levels used for the categorical cast below.
# NOTE(review): the level codes and their order are taken as given -- confirm against the data dictionary.
PLAN_LEVELS = ["T", "P", "A", "B"]

# Read + quick clean: drop rows with any missing value and renumber the index.
# Done as one chained expression per frame so re-running the cell always starts
# from the freshly downloaded data instead of mutating an existing frame in place.
churn = pd.read_csv(CHURN_URL).dropna().reset_index(drop=True)
new_customers = pd.read_csv(NEW_CUSTOMERS_URL).dropna().reset_index(drop=True)

# The favorites file is optional: keep going without it, but say why it was skipped.
try:
    favorites = pd.read_csv(FAVORITES_URL)
except Exception as err:
    favorites = None  # if not available, we'll skip film-preference features
    print(f"Favorites dataset not loaded ({err}); continuing without film-preference features.")

# (optional) make sure plan is treated as ordered categorical if it exists.
# new_customers is only cast when churn was cast, so both frames keep matching dtypes.
if "plan" in churn.columns:
    plan_order = pd.CategoricalDtype(categories=PLAN_LEVELS, ordered=True)
    frames_with_plan = {"churn": churn}
    if "plan" in new_customers.columns:
        frames_with_plan["new_customers"] = new_customers
    for label, frame in frames_with_plan.items():
        # Values outside the declared categories turn into NaN on astype(); since the
        # dropna above has already run, report them instead of letting them pass silently.
        unexpected = [v for v in frame["plan"].unique() if v not in PLAN_LEVELS]
        if unexpected:
            print(f"{label}: plan values not in {PLAN_LEVELS} will become NaN: {unexpected}")
        frame["plan"] = frame["plan"].astype(plan_order)

churn.head()

## 1. Modeling

In [None]:
# === FEATURE SETUP ===
# Use everything EXCEPT the target 'churn' as X.
assert "churn" in churn.columns, "Expected a 'churn' column (0/1)."

X = churn.drop(columns="churn")
y = churn["churn"].astype(int)

# identify column types
possible_cats = ["gender", "plan", "topgenre", "secondgenre"]
categorical = [c for c in possible_cats if c in X.columns]

# treat "binary" 0/1 cols as numeric; every column not listed as categorical is scaled as numeric
numeric = [c for c in X.columns if c not in categorical]


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Numeric columns are scaled to unit variance without centering
    (``with_mean=False``); categorical columns are one-hot encoded, and
    categories unseen during fit are ignored at transform time. Columns in
    neither list are dropped.

    A fresh instance is built on every call because Pipeline does not clone
    its steps: two pipelines holding the same transformer object would refit
    each other's preprocessing whenever either one is fitted.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(with_mean=False), numeric),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ],
        remainder="drop",
    )


# Stand-alone instance kept under the original name for any cell that refers to it.
preprocess = make_preprocessor()

# === TRAIN / TEST SPLIT (90/10, stratified) ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

# === MODELS ===
# Each pipeline owns its own preprocessor so fitting one never touches the other.
logit = Pipeline([
    ("prep", make_preprocessor()),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None, solver="lbfgs"))
])

gb = Pipeline([
    ("prep", make_preprocessor()),
    ("clf", GradientBoostingClassifier(random_state=42))
])

models = {
    "Logistic Regression": logit,
    "Gradient Boosting": gb
}

def eval_model(name, model, Xtr, ytr, Xte, yte):
    """Fit ``model`` on the training data and report train/test metrics.

    Parameters
    ----------
    name : str
        Label used in the printed report.
    model : estimator
        Object with ``fit`` and either ``predict_proba`` or ``decision_function``.
    Xtr, ytr : training features and binary (0/1) labels.
    Xte, yte : test features and binary (0/1) labels.

    Returns
    -------
    metrics : dict
        Accuracy, precision, recall and ROC AUC for train and test, plus the
        test Brier score under ``"brier_test"`` (NaN when the model only
        provides ``decision_function``, because those scores are not
        probabilities).
    p_te : array
        Positive-class scores on the test set (probabilities when
        ``predict_proba`` is available, raw decision scores otherwise).
    model : the fitted estimator (same object that was passed in).
    """
    model.fit(Xtr, ytr)

    # Scores for ROC AUC & calibration. Probabilities are cut at 0.5; raw
    # decision scores are signed margins, so their natural cut-off is 0.
    has_proba = hasattr(model, "predict_proba")
    if has_proba:
        p_tr = model.predict_proba(Xtr)[:, 1]
        p_te = model.predict_proba(Xte)[:, 1]
        threshold = 0.5
    else:
        p_tr = model.decision_function(Xtr)
        p_te = model.decision_function(Xte)
        threshold = 0.0

    yhat_tr = (p_tr >= threshold).astype(int)
    yhat_te = (p_te >= threshold).astype(int)

    # Brier score is only defined for probabilities in [0, 1].
    brier_test = brier_score_loss(yte, p_te) if has_proba else float("nan")

    metrics = {
        "train_acc": accuracy_score(ytr, yhat_tr),
        "train_prec": precision_score(ytr, yhat_tr, zero_division=0),
        "train_rec": recall_score(ytr, yhat_tr, zero_division=0),
        "train_auc": roc_auc_score(ytr, p_tr),
        "test_acc": accuracy_score(yte, yhat_te),
        "test_prec": precision_score(yte, yhat_te, zero_division=0),
        "test_rec": recall_score(yte, yhat_te, zero_division=0),
        "test_auc": roc_auc_score(yte, p_te),
        "brier_test": brier_test
    }
    print(f"\n{name} RESULTS")
    print("-"*60)
    print(f"TRAIN -> Acc {metrics['train_acc']:.3f} | Prec {metrics['train_prec']:.3f} | "
          f"Rec {metrics['train_rec']:.3f} | AUC {metrics['train_auc']:.3f}")
    print(f"TEST  -> Acc {metrics['test_acc']:.3f} | Prec {metrics['test_prec']:.3f} | "
          f"Rec {metrics['test_rec']:.3f} | AUC {metrics['test_auc']:.3f} | "
          f"Brier {metrics['brier_test']:.3f}")
    return metrics, p_te, model

# Per-model outputs, keyed by the display name used in `models`.
results = {}      # name -> metrics dict
probs_test = {}   # name -> positive-class scores on the test set
fitted = {}       # name -> fitted pipeline

for name, m in models.items():
    metrics, p_te, mdl = eval_model(name, m, X_train, y_train, X_test, y_test)
    results[name] = metrics
    probs_test[name] = p_te
    fitted[name] = mdl

# pick best by TEST AUC
# NOTE(review): choosing the model on the test split makes the reported test
# metrics optimistic; cross-validated AUC on the training split would avoid that.
best_name = max(results, key=lambda k: results[k]["test_auc"])
best_model = fitted[best_name]
print(f"\nChosen model (by test AUC): {best_name}")

# === CALIBRATION (test set) ===
# Show calibration curves in the output cell so you can paste into your report.
# Plotting is best-effort: any failure (e.g. matplotlib missing) is reported, not raised.
try:
    import matplotlib.pyplot as plt
    # One shared Axes: without `ax=`, from_predictions opens a new figure per call,
    # which leaves an empty figure and titles only the last model's plot.
    fig, ax = plt.subplots()
    for name in models:
        CalibrationDisplay.from_predictions(
            y_test, probs_test[name], n_bins=10, name=name, ax=ax
        )
    ax.set_title("Calibration Curves (Test)")
    plt.show()
except Exception as e:
    print("Calibration plot skipped:", e)

## 2. Recommendation System

In [None]:
# Use the chosen model to score NEW customers, then find the 200 highest-risk.
probs_new = best_model.predict_proba(new_customers)[:, 1]
new_scored = new_customers.copy()
new_scored["pred"] = probs_new

# top 200 high-risk customers (or all of them if fewer than 200 were provided)
top_k = min(200, len(new_scored))
high_risk = new_scored.nlargest(top_k, "pred").copy()

# === KNN NEIGHBORS ===
# As instructed, use age, income, meanhourwatched to compute neighbors.
knn_features = ["age", "income", "meanhourwatched"]
# The features are read from the existing customers (to fit) and from the new
# customers (to query), so both frames must provide all of them.
missing_knn = [
    f"{frame_name}.{c}"
    for frame_name, frame in (("churn", churn), ("new_customers", new_customers))
    for c in knn_features
    if c not in frame.columns
]
assert not missing_knn, (
    f"Expected columns age, income, meanhourwatched to exist. Missing: {missing_knn}"
)

# Fit neighbors on the FULL CUSTOMER BASE (trainable space = existing churn dataset)
scaler_knn = StandardScaler()
base_X = scaler_knn.fit_transform(churn[knn_features].values)

# Build KNN model (10 neighbors). Queries come from the NEW customers while the index
# holds only EXISTING customers, so a query point is never its own neighbor and no
# extra neighbor needs to be requested and dropped.
k = 10
nbrs = NearestNeighbors(n_neighbors=k, metric="euclidean").fit(base_X)

# For each high-risk NEW customer, find 10 most similar existing customers.
def neighbor_ids_for_row(row):
    """Return the positions in `churn` of the k customers nearest to `row`.

    `row` is one record that contains the `knn_features` columns. Its feature
    values are scaled with `scaler_knn` and looked up in the fitted `nbrs`
    index. The result is a plain list of integer row positions; if the churn
    DataFrame has a user id column, map the positions to ids here, e.g.
    churn.loc[positions, "user_id"].tolist().
    """
    feature_values = row[knn_features].values
    scaled_query = scaler_knn.transform([feature_values])
    # kneighbors returns (distances, indices); only the indices are needed.
    _dists, positions = nbrs.kneighbors(scaled_query, n_neighbors=k)
    nearest = positions[0]
    return nearest.tolist()

# Attach to every high-risk customer the list of its nearest existing customers.
high_risk["neighbors"] = high_risk.apply(neighbor_ids_for_row, axis=1)

# Show the first few rows like the assignment example (pred + neighbors list).
# Columns are listed in display order; any that are absent from the frame are skipped.
display_order = (
    "gender", "age", "income", "monthssubbed", "plan", "meanhourwatched",
    "competitorsub", "numprofiles", "cancelled", "downgraded", "bundle", "kids",
    "longestsession", "topgenre", "secondgenre", "pred", "neighbors",
)
high_risk_display_cols = [col for col in display_order if col in high_risk.columns]

high_risk[high_risk_display_cols].head(6)