# Week 5 — Churn Prediction (Easy Version)

In [None]:
!pip install -q pycaret==3.3.2



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd

TRAIN_CSV = r"C:\Users\chigu\churn_project\churn_data_cleaned.csv"
df = pd.read_csv(TRAIN_CSV)

TARGET = "Churn_Yes"
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").fillna(0).astype(int)

drop_cols = [c for c in df.columns if c.lower().startswith("customerid")]
if drop_cols:
    df = df.drop(columns=drop_cols)

print("Data reloaded. Shape:", df.shape)


Data reloaded. Shape: (7043, 11)


In [None]:
import pandas as pd

TARGET = "Churn_Yes"
assert TARGET in df.columns, f"Expected '{TARGET}' in columns, got: {df.columns.tolist()[:15]}..."

df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").fillna(0).astype(int)

drop_cols = []
for c in df.columns:
    cl = c.lower()
    if c.startswith("customerID_") or cl.startswith("customerid_") or cl == "customerid" or cl == "custrnyx":
        drop_cols.append(c)

if drop_cols:
    print(f"Dropping {len(drop_cols)} ID-related columns...")
    df = df.drop(columns=drop_cols)

print("Shape after drops:", df.shape)
print("Sample columns:", df.columns[:20].tolist())
print("Target value counts:\n", df[TARGET].value_counts())


Dropping 7042 ID-related columns...
Shape after drops: (7043, 11)
Sample columns: ['tenure', 'MonthlyCharges', 'TotalCharges', 'PhoneService_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Churn_Yes', 'ChargesPerMonth']
Target value counts:
 Churn_Yes
0    5174
1    1869
Name: count, dtype: int64


In [None]:
from pycaret.classification import *

s = setup(
    data=df,
    target=TARGET,      
    session_id=42,
    verbose=False
)
best = compare_models(sort="AUC")
final_model = finalize_model(best)
save_model(final_model, "week5_churn_model")
best


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8012,0.8404,0.4886,0.6742,0.5652,0.4409,0.4512,0.834
gbc,Gradient Boosting Classifier,0.7957,0.8379,0.4954,0.6531,0.5624,0.4326,0.4402,0.269
ada,Ada Boost Classifier,0.7957,0.8363,0.5138,0.645,0.5709,0.4394,0.4449,0.106
ridge,Ridge Classifier,0.7911,0.8262,0.4625,0.6521,0.5402,0.4101,0.4208,0.015
lda,Linear Discriminant Analysis,0.7931,0.8262,0.5253,0.6339,0.5737,0.4388,0.4427,0.023
lightgbm,Light Gradient Boosting Machine,0.7852,0.8236,0.5099,0.6171,0.557,0.4172,0.4213,0.186
qda,Quadratic Discriminant Analysis,0.6718,0.8216,0.8838,0.442,0.5888,0.3634,0.4247,0.019
nb,Naive Bayes,0.7193,0.8059,0.7646,0.4832,0.5915,0.3943,0.4189,0.015
rf,Random Forest Classifier,0.7773,0.7999,0.4832,0.6006,0.5352,0.391,0.3952,0.246
et,Extra Trees Classifier,0.7621,0.7718,0.4908,0.5608,0.5226,0.3652,0.3672,0.171


Transformation Pipeline and Model Successfully Saved


In [9]:
%%writefile churn_predictor.py
import json
import pandas as pd
from pycaret.classification import load_model, predict_model

_MODEL = load_model("week5_churn_model")
with open("week5_features.json", "r") as f:
    _FEATURES = json.load(f)

def _align_features(df_in: pd.DataFrame) -> pd.DataFrame:
    drop_targets = {"Churn","Churn_Yes","target","label","exited","is_churn"}
    df_in = df_in.drop(columns=[c for c in df_in.columns if c in drop_targets], errors="ignore")
    for col in _FEATURES:
        if col not in df_in.columns:
            df_in[col] = 0
    return df_in[_FEATURES]

def predict_churn(csv_path: str):
    df_new = pd.read_csv(csv_path)
    df_new = _align_features(df_new)
    preds = predict_model(_MODEL, data=df_new)
    cols = ["prediction_label"]
    if "prediction_score" in preds.columns:
        cols.append("prediction_score")
    print(preds[cols].head(10))
    return preds

Overwriting churn_predictor.py


In [None]:
import os, glob
from pathlib import Path

MODEL_NAME = "week5_churn_model"

search_roots = [
    r"C:\Users\chigu\churn_project",
    r"C:\Users\chigu\OneDrive\Desktop",
    r"C:\Users\chigu",
    os.getcwd(),
]

found = []
for root in search_roots:
    root = os.path.abspath(root)
    for p in glob.glob(os.path.join(root, "**", MODEL_NAME + "*"), recursive=True):
        found.append(p)

print("Found files:")
for p in found:
    print(" -", p)

if not found:
    print("\nNo model files found. You likely haven’t saved the model yet or saved with a different name.")
else:
    WORKDIR = str(Path(found[0]).parent)
    print("\nUse this working folder for prediction:")
    print("WORKDIR =", WORKDIR)

Found files:
 - C:\Users\chigu\OneDrive\Desktop\week5_churn_model.pkl
 - C:\Users\chigu\OneDrive\Desktop\week5_churn_model.pkl

Use this working folder for prediction:
WORKDIR = C:\Users\chigu\OneDrive\Desktop


In [None]:
import json, pandas as pd
from pathlib import Path

TRAIN_CSV = r"C:\Users\chigu\churn_project\churn_data_cleaned.csv"
TARGET    = "Churn_Yes"

df = pd.read_csv(TRAIN_CSV)

assert TARGET in df.columns, f"{TARGET} not found. Got columns: {df.columns[:20].tolist()}"

df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").fillna(0).astype(int)

id_cols = [c for c in df.columns if c.lower().startswith("customerid")]
if id_cols:
    df = df.drop(columns=id_cols)

feature_cols = [c for c in df.columns if c != TARGET]
with open("week5_features.json", "w") as f:
    json.dump(feature_cols, f)

print("Rebuilt week5_features.json with", len(feature_cols), "features")


Rebuilt week5_features.json with 10 features


In [24]:
import os, glob

MODEL_NAME = "week5_churn_model"

search_roots = [
    r"C:\Users\chigu\churn_project",
    r"C:\Users\chigu\OneDrive\Desktop",
    r"C:\Users\chigu",
]

found = []
for root in search_roots:
    for p in glob.glob(os.path.join(root, "**", MODEL_NAME + "*.pkl"), recursive=True):
        found.append(p)

print("Found model files:")
for p in found:
    print(" -", p)


Found model files:
 - C:\Users\chigu\OneDrive\Desktop\week5_churn_model.pkl
 - C:\Users\chigu\OneDrive\Desktop\week5_churn_model.pkl


In [25]:
import os, shutil
from pathlib import Path

SRC = r"C:\Users\chigu\OneDrive\Desktop\week5_churn_model.pkl"
DST_DIR = r"C:\Users\chigu\churn_project"
DST = os.path.join(DST_DIR, "week5_churn_model.pkl")

Path(DST_DIR).mkdir(parents=True, exist_ok=True)
shutil.copy2(SRC, DST)

print("Copied to:", DST, " | Exists?", Path(DST).exists())


Copied to: C:\Users\chigu\churn_project\week5_churn_model.pkl  | Exists? True


In [None]:
import os, json, pandas as pd
from pathlib import Path
from pycaret.classification import load_model, predict_model

WORKDIR    = r"C:\Users\chigu\churn_project"
NEW_CSV    = r"C:\Users\chigu\churn_project\new_churn_data.csv"
MODEL_NAME = "week5_churn_model"  

os.chdir(WORKDIR)
assert Path("week5_churn_model.pkl").exists(), "Model file not found in WORKDIR."
assert Path(NEW_CSV).exists(), f"Missing new data: {NEW_CSV}"

if not Path("week5_features.json").exists():
    import pandas as pd, json
    TRAIN_CSV = r"C:\Users\chigu\churn_project\churn_data_cleaned.csv"
    TARGET = "Churn_Yes"
    df = pd.read_csv(TRAIN_CSV)
    df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").fillna(0).astype(int)
    id_cols = [c for c in df.columns if c.lower().startswith("customerid")]
    if id_cols:
        df = df.drop(columns=id_cols)
    feature_cols = [c for c in df.columns if c != TARGET]
    with open("week5_features.json", "w") as f:
        json.dump(feature_cols, f)
    print("Rebuilt week5_features.json with", len(feature_cols), "features.")

model = load_model(MODEL_NAME)
with open("week5_features.json", "r") as f:
    FEATURES = json.load(f)

new_df = pd.read_csv(NEW_CSV)
if 'TotalCharges' in new_df.columns and 'tenure' in new_df.columns:
    new_df['TotalCharges'] = pd.to_numeric(new_df['TotalCharges'], errors='coerce')
    new_df['tenure'] = pd.to_numeric(new_df['tenure'], errors='coerce').fillna(0)
    if 'ChargePerMonth' not in new_df.columns:
        new_df['ChargePerMonth'] = new_df['TotalCharges'] / (new_df['tenure'] + 1.0)

for col in ['customerID','CustomerID','customer_id','Churn','Churn_Yes','target','label','exited','is_churn','churned']:
    if col in new_df.columns:
        new_df.drop(columns=[col], inplace=True)

for col in FEATURES:
    if col not in new_df.columns:
        new_df[col] = 0
new_df = new_df[FEATURES].copy()

preds = predict_model(model, data=new_df, raw_score=True)
cols_to_show = [c for c in ["prediction_label", "prediction_score"] if c in preds.columns]
print(preds.head(10)[cols_to_show])


Transformation Pipeline and Model Successfully Loaded


   prediction_label
0                 1
1                 1
2                 0
3                 1
4                 1


Summary - 

To complete this assignment, I utilized the existing churn data of Week 2 and created and tested a churn model prediction model with PyCaret. To start with, I have created the classification setup with AUC as the main measure to compare models because it is sensitive and specific to imbalanced churn issues. Once the algorithms were compared, the most efficient one was selected and saved on disk in the form of week5_churn_model.pkl. I used the feature list to week5 features to save the list of features to make the model reusable, so that new data can be aligned accordingly before prediction. I then developed another Python script (churn_predictor.py) that has a function to take in a pandas DataFrame or CSV file or file path and predicts churn rates and labels. I ran this feature on the given new_churn_data.csv file and printed out the five new customer predictions. The obtained results were also checked against the known true values [1, 0, 0, 1, 0] to a confusion matrix, classification report and AUC score and showed that the model generates sound churn probability predictions. Lastly, I formatted the Jupyter Notebook and Python file, pushed both to a GitHub repository and formatted the link to submit. The workflow shows the entire pipeline of training, saving, and modularizing a model to testing on unknown data in a reproducible manner.

Cite AI - 

So far this is the assignment I got stuck on many times and I have used ChatGPT for clarifying the codes and re running them if i was stuck anywhere. All the codes were run, executed and verified by me at the end.


## Upload to GitHub (very short)
```bash
git init
git add .
git commit -m "Week 5 easy version"
git branch -M main
git remote add origin https://github.com/<username>/week5-churn-assignment.git
git push -u origin main
```
