In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [4]:
df = df.drop("customerID",axis=1)
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [7]:
cols = ["Partner","Dependents","OnlineSecurity","PhoneService","MultipleLines","OnlineBackup" ,"DeviceProtection","TechSupport","StreamingTV","StreamingMovies","PaperlessBilling"]

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
y = df["Churn"].map({"Yes":1,"No":0})
x = df.drop("Churn",axis=1)

In [57]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [58]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# ------------------------------
# 1. CLEANING STEPS (same as your notebook)
# ------------------------------

# Fix blank TotalCharges
df["TotalCharges"] = df["TotalCharges"].replace(" ", np.nan)
df = df.dropna(subset=["TotalCharges"])
df["TotalCharges"] = df["TotalCharges"].astype(float)

# Merge "No phone service" & "No internet service" → "No"
service_cols = ["MultipleLines", "OnlineSecurity", "OnlineBackup",
                "DeviceProtection", "TechSupport", "StreamingTV",
                "StreamingMovies"]

for col in service_cols:
    df[col] = df[col].replace({"No phone service": "No",
                               "No internet service": "No"})

# Convert target to 0/1
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})


# ------------------------------
# 2. IDENTIFY FEATURE TYPES
# ------------------------------
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]

categorical_features = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod"
]

# ------------------------------
# 3. DEFINE TRANSFORMERS
# ------------------------------
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# ------------------------------
# 4. FINAL PREPROCESSING PIPELINE
# ------------------------------
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier     
#from lightgbm import LGBMClassifier

In [60]:
#pip install xgboost

In [61]:
models = {
    "Decision Tree" : DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost":XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight = (len(y_train) / sum(y_train)),
    eval_metric="logloss"
)
}

In [63]:
y = y.dropna()

In [64]:
for name,model in models.items():
    model.fit(x_train,y_train)
    pred = model.predict(x_test)

    print(f"{name} - Accuracy : {accuracy_score(y_test,pred)}")

ValueError: Input y contains NaN.

In [48]:
from sklearn.metrics import classification_report, roc_auc_score


In [49]:
for name, model in models.items():
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    probs = model.predict_proba(x_test)[:,1]

    print("\n", name)
    print("ROC-AUC:", roc_auc_score(y_test, probs))
    print(classification_report(y_test, preds))



 Decision Tree
ROC-AUC: 0.6468828136728598
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      1033
           1       0.47      0.49      0.48       374

    accuracy                           0.72      1407
   macro avg       0.64      0.65      0.65      1407
weighted avg       0.72      0.72      0.72      1407


 Random Forest
ROC-AUC: 0.8177586180120204
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407


 Logistic Regression
ROC-AUC: 0.8318769380497073
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.62      0.52      0.56       374

    accuracy                           0.79 

In [50]:
import joblib

In [51]:
best_model = models["Logistic Regression"]
best_model.fit(x_train,y_train)

joblib.dump(best_model,"model.pkl")

['model.pkl']

In [53]:
joblib.dump(preprocessing_pipeline, "pipeline.pkl")


['pipeline.pkl']

array([[-0.44032709,  1.03561683, -0.65230493, ..., -0.5253508 ,
         1.40476387, -0.54360352],
       [-0.44032709, -0.9656081 , -0.65230493, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       [-0.44032709, -0.9656081 , -0.65230493, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       ...,
       [-0.44032709,  1.03561683,  1.53302536, ..., -0.5253508 ,
         1.40476387, -0.54360352],
       [ 2.27103902,  1.03561683, -0.65230493, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       [-0.44032709, -0.9656081 , -0.65230493, ..., -0.5253508 ,
        -0.71186341, -0.54360352]], shape=(7032, 27))