In [3]:
import pandas as pd
# Telco customer churn data: one row per customer, loaded from a CSV that is
# expected to sit in the notebook's working directory.
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first five raw rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# (n_rows, n_columns) of the raw frame.
df.shape

(7043, 21)

In [6]:
# Column labels of the raw frame; the preprocessing cells below refer to these names.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
# customerID is a per-row identifier, not a feature, so leave it out of the
# working frame. `df` itself is left untouched.
data = df.drop(columns="customerID")

In [8]:
# Rebuild the feature frame from the raw `df` on every run so this cell is
# idempotent. Mutating `data` in place meant a second execution hit a KeyError
# on the already-dropped "TotalCharges" column.
prepared = df.drop(columns="customerID")

#convert
# Non-numeric TotalCharges entries are coerced to NaN, then filled with the
# column median.
# NOTE(review): the median (and the quantile bin edges further down) are
# computed on the full dataset, i.e. before the train/test split — confirm
# that this leakage is acceptable.
total_charges = pd.to_numeric(prepared["TotalCharges"], errors="coerce")
prepared["TotalCharges"] = total_charges.fillna(total_charges.median())

#encode
yes_no_cols = ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]
for col in yes_no_cols:
    encoded = prepared[col].map({"Yes": 1, "No": 0})
    # Series.map() silently turns any value missing from the mapping into NaN;
    # fail loudly rather than let NaN features/labels reach the model.
    if encoded.isna().any():
        unexpected = prepared.loc[encoded.isna(), col].unique().tolist()
        raise ValueError(
            f"Column {col!r} has values other than 'Yes'/'No': {unexpected}"
        )
    prepared[col] = encoded

#one-hot encode multi category features
multi_cats = [
    "gender", "MultipleLines", "InternetService", "OnlineSecurity",
    "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
    "StreamingMovies", "Contract", "PaymentMethod"
]

prepared = pd.get_dummies(prepared, columns=multi_cats)

#bin continuous features
# labels=False returns integer bin codes instead of Interval labels.
# include_lowest=True makes the first tenure bin closed on the left, so
# tenure == 0 is kept rather than becoming NaN.
prepared["tenure_bin"] = pd.cut(prepared["tenure"], bins=[0, 6, 12, 24, 48, 72], include_lowest=True, labels=False)
# Quartile bins; duplicates="drop" merges bins whose edges coincide.
prepared["MonthlyCharges_bin"] = pd.qcut(prepared["MonthlyCharges"], q=4, duplicates="drop", labels=False)
prepared["TotalCharges_bin"] = pd.qcut(prepared["TotalCharges"], q=4, duplicates="drop", labels=False)

# Only the binned versions of the continuous columns are kept as features.
data = prepared.drop(columns=["tenure", "MonthlyCharges", "TotalCharges"])

# pd.cut returns NaN for values outside the bin edges; catch that (and any
# other leftover gaps) here instead of inside the model's fit().
assert not data.isna().any().any(), "NaN values remain after preprocessing"


In [9]:
data.head(5)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,gender_Female,gender_Male,MultipleLines_No,MultipleLines_No phone service,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_bin,MonthlyCharges_bin,TotalCharges_bin
0,0,1,0,0,1,0,True,False,False,True,...,True,False,False,False,False,True,False,0,0,0
1,0,0,0,1,0,0,False,True,True,False,...,False,True,False,False,False,False,True,3,1,2
2,0,0,0,1,1,1,False,True,True,False,...,True,False,False,False,False,False,True,0,1,0
3,0,0,0,0,0,0,False,True,False,True,...,False,True,False,True,False,False,False,3,1,2
4,0,0,0,1,1,1,True,False,True,False,...,True,False,False,False,False,True,False,0,2,0


In [10]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = data["Churn"]
X = data.drop(columns="Churn")

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# churn ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [11]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier (smoothing alpha=1.0) on the
# training split; fit() returns the estimator itself, so it can be chained.
model_nb = MultinomialNB(alpha=1.0).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
pred_nb = model_nb.predict(X_test)

# predict_proba returns one column per class; column index 1 is kept as the
# score for the second class (label 1 given the 0/1 encoding of Churn).
test_probabilities = model_nb.predict_proba(X_test)
prob_nb = test_probabilities[:, 1]


In [17]:
from sklearn.metrics import accuracy_score
# Share of held-out customers whose churn label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=pred_nb)
print(accuracy)

0.7324343506032647


In [13]:
# Distinct tenure values in the raw data, in order of first appearance.
# Useful for checking them against the bin edges passed to pd.cut (0 to 72).
df.loc[:, "tenure"].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39])

In [14]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision / recall / F1 computed from the hard predictions.
report_text = classification_report(y_true=y_test, y_pred=pred_nb)
print(report_text)

# ROC-AUC is computed from the predicted probabilities, not the hard labels.
auc_value = roc_auc_score(y_true=y_test, y_score=prob_nb)
print("ROC-AUC:", auc_value)


              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1035
           1       0.50      0.81      0.62       374

    accuracy                           0.73      1409
   macro avg       0.70      0.76      0.71      1409
weighted avg       0.80      0.73      0.75      1409

ROC-AUC: 0.8187876204500245


In [15]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, so for the 0/1
# target the layout is [[tn, fp], [fn, tp]].
cm = confusion_matrix(y_true=y_test, y_pred=pred_nb)

# Unpack the 2x2 matrix row by row into the four named counts.
(tn, fp), (fn, tp) = cm
cm



array([[729, 306],
       [ 71, 303]])