In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer




In [None]:
"""
Bringing it together:

- Try to find out why customers churn
- Can we predict which customers will churn?
- Hint: the following can be useful:
     - Coefficients from logistic regression models
     - Feature importances from tree-based models

"""

In [65]:
# Load the anonymised customer table from the project's data folder (path is relative to the notebook).
cust_df = pd.read_csv(filepath_or_buffer="./data/anon_cust_data.csv")

In [66]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
cust_df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [67]:
# Inspect column dtypes and non-null counts of the raw frame.
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        7043 non-null   int64  
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [68]:
# Count missing values per column.
# NOTE(review): this only counts real nulls; blank or non-numeric strings (if any)
# would not show up here until the column is coerced to a numeric dtype.
cust_df.isnull().sum()

Unnamed: 0          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [69]:
# Basic cleaning of the raw frame. Every step is written so that re-running this
# cell on an already-cleaned `cust_df` is a no-op instead of an error or data loss.

# Drop the leftover CSV index column; errors="ignore" avoids a KeyError on re-run.
cust_df = cust_df.drop(columns="Unnamed: 0", errors="ignore")

# Force TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
cust_df["TotalCharges"] = pd.to_numeric(cust_df["TotalCharges"], errors="coerce")

# Encode the target as 1 = churned ("Yes"), 0 = retained ("No").
# Guarded because mapping an already-encoded 0/1 column through this dict
# would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(cust_df["Churn"]):
    cust_df["Churn"] = cust_df["Churn"].map({"Yes": 1, "No": 0})



In [70]:
# Re-check the schema after cleaning: the "Unnamed: 0" column should be gone.
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [71]:
# Target vector: the churn label column.
y = cust_df["Churn"]
# Feature matrix: every remaining column.
X = cust_df.drop(columns="Churn")

In [72]:
# Confirm the feature matrix no longer contains the Churn column.
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [73]:
# Confirm the target holds the 0/1 encoded churn labels.
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [74]:
# Split the feature names by dtype so each group can get its own preprocessing.

# Integer and float columns are treated as numeric features.
num_cols = X.select_dtypes(include=[np.int64, np.float64]).columns
# Object (string) columns are treated as categorical features.
cat_cols = X.select_dtypes(include=[object]).columns

In [75]:
# Show which columns were classified as categorical.
cat_cols

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [76]:
# Show which columns were classified as numeric.
num_cols

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [None]:
# Hold out 20% of the customers for testing.
# stratify=y keeps the churn / non-churn ratio the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Numeric branch of the preprocessing: fill gaps with the column median,
# then standardise each column to zero mean and unit variance.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Categorical branch of the preprocessing: fill gaps with the most frequent
# label, then one-hot encode.
#
# drop="first" removes one indicator per feature (the first category becomes
# the reference level), which avoids perfectly collinear dummy columns.
# NOTE(review): combined with handle_unknown="ignore", a category unseen during
# fit is encoded as all zeros, i.e. it is treated the same as the dropped
# reference category -- keep this in mind when reading coefficients.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

In [80]:
# Route each column group through its own transformer and concatenate the results.
# Output feature names are prefixed with the branch name ("num__", "cat__").
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [81]:
# Logistic regression on top of the shared preprocessing.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_model = Pipeline(
    steps=[
        ("prep", preprocess),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split only; the test split stays untouched.
log_model.fit(X_train, y_train)

In [None]:
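# NOTE: superseded manual encoding, kept for reference only (dead code).
# Categorical encoding is handled by the OneHotEncoder inside `preprocess`;
# the integer codes below would also impose an artificial order on nominal
# features such as PaymentMethod.
# X["gender"] = X["gender"].map({"Female": 1, "Male": 0})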
# X["Partner"] = X["Partner"].map({"Yes": 1, "No": 0})
# X["Dependents"] = X["Dependents"].map({"Yes": 1, "No": 0})
# X["PhoneService"] = X["PhoneService"].map({"Yes": 1, "No": 0})
# X["MultipleLines"] = X["MultipleLines"].map({"Yes": 1, "No": 0, "No phone service": 0})
# X["InternetService"] = X["InternetService"].map({"DSL": 1, "Fiber optic": 2, "No": 0})
# X["OnlineSecurity"] = X["OnlineSecurity"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["OnlineBackup"] = X["OnlineBackup"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["DeviceProtection"] = X["DeviceProtection"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["TechSupport"] = X["TechSupport"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["StreamingTV"] = X["StreamingTV"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["StreamingMovies"] = X["StreamingMovies"].map({"Yes": 1, "No": 0, "No internet service": 0})
# X["Contract"] = X["Contract"].map({"Month-to-month": 0, "One year": 1, "Two year": 2})
# X["PaperlessBilling"] = X["PaperlessBilling"].map({"Yes": 1, "No": 0})
# X["PaymentMethod"] = X["PaymentMethod"].map({"Electronic check": 0, "Mailed check": 1, "Bank transfer (automatic)": 2, "Credit card (automatic)": 3})

In [None]:
# Rank the logistic-regression coefficients to see what drives churn.
# Positive coefficients raise the log-odds of churn (class 1), negative ones
# lower it. Numeric inputs were standardised, and each one-hot coefficient is
# relative to that feature's dropped first category.

# Names of the columns produced by the fitted preprocessing step.
feature_names = log_model.named_steps["prep"].get_feature_names_out()
# Binary problem -> coef_ has a single row.
coefs = log_model.named_steps["logreg"].coef_[0]

# One sort is enough; the original chained the identical sort twice.
coef_df = (
    pd.DataFrame({"feature": feature_names, "coef": coefs})
    .sort_values("coef", ascending=False)
)

coef_df.head(20)

Unnamed: 0,feature,coef
10,cat__InternetService_Fiber optic,1.199218
3,num__TotalCharges,0.513253
21,cat__StreamingTV_Yes,0.383225
23,cat__StreamingMovies_Yes,0.382622
28,cat__PaymentMethod_Electronic check,0.381086
26,cat__PaperlessBilling_Yes,0.372843
9,cat__MultipleLines_Yes,0.36475
29,cat__PaymentMethod_Mailed check,0.072377
0,num__SeniorCitizen,0.054617
17,cat__DeviceProtection_Yes,0.037983


In [None]:
# Random forest on the same preprocessing, as a tree-based comparison to the
# logistic model. random_state fixes the forest's randomness for reproducibility.
rf_model = Pipeline(
    steps=[
        ("prep", preprocess),
        ("rf", RandomForestClassifier(n_estimators=300, random_state=42)),
    ]
)

# Fit on the training split only.
rf_model.fit(X_train, y_train)

In [None]:
# Rank features by the random forest's impurity-based importance.

importances = rf_model.named_steps["rf"].feature_importances_
# Take the feature names from this pipeline's own fitted preprocessor rather
# than reusing a variable left over from the logistic-regression cell, so the
# names always line up with `importances` regardless of which cells were run.
rf_feature_names = rf_model.named_steps["prep"].get_feature_names_out()

feat_imp_df = pd.DataFrame({
    "feature": rf_feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feat_imp_df.head(20)

In [None]:
# Histograms of the numeric predictors in the training split.
# DataFrame.hist creates its own figure, so no plt.figure() call is needed --
# a preceding one would only leave an empty extra figure in the output.
axes = X_train.hist(bins=30, figsize=(12, 8))
# hist returns an array of Axes; all of them belong to the same figure.
fig = axes.flat[0].figure
fig.suptitle("Distributions of Numeric Training Predictors")
fig.tight_layout()
plt.show()


In [None]:
# Histograms of the numeric predictors in the test split.
# DataFrame.hist creates its own figure, so no plt.figure() call is needed --
# a preceding one would only leave an empty extra figure in the output.
axes = X_test.hist(bins=30, figsize=(12, 8))
# hist returns an array of Axes; all of them belong to the same figure.
fig = axes.flat[0].figure
fig.suptitle("Distributions of Numeric Test Predictors")
fig.tight_layout()
plt.show()


In [None]:
# Histogram of the 0/1 churn target in the training split, drawn on an
# explicitly created figure and axes.
fig, ax = plt.subplots(figsize=(12, 8))
y_train.hist(bins=30, ax=ax)
fig.suptitle("Distributions of Target Feature")
fig.tight_layout()

In [None]:
# Bar charts of the category frequencies in the training split.

# Keep only the categorical predictors; `cat_cols` is the dtype-based column
# list, which replaces the hard-coded list of numeric columns to drop.
cat_cols_copy = X_train[cat_cols].copy()

for col in cat_cols_copy.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    # Plot from the training subset being iterated; the original plotted the
    # full `cust_df`, mixing test rows into a view built from X_train.
    cat_cols_copy[col].value_counts().plot(kind="bar", ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# Check dtypes and non-null counts of the categorical training subset.
cat_cols_copy.info()