In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [4]:
df = pd.read_csv("./datasets/Telco-Customer-Churn.csv")

In [5]:
# learn about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
# drop unnecessary columns
df.drop(["customerID"], axis=1, inplace=True)

In [8]:
# Preprocess columns and handle missing values
df["TotalCharges"] = df["TotalCharges"].replace(' ', 0.)

In [9]:
df = df.astype({"TotalCharges": "float64"})

In [10]:
df.iloc[:, 2:].select_dtypes(include=["object"])

Unnamed: 0,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes
3,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),No
4,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,No
7039,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),No
7040,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
7041,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,Yes


In [11]:
# Yes No Columns
binary_columns = ["Partner", "Dependents", "PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "Churn"]

In [12]:
df[binary_columns].memory_usage(deep=True)

Index                  132
Partner             418939
Dependents          417647
PhoneService        421898
OnlineSecurity      443498
OnlineBackup        443908
DeviceProtection    443901
TechSupport         443523
StreamingTV         444186
StreamingMovies     444211
PaperlessBilling    419708
Churn               417406
dtype: int64

In [13]:
df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})

  df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})


In [14]:
# turn the binary columns into bool
df[binary_columns] = df[binary_columns].apply(lambda x: x.astype(bool))

In [15]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,True,False,1,False,No phone service,DSL,False,True,False,False,False,False,Month-to-month,True,Electronic check,29.85,29.85,False
1,Male,0,False,False,34,True,No,DSL,True,False,True,False,False,False,One year,False,Mailed check,56.95,1889.50,False
2,Male,0,False,False,2,True,No,DSL,True,True,False,False,False,False,Month-to-month,True,Mailed check,53.85,108.15,True
3,Male,0,False,False,45,False,No phone service,DSL,True,False,True,True,False,False,One year,False,Bank transfer (automatic),42.30,1840.75,False
4,Female,0,False,False,2,True,No,Fiber optic,False,False,False,False,False,False,Month-to-month,True,Electronic check,70.70,151.65,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,True,True,24,True,Yes,DSL,True,False,True,True,True,True,One year,True,Mailed check,84.80,1990.50,False
7039,Female,0,True,True,72,True,Yes,Fiber optic,False,True,True,False,True,True,One year,True,Credit card (automatic),103.20,7362.90,False
7040,Female,0,True,True,11,False,No phone service,DSL,True,False,False,False,False,False,Month-to-month,True,Electronic check,29.60,346.45,False
7041,Male,1,True,False,4,True,Yes,Fiber optic,False,False,False,False,False,False,Month-to-month,True,Mailed check,74.40,306.60,True


In [16]:
df["SeniorCitizen"] = df["SeniorCitizen"].astype("bool")

In [17]:
df[binary_columns].memory_usage(deep=True)
# reduction in memory is achieved.

Index                132
Partner             7043
Dependents          7043
PhoneService        7043
OnlineSecurity      7043
OnlineBackup        7043
DeviceProtection    7043
TechSupport         7043
StreamingTV         7043
StreamingMovies     7043
PaperlessBilling    7043
Churn               7043
dtype: int64

In [18]:
df["gender"].memory_usage(deep=True)

436731

In [19]:
df["gender"] = df["gender"].map({"Male": 0, "Female": 1}).astype("bool")
# True means Female and False means Male

In [20]:
# Create the features and targets
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   bool   
 1   SeniorCitizen     7043 non-null   bool   
 2   Partner           7043 non-null   bool   
 3   Dependents        7043 non-null   bool   
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   bool   
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   bool   
 9   OnlineBackup      7043 non-null   bool   
 10  DeviceProtection  7043 non-null   bool   
 11  TechSupport       7043 non-null   bool   
 12  StreamingTV       7043 non-null   bool   
 13  StreamingMovies   7043 non-null   bool   
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   bool   
 16  PaymentMethod     7043 non-null   object 


In [22]:
# Create train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73, stratify=y)

In [23]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
bool_features = X.select_dtypes(include=["bool"]).columns.tolist()
object_features = X.select_dtypes(include=["object"]).columns.tolist()

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", StandardScaler(), numeric_features),
        ("object", OneHotEncoder(handle_unknown="ignore"), object_features),
        ("bool", "passthrough", bool_features)
    ]
)

In [25]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

In [26]:
param_grid = [
    {
        "classifier": [LogisticRegression(class_weight="balanced")],
        "classifier__C": [1],
        "classifier__solver": ["liblinear"]
    },
    {
        "classifier": [RandomForestClassifier(class_weight="balanced")],
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [5, 10]
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": [200, 250],
        "classifier__learning_rate": [0.1],
        "classifier__max_depth": [3, 5]
    },
    {
        "classifier": [SVC(class_weight="balanced")],
        "classifier__C": [0.1, 1],
        "classifier__kernel": ["linear"]
    }
]

In [27]:
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring="f1")
grid.fit(X_train, y_train)

In [28]:
y_pred = grid.best_estimator_.predict(X_test)

In [29]:
print(f"Best Estimator based on F1 Score")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Best Estimator based on F1 Score
Accuracy: 0.7395315826827538
F1 Score: 0.6157068062827226
              precision    recall  f1-score   support

       False       0.90      0.72      0.80      1035
        True       0.51      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.75      1409

[[748 287]
 [ 80 294]]


In [30]:
# find out which features are more important and eliminate the ones with least impact
feature_importances = grid.best_estimator_["classifier"].coef_[0]

In [31]:
feature_names = grid.best_estimator_[:-1].get_feature_names_out()

In [32]:
coef_df = pd.DataFrame({
    "Feature": feature_names,
    "Feature Importance": feature_importances,
    "ABS(Feature Importance)": abs(feature_importances)
}).sort_values(by="ABS(Feature Importance)", ascending=False).reset_index(drop=True)

In [33]:
coef_df["Feature"].head(10)
# Now train the model only with this features.

new_features = ["tenure", "Contract", "TotalCharges", "InternetService", "OnlineSecurity", "PaperlessBilling", "TechSupport", "PhoneService"]

In [34]:
new_X = df[new_features]

In [35]:
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, y, test_size=0.2, random_state=73, stratify=y)

In [36]:
numeric_features = new_X.select_dtypes(include=["int64", "float64"]).columns.tolist()
bool_features = new_X.select_dtypes(include=["bool"]).columns.tolist()
object_features = new_X.select_dtypes(include=["object"]).columns.tolist()

In [37]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", StandardScaler(), numeric_features),
        ("object", OneHotEncoder(handle_unknown="ignore"), object_features),
        ("bool", "passthrough", bool_features)
    ]
)

In [38]:
new_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [39]:
new_param_grid = [
    {
        "classifier": [LogisticRegression(class_weight="balanced")],
        "classifier__penalty": ["l2"],  # type of regularization
        "classifier__C": [0.01, 0.1, 1, 10, 100],
        "classifier__solver": ["liblinear", "lbfgs"]
    },
    {
        "classifier": [RandomForestClassifier(class_weight="balanced")],
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [5, 8, 10, 15]
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": [200, 250],
        "classifier__learning_rate": [0.1, 0.5],
        "classifier__max_depth": [3, 5, 10]
    },
    {
        "classifier": [SVC(class_weight="balanced")],
        "classifier__C": [0.1, 1],
        "classifier__kernel": ["linear"]
    }
]

In [40]:
grid = GridSearchCV(new_pipeline, new_param_grid, cv=5, scoring="f1")
grid.fit(new_X_train, new_y_train)

In [41]:
new_y_pred = grid.best_estimator_.predict(new_X_test)

In [42]:
print(f"Best Estimator based on F1 Score")
print("Accuracy:", accuracy_score(new_y_test, new_y_pred))
print("F1 Score:", f1_score(new_y_test, new_y_pred))
print(classification_report(new_y_test, new_y_pred))
print(confusion_matrix(new_y_test, new_y_pred))

Best Estimator based on F1 Score
Accuracy: 0.7743080198722498
F1 Score: 0.6410835214446953
              precision    recall  f1-score   support

       False       0.90      0.78      0.84      1035
        True       0.55      0.76      0.64       374

    accuracy                           0.77      1409
   macro avg       0.73      0.77      0.74      1409
weighted avg       0.81      0.77      0.78      1409

[[807 228]
 [ 90 284]]


There is slighltly an improvement

In [58]:
cv_df = pd.DataFrame(grid.cv_results_)

In [60]:
cv_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_classifier__max_depth,param_classifier__n_estimators,...,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009292,0.003128,0.004089,0.001991,LogisticRegression(class_weight='balanced'),0.01,l2,liblinear,,,...,,{'classifier': LogisticRegression(class_weight...,0.623545,0.633288,0.630769,0.621053,0.60184,0.622099,0.011081,13
1,0.004813,0.000726,0.002008,0.000155,LogisticRegression(class_weight='balanced'),0.01,l2,lbfgs,,,...,,{'classifier': LogisticRegression(class_weight...,0.627907,0.626016,0.633075,0.619236,0.599736,0.621194,0.011608,14
2,0.005098,7.4e-05,0.001938,3e-05,LogisticRegression(class_weight='balanced'),0.1,l2,liblinear,,,...,,{'classifier': LogisticRegression(class_weight...,0.629962,0.629183,0.629583,0.613368,0.599212,0.620262,0.012257,16
3,0.005887,0.000224,0.00195,3.5e-05,LogisticRegression(class_weight='balanced'),0.1,l2,lbfgs,,,...,,{'classifier': LogisticRegression(class_weight...,0.629962,0.630027,0.63038,0.613368,0.599476,0.620643,0.012415,15
4,0.005248,0.000101,0.001943,2e-05,LogisticRegression(class_weight='balanced'),1.0,l2,liblinear,,,...,,{'classifier': LogisticRegression(class_weight...,0.629299,0.638411,0.627204,0.627249,0.60129,0.624691,0.012409,11


In [56]:
results = pd.DataFrame(grid.cv_results_)
(results.sort_values(by="mean_test_score", ascending=False)
    [["param_classifier", "param_classifier__max_depth", "param_classifier__n_estimators", "mean_test_score"]].head(10))

Unnamed: 0,param_classifier,param_classifier__max_depth,param_classifier__n_estimators,mean_test_score
14,RandomForestClassifier(class_weight='balanced'),8.0,200.0,0.630011
10,RandomForestClassifier(class_weight='balanced'),5.0,100.0,0.628886
15,RandomForestClassifier(class_weight='balanced'),8.0,300.0,0.626628
12,RandomForestClassifier(class_weight='balanced'),5.0,300.0,0.626305
6,LogisticRegression(class_weight='balanced'),,,0.62569
7,LogisticRegression(class_weight='balanced'),,,0.625534
8,LogisticRegression(class_weight='balanced'),,,0.625529
13,RandomForestClassifier(class_weight='balanced'),8.0,100.0,0.625413
9,LogisticRegression(class_weight='balanced'),,,0.625204
5,LogisticRegression(class_weight='balanced'),,,0.625022


In [75]:
(results.sort_values(by="mean_fit_time", ascending=False)
    [["param_classifier", "param_classifier__max_depth", "param_classifier__n_estimators", "mean_fit_time"]]
)

Unnamed: 0,param_classifier,param_classifier__max_depth,param_classifier__n_estimators,mean_fit_time
27,GradientBoostingClassifier(),10.0,250.0,1.96368
33,GradientBoostingClassifier(),10.0,250.0,1.942193
26,GradientBoostingClassifier(),10.0,200.0,1.640995
32,GradientBoostingClassifier(),10.0,200.0,1.556067
25,GradientBoostingClassifier(),5.0,250.0,0.86818
31,GradientBoostingClassifier(),5.0,250.0,0.854686
24,GradientBoostingClassifier(),5.0,200.0,0.713975
30,GradientBoostingClassifier(),5.0,200.0,0.683216
23,GradientBoostingClassifier(),3.0,250.0,0.55377
29,GradientBoostingClassifier(),3.0,250.0,0.531175


- However it is obvious that the Random Forest Classifier has the second longest **mean_fit_time** among all classifiers.
- It takes longer to train the Random Forest Classifier, but it gives slightly better model compared to the old Logistic Regression model.