In [348]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [291]:
df = pd.read_csv("./datasets/Telco-Customer-Churn.csv")

In [292]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [293]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [294]:
df.drop(["customerID"], axis=1, inplace=True)

In [295]:
df["TotalCharges"] = df["TotalCharges"].replace(' ', 0.)

In [296]:
df = df.astype({"TotalCharges": "float64"})

In [297]:
df.iloc[:, 2:].select_dtypes(include=["object"])

Unnamed: 0,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes
3,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),No
4,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,No
7039,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),No
7040,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
7041,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,Yes


In [298]:
# Yes No Columns
binary_columns = ["Partner", "Dependents", "PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "Churn"]

In [299]:
df[binary_columns].memory_usage(deep=True)

Index                  132
Partner             418939
Dependents          417647
PhoneService        421898
OnlineSecurity      443498
OnlineBackup        443908
DeviceProtection    443901
TechSupport         443523
StreamingTV         444186
StreamingMovies     444211
PaperlessBilling    419708
Churn               417406
dtype: int64

In [300]:
df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})

  df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})


In [301]:
df[binary_columns] = df[binary_columns].apply(lambda x: x.astype(bool))

In [302]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,True,False,1,False,No phone service,DSL,False,True,False,False,False,False,Month-to-month,True,Electronic check,29.85,29.85,False
1,Male,0,False,False,34,True,No,DSL,True,False,True,False,False,False,One year,False,Mailed check,56.95,1889.50,False
2,Male,0,False,False,2,True,No,DSL,True,True,False,False,False,False,Month-to-month,True,Mailed check,53.85,108.15,True
3,Male,0,False,False,45,False,No phone service,DSL,True,False,True,True,False,False,One year,False,Bank transfer (automatic),42.30,1840.75,False
4,Female,0,False,False,2,True,No,Fiber optic,False,False,False,False,False,False,Month-to-month,True,Electronic check,70.70,151.65,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,True,True,24,True,Yes,DSL,True,False,True,True,True,True,One year,True,Mailed check,84.80,1990.50,False
7039,Female,0,True,True,72,True,Yes,Fiber optic,False,True,True,False,True,True,One year,True,Credit card (automatic),103.20,7362.90,False
7040,Female,0,True,True,11,False,No phone service,DSL,True,False,False,False,False,False,Month-to-month,True,Electronic check,29.60,346.45,False
7041,Male,1,True,False,4,True,Yes,Fiber optic,False,False,False,False,False,False,Month-to-month,True,Mailed check,74.40,306.60,True


In [303]:
df["SeniorCitizen"] = df["SeniorCitizen"].astype("bool")

In [304]:
df[binary_columns].memory_usage(deep=True)
# reduction in memory is achieved.

Index                132
Partner             7043
Dependents          7043
PhoneService        7043
OnlineSecurity      7043
OnlineBackup        7043
DeviceProtection    7043
TechSupport         7043
StreamingTV         7043
StreamingMovies     7043
PaperlessBilling    7043
Churn               7043
dtype: int64

In [305]:
df["gender"].memory_usage(deep=True)

436731

In [306]:
df["gender"] = df["gender"].map({"Male": 0, "Female": 1}).astype("bool")
# True means Female and False means Male

In [307]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [308]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   bool   
 1   SeniorCitizen     7043 non-null   bool   
 2   Partner           7043 non-null   bool   
 3   Dependents        7043 non-null   bool   
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   bool   
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   bool   
 9   OnlineBackup      7043 non-null   bool   
 10  DeviceProtection  7043 non-null   bool   
 11  TechSupport       7043 non-null   bool   
 12  StreamingTV       7043 non-null   bool   
 13  StreamingMovies   7043 non-null   bool   
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   bool   
 16  PaymentMethod     7043 non-null   object 


In [309]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [310]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
bool_features = X.select_dtypes(include=["bool"]).columns.tolist()
object_features = X.select_dtypes(include=["object"]).columns.tolist()

In [311]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", StandardScaler(), numeric_features),
        ("object", OneHotEncoder(handle_unknown="ignore"), object_features),
        ("bool", "passthrough", bool_features)
    ]
)

In [333]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

In [359]:
param_grid = [
    {
        "classifier": [LogisticRegression(max_iter=1000)],
        "classifier__C": [1],
        "classifier__solver": ["liblinear", "lbfgs"]
    },
    {
        "classifier": [RandomForestClassifier()],
        "classifier__n_estimators": [100, 200, 250],
        "classifier__max_depth": [5, 10]
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": [200, 250, 300],
        "classifier__learning_rate": [0.01, 0.1],
        "classifier__max_depth": [3, 5]
    },
    {
        "classifier": [SVC()],
        "classifier__C": [0.1, 1, 10],
        "classifier__kernel": ["linear", "rbf"]
    }
]

In [360]:
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring="f1")
grid.fit(X_train, y_train)

In [362]:
y_pred = grid.best_estimator_.predict(X_test)

In [371]:
print(f"Best Estimator based on F1 Score")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Best Estimator based on F1 Score
Accuracy: 0.7898722195929957
              precision    recall  f1-score   support

       False       0.84      0.88      0.86      1569
        True       0.61      0.52      0.56       544

    accuracy                           0.79      2113
   macro avg       0.72      0.70      0.71      2113
weighted avg       0.78      0.79      0.78      2113

[[1386  183]
 [ 261  283]]


In [382]:
results = pd.DataFrame(grid.cv_results_)
results.sort_values("mean_test_score", ascending=False)[["param_classifier", "mean_test_score"]].reset_index(drop=True)

Unnamed: 0,param_classifier,mean_test_score
0,LogisticRegression(max_iter=1000),0.61718
1,LogisticRegression(max_iter=1000),0.616383
2,GradientBoostingClassifier(),0.59871
3,SVC(),0.597383
4,SVC(),0.596646
5,GradientBoostingClassifier(),0.595577
6,GradientBoostingClassifier(),0.595424
7,SVC(),0.594868
8,SVC(),0.593588
9,RandomForestClassifier(),0.585446
