## Loading the dataset 

In [33]:
import pandas as pd
import numpy as np

In [34]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [35]:
df.shape

(7043, 21)

In [36]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [38]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Preprocessing

In [39]:
df.replace(r'^\s*$', pd.NA, regex=True, inplace=True)

In [40]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [41]:
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [42]:
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [44]:
df["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [45]:
df["Churn"] = df["Churn"].replace({"No":0, "Yes": 1})

In [46]:
df["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [47]:
X = df.drop(columns = ["Churn", "customerID"])
y = df[["Churn"]]

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [49]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
               'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

## Feature Engineering

In [50]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train_numerical = scaler.fit_transform(X_train[numerical])
scaled_test_numerical = scaler.transform(X_test[numerical])

scaled_train_numerical_df = pd.DataFrame(scaled_train_numerical, columns=numerical, index=X_train.index)
scaled_test_numerical_df = pd.DataFrame(scaled_test_numerical, columns=numerical, index=X_test.index)

In [51]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output = False).set_output(transform = "pandas")
encoded_train_categorical = ohe.fit_transform(X_train[categorical])
encoded_test_categorical = ohe.transform(X_test[categorical])

In [52]:
X_train_non_numerical_categorical = X_train.drop(columns=numerical + categorical)
X_test_non_numerical_categorical = X_test.drop(columns=numerical + categorical)

# Combine the scaled numerical and one-hot encoded categorical features with the rest of the DataFrames
X_train_processed = pd.concat([X_train_non_numerical_categorical, scaled_train_numerical_df, encoded_train_categorical], axis=1)
X_test_processed = pd.concat([X_test_non_numerical_categorical, scaled_test_numerical_df, encoded_test_categorical], axis=1)

## Model Training

In [55]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [56]:
models={
    "Random Forest": RandomForestClassifier(random_state=1),
    "Extra Trees": ExtraTreesClassifier(random_state=1),
    "XGBoost": XGBClassifier(random_state=1),
    "LightGBM": LGBMClassifier(random_state=1)
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_processed, y_train.values.ravel())  # Train Model
    # Make predictions
    y_pred = model.predict(X_test_processed)


  # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_pred)  

    print(list(models.keys())[i])

    print('Model performance for Test set (Hypertension)')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy) )
   

    print('='*35)
    print('\n')

Random Forest
Model performance for Test set (Hypertension)
- Accuracy: 0.7913


Extra Trees
Model performance for Test set (Hypertension)
- Accuracy: 0.7672


XGBoost
Model performance for Test set (Hypertension)
- Accuracy: 0.7935


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002386 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM
Model performance for Test set (Hypertension)
- Accuracy: 0.8034




In [57]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

etc = ExtraTreesClassifier(random_state=1)

cv = RandomizedSearchCV(estimator = etc,
                               param_distributions = hyperparameter_grid,
                               cv = 5, n_iter = 10, scoring = 'accuracy',
                               n_jobs = -1, verbose = 1, 
                               random_state = 1)
cv.fit(X_train_processed, y_train.values.ravel())

best_params = cv.best_params_
print(f"Best parameters found: {best_params}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\akins\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\akins\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\akins\anaconda3\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\akins\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Best parameters found: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [61]:
new_etc_model = ExtraTreesClassifier(n_estimators = best_params['n_estimators'],
                                 min_samples_split = best_params['min_samples_split'],
                                 min_samples_leaf = best_params['min_samples_leaf'],
                                 max_features = best_params['max_features'],
                                 random_state = 1)

new_etc_model.fit(X_train_processed, y_train.values.ravel())
new_etc_model_accuracy = new_etc_model.score(X_test_processed, y_test)

old_etc_model = ExtraTreesClassifier(random_state=1)
old_etc_model.fit(X_train_processed, y_train.values.ravel())
old_etc_model_accuracy = old_etc_model.score(X_test_processed, y_test)

print(f"Old Extra Trees model accuracy: {old_etc_model_accuracy}")
print(f"New Extra Trees model accuracy: {new_etc_model_accuracy}")

Old Extra Trees model accuracy: 0.7672107877927609
New Extra Trees model accuracy: 0.8041163946061036


In [63]:
best_model = cv.best_estimator_

best_model.fit(X_train_processed, y_train.values.ravel())

feature_importances = best_model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train_processed.columns,
    'Importance': feature_importances
})

importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df.iloc[:2])

                    Feature  Importance
37  Contract_Month-to-month    0.152237
0                    tenure    0.092800
