# Importing libraries and Loading Data

In [23]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV



Customer = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
Customer.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [24]:
Customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Preprocessing

In [25]:
# Convert 'TotalCharges' to numeric, coerce errors, and fill missing values with 0
Customer['TotalCharges'] = pd.to_numeric(Customer['TotalCharges'], errors='coerce').fillna(0)

# Convert 'Churn' column to binary values
Customer['Churn'] = Customer['Churn'].map({'No': 0, 'Yes': 1})

# Split the data into an 80-20 train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)

# Select features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
               'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
               'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
               'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Feature engineering

In [26]:
# Scale numerical features
scaler = StandardScaler()
train_numerical_scaled = pd.DataFrame(scaler.fit_transform(train_data[numerical]), columns=numerical)
test_numerical_scaled = pd.DataFrame(scaler.transform(test_data[numerical]), columns=numerical)


In [27]:
# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first')
train_categorical_encoded = pd.DataFrame(encoder.fit_transform(train_data[categorical]), columns=encoder.get_feature_names_out(categorical))
test_categorical_encoded = pd.DataFrame(encoder.transform(test_data[categorical]), columns=encoder.get_feature_names_out(categorical))


In [28]:
# Combine scaled numerical and one-hot encoded categorical features
X_train = pd.concat([train_numerical_scaled, train_categorical_encoded], axis=1)
X_test = pd.concat([test_numerical_scaled, test_categorical_encoded], axis=1)
y_train = train_data['Churn']
y_test = test_data['Churn']


# Model training and evaluation

In [29]:
# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classifier:")
print(f"Accuracy: {accuracy_score(y_test, rf_predictions)}")
print(classification_report(y_test, rf_predictions))


Random Forest Classifier:
Accuracy: 0.8005677785663591
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.61      0.55      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409



In [30]:
# Extra Trees Classifier
et_model = ExtraTreesClassifier(random_state=1)
et_model.fit(X_train, y_train)
et_predictions = et_model.predict(X_test)
print("Extra Trees Classifier:")
print(f"Accuracy: {accuracy_score(y_test, et_predictions)}")
print(classification_report(y_test, et_predictions))

Extra Trees Classifier:
Accuracy: 0.7828246983676366
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1061
           1       0.57      0.51      0.54       348

    accuracy                           0.78      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.78      0.78      0.78      1409



In [31]:
# XGBoost Classifier
xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
print("XGBoost Classifier:")
print(f"Accuracy: {accuracy_score(y_test, xgb_predictions)}")
print(classification_report(y_test, xgb_predictions))


XGBoost Classifier:
Accuracy: 0.7991483321504613
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.60      0.57      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409



In [32]:
# LightGBM Classifier
lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)
print("LightGBM Classifier:")
print(f"Accuracy: {accuracy_score(y_test, lgb_predictions)}")
print(classification_report(y_test, lgb_predictions))


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Classifier:
Accuracy: 0.8147622427253371
              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1061
           1       0.64      0.57      0.61       348

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.81      0.81      0.81      1409



In [33]:
# Define ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1)

# Define hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Define RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=etc, param_distributions=hyperparameter_grid,
                                   n_iter=10, scoring='accuracy', cv=5, n_jobs=-1,
                                   verbose=1, random_state=1)

# Perform RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PC\.conda\envs\PYTON\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PC\.conda\envs\PYTON\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\PC\.conda\envs\PYTON\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\PC\.conda\envs\PYTON\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    ra

Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [21]:
# Create ExtraTreesClassifier with the best hyperparameters
best_etc = ExtraTreesClassifier(random_state=1, **best_params)

# Train the model on the training data
best_etc.fit(X_train, y_train)

# Predict on the test data
best_etc_predictions = best_etc.predict(X_test)

# Calculate accuracy of the new model
best_etc_accuracy = accuracy_score(y_test, best_etc_predictions)

# Compare with the accuracy of the initial ExtraTreesClassifier model
initial_etc_accuracy = accuracy_score(y_test, et_predictions)

if best_etc_accuracy > initial_etc_accuracy:
    print("The accuracy of the new optimal model is higher than the initial ExtraTreesClassifier model with no hyperparameter tuning.")
elif best_etc_accuracy < initial_etc_accuracy:
    print("The accuracy of the new optimal model is lower than the initial ExtraTreesClassifier model with no hyperparameter tuning.")
else:
    print("The accuracy of the new optimal model is the same as the initial ExtraTreesClassifier model with no hyperparameter tuning.")


The accuracy of the new optimal model is higher than the initial ExtraTreesClassifier model with no hyperparameter tuning.


In [None]:
feature_importances = best_etc.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

top_two_features = feature_importance_df.iloc[:2]['Feature'].tolist()

print("The two most important features are:", top_two_features)