In [194]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

In [185]:
file = "Telco.csv"
df_telco = pd.read_csv(file)
df_telco.head()



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [186]:
yes_no_columns = ["Partner", "Dependents", "PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "Churn"]
for item in yes_no_columns:
    df_telco[item] = df_telco[item].map({'Yes': 1, 'No': 0}).astype("Int64")

#Assume that if the total charges are null, replace with zeros
df_telco["TotalCharges"] = pd.to_numeric(df_telco["TotalCharges"], errors='coerce')
df_telco.fillna(0, inplace=True)
df_telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,1,...,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,1,...,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,1,...,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


In [187]:
y = df_telco["Churn"]
y = y.astype("int64")
X = df_telco.drop(["Churn", "customerID"], axis=1)
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85
1,Male,0,0,0,34,1,No,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5
2,Male,0,0,0,2,1,No,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15
3,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75
4,Female,0,0,0,2,1,No,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65


In [188]:
category_columns = ["gender", "MultipleLines", "InternetService", "Contract", "PaymentMethod"]
X = pd.get_dummies(X, columns=category_columns)
X.replace([np.inf, -np.inf], 0, inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 29 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7043 non-null   int64  
 1   Partner                                  7043 non-null   Int64  
 2   Dependents                               7043 non-null   Int64  
 3   tenure                                   7043 non-null   int64  
 4   PhoneService                             7043 non-null   Int64  
 5   OnlineSecurity                           7043 non-null   Int64  
 6   OnlineBackup                             7043 non-null   Int64  
 7   DeviceProtection                         7043 non-null   Int64  
 8   TechSupport                              7043 non-null   Int64  
 9   StreamingTV                              7043 non-null   Int64  
 10  StreamingMovies                          7043 no

In [189]:
# Hyperparameters
random_state = 42
n_estimators = 500
max_depth = 4
max_features = 3
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 42)
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [190]:
clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features, bootstrap = True, random_state = random_state).fit(x_train, y_train)

In [191]:
# Create our predictions
prediction = clf.predict(x_test)
# Create confusion matrix
confusion_matrix(y_test, prediction)

array([[1228,   54],
       [ 333,  146]], dtype=int64)

In [192]:
# Display accuracy score
accuracy_score(y_test, prediction)# Display F1 score
f1_score(y_test,prediction)

0.43004418262150224