In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from IPython.display import HTML
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier

In [2]:
filepath = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(filepath)


In [3]:
categorical = ['gender', 'SeniorCitizen', 'Partner',
               'Dependents', 'PhoneService', 
               'MultipleLines', 'InternetService','OnlineSecurity', 
               'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [18]:
def preprocess(df): 
    # preprocessing 
    df = df
    # df["TotalCharges"] = df["TotalCharges"].astype(float)
    df["TotalCharges"] = df["TotalCharges"].replace(' ', np.nan)
    df["TotalCharges"] = df["TotalCharges"].replace('', np.nan)
    df["TotalCharges"] = df["TotalCharges"].astype(float)
    df["TotalCharges"] = df["TotalCharges"].fillna(0)
    # function to map the churn column

    

    # feature engineering 

    # scaling
    scaler = StandardScaler()
    df[numerical] = scaler.fit_transform(df[numerical])
    # one hot encoding 
    encoder = OneHotEncoder(sparse_output=False)
    encoder.fit(df[categorical])
    enc_cols = encoder.get_feature_names_out().tolist()
    df[enc_cols] = encoder.transform(df[categorical])


    return df, enc_cols

In [5]:
# function to display scrollable df 
def display_scrollable_df(df):
    html = df.to_html(classes='table table-striped')
    return HTML(f"""
    <div style="width:100%; overflow:auto; position:relative;">
        {html}
    </div>
    """)

In [19]:
df, enc_cols = preprocess(df)

In [20]:
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [13]:
train, test = train_test_split(df, train_size=0.8, random_state=1)

In [33]:
features = numerical + enc_cols
target = "Churn"

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

In [31]:
sk_rf = RandomForestClassifier(random_state=1)
lgb_cl = LGBMClassifier(random_state=1)
xgb_cl = XGBClassifier(random_state=1)
xgb_rfcl = XGBRFClassifier(random_state=1)

In [34]:
# fitting all the model with the training set 
sk_rf.fit(X_train, y_train)
lgb_cl.fit(X_train, y_train)
xgb_cl.fit(X_train, y_train)
xgb_rfcl.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [35]:
from sklearn.metrics import accuracy_score

In [36]:
accuracy_score(y_test, sk_rf.predict(X_test))

0.7906316536550745

In [37]:
accuracy_score(y_test, xgb_cl.predict(X_test))

0.7934705464868701

In [38]:
accuracy_score(y_test, lgb_cl.predict(X_test))

0.8133427963094393

In [39]:
from sklearn.ensemble import ExtraTreesClassifier

In [41]:
extra = ExtraTreesClassifier(random_state=1)
extra.fit(X_train, y_train)

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [43]:

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features
}



In [47]:
random = RandomizedSearchCV(
    estimator=extra, param_distributions=hyperparameter_grid,
    cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1
)

In [48]:
random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Balogun Oladimeji\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Balogun Oladimeji\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Balogun Oladimeji\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Balogun Oladim

In [54]:
random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}