In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('../datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv', usecols=lambda col: col != "customerID")

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan).astype(float)

In [5]:
X = df.drop(columns="Churn", axis=1)
y = df["Churn"].map({'No': 0, 'Yes': 1})

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
categorical_features = X_train.select_dtypes(include=['object']).columns

In [8]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [9]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lgbm', LGBMClassifier())
])

In [10]:
LGBMClassifier().get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [17]:
param_grid = {
    'lgbm__boosting_type': ['gbdt', 'dart', 'rf', 'goss'],
    'lgbm__learning_rate': [0.1, 0.3],
    'lgbm__max_depth': [3, 5, 7],
    'lgbm__n_estimators': [100, 300, 1000]
}

In [18]:
grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=5, 
    n_jobs=-1, 
    verbose=1
    )

In [19]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


90 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Caio Lacerda\Desktop\APPS\Exercícios\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Caio Lacerda\Desktop\APPS\Exercícios\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Caio Lacerda\Desktop\APPS\Exercícios\venv\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params[

In [20]:
grid_search.best_params_

{'lgbm__boosting_type': 'dart',
 'lgbm__learning_rate': 0.1,
 'lgbm__max_depth': 7,
 'lgbm__n_estimators': 100}

In [21]:
y_pred = grid_search.predict(X_test)



In [22]:
print(accuracy_score(y_test, y_pred))

0.7927608232789212
