In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("data/main_dataset_cleaned.csv")

In [3]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7006,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7007,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7008,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [4]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [6]:
ct = ColumnTransformer([("onehot", OneHotEncoder(), [0, 2, 3]+[i for i in range(5, 17)])], 
                       remainder="passthrough")

le = LabelEncoder()

In [7]:
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [8]:
scaler = StandardScaler()

In [9]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
xgb = XGBClassifier(random_state=42)

In [11]:
xgb.fit(X_train, y_train)

In [12]:
print(f"{classification_report(y_test, xgb.predict(X_test))}\n\n\n{classification_report(y_train, xgb.predict(X_train))}")

              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1081
           1       0.56      0.54      0.55       321

    accuracy                           0.80      1402
   macro avg       0.72      0.71      0.71      1402
weighted avg       0.80      0.80      0.80      1402



              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4072
           1       0.91      0.86      0.89      1536

    accuracy                           0.94      5608
   macro avg       0.93      0.92      0.92      5608
weighted avg       0.94      0.94      0.94      5608



In [66]:
params = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

In [67]:
rcv = RandomizedSearchCV(XGBClassifier(random_state=42), 
                         param_distributions=params, 
                         scoring="precision", 
                         cv=5, n_jobs=-1, 
                         refit=True, 
                         random_state=42)

In [68]:
rcv.fit(X_train, y_train)

In [69]:
rcv.best_params_

{'subsample': 0.6,
 'reg_lambda': 1,
 'reg_alpha': 0,
 'n_estimators': 100,
 'min_child_weight': 1,
 'max_depth': 5,
 'learning_rate': 0.01,
 'gamma': 0.1,
 'colsample_bytree': 0.8}

In [70]:
xgb_classifier = XGBClassifier(objective='reg:squarederror', 
                               random_state=42, 
                               subsample=.6, 
                               reg_lambda=1, 
                               reg_alpha=0, 
                               n_estimators=100, 
                               min_child_weight=1, 
                               max_depth=5, 
                               learning_rate=0.01, 
                               gamma=0.1, 
                               colsample_bytree=0.8)

In [71]:
model = Pipeline([
    ("scaler", scaler),
    ("classifier", xgb_classifier)
])

In [72]:
model.fit(X_train, y_train)

In [73]:
print(f"{classification_report(y_test, model.predict(X_test))}\n\n\n{classification_report(y_train, model.predict(X_train))}")

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1081
           1       0.75      0.29      0.41       321

    accuracy                           0.81      1402
   macro avg       0.78      0.63      0.65      1402
weighted avg       0.80      0.81      0.78      1402



              precision    recall  f1-score   support

           0       0.79      0.98      0.87      4072
           1       0.82      0.29      0.43      1536

    accuracy                           0.79      5608
   macro avg       0.80      0.63      0.65      5608
weighted avg       0.80      0.79      0.75      5608



In [74]:
import pickle as pkl

pkl.dump(model, open("models/model.pkl", "wb"))