In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('data\Customer-Churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [8]:
data['TotalCharges'].fillna(data['tenure'] * data['MonthlyCharges'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['tenure'] * data['MonthlyCharges'], inplace=True)


In [10]:
data['SeniorCitizen'] = data['SeniorCitizen'].astype(object)

In [11]:
data['MultipleLines'] = data['MultipleLines'].replace('No phone service', 'No')

In [13]:
columns_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

In [15]:
for column in columns_to_replace:
    data[column] = data[column].replace('No phone service', 'No')

In [21]:
data['Churn'] = data['Churn'].replace({'No': 0, 'Yes': 1})

  data['Churn'] = data['Churn'].replace({'No': 0, 'Yes': 1})


In [24]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

In [25]:
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(strat_split.split(data, data['Churn']))

In [27]:
strat_train_set = data.loc[train_index]
strat_test_set = data.loc[test_index]

In [28]:
X_train = strat_train_set.drop('Churn', axis=1)
Y_train = strat_train_set['Churn'].copy()

In [29]:
x_test = strat_test_set.drop('Churn', axis=1)
y_test = strat_test_set['Churn'].copy()

In [30]:
categorical_columns = data.select_dtypes(include=['object']).columns.to_list()

In [32]:
from sklearn.metrics import (f1_score, accuracy_score, classification_report, recall_score, confusion_matrix, precision_score, roc_auc_score, roc_curve, auc)

from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import OrdinalEncoder

In [69]:
cat_model = CatBoostClassifier(random_state=0, scale_pos_weight=None, verbose=False)
cat_model.fit(X_train, Y_train, cat_features=categorical_columns, eval_set=(x_test, y_test))

<catboost.core.CatBoostClassifier at 0x2690bb0c130>

In [70]:
y_pred = cat_model.predict(x_test)

In [71]:
accuracy, recall, roc_auc, precision = [round(metric(y_test, y_pred), 4) for metric in [accuracy_score, recall_score, roc_auc_score, precision_score]] 

In [72]:
model_name = ['Catboost Model']
result = pd.DataFrame({'Accuracy': accuracy, 'Recall Score': recall, 'Roc_Auc': roc_auc, 'Precision_Score': precision}, index=model_name)

In [76]:
print(result)

                Accuracy  Recall Score  Roc_Auc  Precision_Score
Catboost Model    0.8034         0.508   0.7091           0.6714


In [77]:
import os
model_directory = 'model'
model_path = os.path.join(model_directory, "catboost_model.cbm")

In [78]:
cat_model.save_model(model_path)