### 1. Load Datasets and Artifacts

In [8]:
import pandas as pd
import joblib

df = pd.read_csv("Datasets/preprocessed_train.csv")
imp_feats = joblib.load("Artifacts/important_features.pkl")
important_features = imp_feats["Important Features"]

df

Unnamed: 0.1,Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,0,2.0,30.0,0,39.0,14.0,5.0,18.0,2,12,932.00,17.0,1.0
1,1,3.0,65.0,0,49.0,1.0,10.0,8.0,0,1,557.00,6.0,1.0
2,2,4.0,55.0,0,14.0,4.0,6.0,18.0,0,3,185.00,3.0,1.0
3,3,5.0,58.0,1,38.0,21.0,7.0,7.0,2,1,396.00,29.0,1.0
4,4,6.0,23.0,1,32.0,20.0,5.0,8.0,0,1,617.00,20.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
440827,440828,449995.0,42.0,1,54.0,15.0,1.0,3.0,1,12,716.38,8.0,0.0
440828,440829,449996.0,25.0,0,8.0,13.0,1.0,20.0,1,12,745.38,2.0,0.0
440829,440830,449997.0,26.0,1,35.0,27.0,1.0,5.0,2,3,977.31,9.0,0.0
440830,440831,449998.0,28.0,1,55.0,14.0,2.0,0.0,2,3,602.55,2.0,0.0


### 2. Train-Val Split Strategy

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

target_column = "Churn"

df_X = df[important_features]
df_y = df[target_column]

scaler = StandardScaler()

df_X.loc[:, important_features] = (
    df_X[important_features].astype("float64")
)

df_X.loc[:, important_features] = scaler.fit_transform(df_X[important_features])

X_train, X_val, y_train, y_val = train_test_split(df_X, df_y, test_size = 0.2, stratify=df_y, random_state = 42)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_X.loc[:, important_features] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_X.loc[:, important_features] = scaler.fit_transform(df_X[important_features])


Use 80% of train dataset for training and 20% the rest for internal validation

In [10]:
# save scaler artifact
joblib.dump(scaler, "Artifacts/standard_scaler.pkl")

['Artifacts/standard_scaler.pkl']

### 3. Model Development

### A. Gaussian Naive-Baiyes Classifier

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

GNB_model = GaussianNB()


param_grid_GNB = {
    'var_smoothing': [1e-12, 1e-10, 1e-9, 1e-8, 1e-7]
}


grid_GNB = GridSearchCV(
    estimator=GNB_model,
    param_grid=param_grid_GNB,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# fit
grid_GNB.fit(X_train, y_train)

print("Best params:", grid_GNB.best_params_)
print("Best CV score:", grid_GNB.best_score_)

Best params: {'var_smoothing': 1e-12}
Best CV score: 0.9098772985328804


In [12]:
# save the best model
GNB_model = grid_GNB.best_estimator_
joblib.dump(GNB_model, "Saved Models/GNB.pkl")

['Saved Models/GNB.pkl']

### Predict with the best model

In [14]:
from sklearn import metrics

y_pred = GNB_model.predict(X_val)

accuracy = metrics.accuracy_score(y_val, y_pred)
recall = metrics.recall_score(y_val, y_pred)
precision = metrics.precision_score(y_val, y_pred)
F1_score = metrics.f1_score(y_val, y_pred)
print("Validation accuracy: ", accuracy)
print("Validation recall: ", recall)
print("Validation precision: ", precision)
print("Validation F1-score: ", F1_score)

Validation accuracy:  0.9008926242244831
Validation recall:  0.87742
Validation precision:  0.943868330464716
Validation F1-score:  0.9094320066334992


### B. AdaBoost Classifier

In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

ADA_model = AdaBoostClassifier(random_state=42)

param_grid_ADA = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

grid_ADA = GridSearchCV(
    estimator=ADA_model,
    param_grid=param_grid_ADA,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

grid_ADA.fit(X_train, y_train)

print("Best params:", grid_ADA.best_params_)
print("Best CV score:", grid_ADA.best_score_)

Best params: {'learning_rate': 0.1, 'n_estimators': 200}
Best CV score: 0.9562048387817214


In [16]:
# save the best model
ADA_model = grid_ADA.best_estimator_
joblib.dump(ADA_model, "Saved Models/ADA.pkl")

['Saved Models/ADA.pkl']

### Predict with the best model

In [17]:
from sklearn import metrics

y_pred = ADA_model.predict(X_val)

accuracy = metrics.accuracy_score(y_val, y_pred)
recall = metrics.recall_score(y_val, y_pred)
precision = metrics.precision_score(y_val, y_pred)
F1_score = metrics.f1_score(y_val, y_pred)
print("Validation accuracy: ", accuracy)
print("Validation recall: ", recall)
print("Validation precision: ", precision)
print("Validation F1-score: ", F1_score)

Validation accuracy:  0.9521476289314597
Validation recall:  0.92062
Validation precision:  0.9945982152503187
Validation F1-score:  0.956180347108983


### C. Catboost Classifier

In [18]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

CAT_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    loss_function='Logloss'
)

param_grid_CAT = {
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'iterations': [200, 500]
}

grid_CAT = GridSearchCV(
    estimator=CAT_model,
    param_grid=param_grid_CAT,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

grid_CAT.fit(X_train, y_train)

print("Best params:", grid_CAT.best_params_)
print("Best CV score:", grid_CAT.best_score_)


Best params: {'depth': 4, 'iterations': 200, 'learning_rate': 0.1}
Best CV score: 0.9727045814763637


In [19]:
# save the best model
CAT_model = grid_CAT.best_estimator_
joblib.dump(CAT_model, "Saved Models/CAT.pkl")

['Saved Models/CAT.pkl']

### Predict with the best model

In [20]:
from sklearn import metrics

y_pred = CAT_model.predict(X_val)

accuracy = metrics.accuracy_score(y_val, y_pred)
recall = metrics.recall_score(y_val, y_pred)
precision = metrics.precision_score(y_val, y_pred)
F1_score = metrics.f1_score(y_val, y_pred)
print("Validation accuracy: ", accuracy)
print("Validation recall: ", recall)
print("Validation precision: ", precision)
print("Validation F1-score: ", F1_score)

Validation accuracy:  0.9698072975149432
Validation recall:  0.94676
Validation precision:  1.0
Validation F1-score:  0.9726519961371715


### D. Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

DT_model = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced'  # bagus buat churn (imbalanced)
)

param_grid_DT = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_DT = GridSearchCV(
    estimator=DT_model,
    param_grid=param_grid_DT,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

grid_DT.fit(X_train, y_train)

print("Best params:", grid_DT.best_params_)
print("Best CV score:", grid_DT.best_score_)

Best params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best CV score: 0.9728502547354188


In [22]:
# save the best model
DT_model = grid_DT.best_estimator_
joblib.dump(DT_model, "Saved Models/DT.pkl")

['Saved Models/DT.pkl']

### Predict with the best model

In [23]:
from sklearn import metrics

y_pred = DT_model.predict(X_val)

accuracy = metrics.accuracy_score(y_val, y_pred)
recall = metrics.recall_score(y_val, y_pred)
precision = metrics.precision_score(y_val, y_pred)
F1_score = metrics.f1_score(y_val, y_pred)
print("Validation accuracy: ", accuracy)
print("Validation recall: ", recall)
print("Validation precision: ", precision)
print("Validation F1-score: ", F1_score)

Validation accuracy:  0.969977429196865
Validation recall:  0.94726
Validation precision:  0.9997889092943237
Validation F1-score:  0.9728158729832705
