In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from IPython.display import display
pd.set_option('display.max_columns', None)

In [2]:
# Read the telco churn dataset from the working directory and preview the
# first ten rows to sanity-check the columns.
csv_path = 'data_telco_customer_churn.csv'
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,Dependents,tenure,OnlineSecurity,OnlineBackup,InternetService,DeviceProtection,TechSupport,Contract,PaperlessBilling,MonthlyCharges,Churn
0,Yes,9,No,No,DSL,Yes,Yes,Month-to-month,Yes,72.9,Yes
1,No,14,No,Yes,Fiber optic,Yes,No,Month-to-month,Yes,82.65,No
2,No,64,Yes,No,DSL,Yes,Yes,Two year,No,47.85,Yes
3,No,72,Yes,Yes,DSL,Yes,Yes,Two year,No,69.65,No
4,No,3,No internet service,No internet service,No,No internet service,No internet service,Month-to-month,Yes,23.6,No
5,Yes,40,No,No,DSL,Yes,No,Month-to-month,Yes,74.55,No
6,No,17,No internet service,No internet service,No,No internet service,No internet service,Month-to-month,No,19.7,No
7,No,11,No,No,DSL,No,No,Month-to-month,Yes,44.05,Yes
8,No,8,No,No,Fiber optic,No,No,Month-to-month,Yes,73.5,No
9,Yes,47,Yes,No,DSL,Yes,Yes,One year,No,40.3,No


In [3]:
# Drop rows that are exact duplicates (the first occurrence is kept), then
# print the resulting (rows, columns) shape.
df = df.drop_duplicates(keep='first')
row_count, column_count = df.shape
print((row_count, column_count))

(4853, 11)


In [4]:
# Trim leading/trailing whitespace from the values of every object-dtype
# column; numeric columns are left untouched.
text_columns = df.select_dtypes(include='object').columns
for column_name in text_columns:
    df[column_name] = df[column_name].str.strip()

In [5]:
# Separate the predictors (X) from the target (y), encoding Churn as
# Yes -> 1 and No -> 0.
churn_encoding = {'Yes': 1, 'No': 0}
X = df.drop(columns='Churn')
y = df['Churn'].map(churn_encoding)

# Hold out 20% of the rows as the test set, with a fixed seed so the split is
# reproducible.
# NOTE(review): the split is not stratified on y; with an imbalanced target,
# consider passing stratify=y (this would change every metric reported below).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the class balance of the held-out set.
print(y_test.value_counts())

Churn
0    706
1    265
Name: count, dtype: int64


In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Continuous columns.
numeric_features = [
    'tenure',
    'MonthlyCharges',
]

# All categorical columns, both the binary ones and those with more than two
# levels.
categorical_features = [
    'Dependents',
    'OnlineSecurity',
    'OnlineBackup',
    'InternetService',
    'DeviceProtection',
    'TechSupport',
    'Contract',
    'PaperlessBilling',
]

In [7]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, dropping the first level of each categorical column.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, so the test split does not influence the
# learned scaling statistics or category levels.
preprocessor.fit(X_train)

# Apply the fitted transformer to both splits.
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# Import the model class used below
from sklearn.linear_model import LogisticRegression

In [9]:
# Logistic-regression estimator with a fixed random seed.
# NOTE(review): this instance is not referenced by any later cell visible
# here (a separate model_logreg_ada is created for training) — confirm
# whether it is still needed.
logreg = LogisticRegression(random_state=42)

In [10]:
from imblearn.over_sampling import ADASYN
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Visual divider printed between the sections of this cell's output.
section_break = "\n" + "="*60 + "\n"

# --- ADASYN oversampling -----------------------------------------------------
# Resampling is applied to the preprocessed TRAINING data only; the test set
# keeps its original class distribution.
print("Menerapkan ADASYN pada data latih")

ada = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = ada.fit_resample(
    X_train_preprocessed,
    y_train,
)

print("Resampling ADASYN selesai.")
class_counts = pd.Series(y_train_resampled).value_counts().to_dict()
print("Jumlah data baru:", class_counts)
print(section_break)

# --- Logistic Regression + ADASYN --------------------------------------------
print("--- Melatih Model: Logistic Regression + ADASYN ---")

# Train the classifier on the resampled training data.
model_logreg_ada = LogisticRegression(random_state=42)
model_logreg_ada.fit(X_train_resampled, y_train_resampled)
print("Model LogReg selesai dilatih.")

# Predict on the preprocessed (not resampled) test set.
y_pred_logreg_ada = model_logreg_ada.predict(X_test_preprocessed)

# Report accuracy, per-class metrics and the confusion matrix.
accuracy_pct = accuracy_score(y_test, y_pred_logreg_ada) * 100
print("\n--- Hasil Evaluasi: Logistic Regression + ADASYN ---")
print(f"Akurasi: {accuracy_pct:.2f}%")
print("\nClassification Report (LogReg):")
print(classification_report(y_test, y_pred_logreg_ada))
print("\nConfusion Matrix (LogReg):")
print(confusion_matrix(y_test, y_pred_logreg_ada))
print(section_break)

Menerapkan ADASYN pada data latih
Resampling ADASYN selesai.
Jumlah data baru: {1: 2894, 0: 2859}


--- Melatih Model: Logistic Regression + ADASYN ---
Model LogReg selesai dilatih.

--- Hasil Evaluasi: Logistic Regression + ADASYN ---
Akurasi: 73.43%

Classification Report (LogReg):
              precision    recall  f1-score   support

           0       0.90      0.71      0.80       706
           1       0.51      0.80      0.62       265

    accuracy                           0.73       971
   macro avg       0.71      0.75      0.71       971
weighted avg       0.80      0.73      0.75       971


Confusion Matrix (LogReg):
[[501 205]
 [ 53 212]]




In [12]:
# Alias the ADASYN-trained logistic regression as the model to carry forward.
best_model = model_logreg_ada

In [None]:
from sklearn.pipeline import Pipeline
import joblib

# Bundle the already-fitted preprocessor and classifier into a single Pipeline
# so that raw feature frames can be passed straight to predict() after loading.
# The Pipeline is deliberately NOT re-fitted here: the classifier was trained
# on ADASYN-resampled data, which calling fit() on this Pipeline would not
# reproduce.
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),  # ColumnTransformer fitted on X_train
    ('classifier', best_model),      # logistic regression trained on the resampled data
])

# Persist the whole pipeline with joblib.
filename = 'churn_prediction_model.joblib'
joblib.dump(final_pipeline, filename)

# Success message (stray trailing apostrophe removed from the original text).
print("Pipeline joblib berhasil disimpan!")

Pipeline joblib berhasil disimpan!'


In [14]:
import pickle

# Persist the same pipeline with the standard-library pickle module.
# NOTE: pickle files can execute arbitrary code when loaded — only unpickle
# this artifact from a trusted source.
filename = 'churn_prediction_model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(final_pipeline, f)

# Report success only after the with-block has closed the file (stray trailing
# apostrophe removed from the original message).
print("Pipeline pkl berhasil disimpan!")


Pipeline pkl berhasil disimpan!'
