In [33]:
# 1. Import Library
# ============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
import pandas as pd
import numpy as np

customer = pd.read_csv("./customer-data.csv", sep=";")
churn = pd.read_csv("./churn-status.csv" , sep=";")
transaksi = pd.read_csv("./transaction-data.csv" , sep=";")

df = customer.merge(churn, on="Customer_ID", how="inner")
df = df.merge(transaksi, on="Customer_ID", how="inner")
df

Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Target_Churn,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response
0,1,62,Other,45.15,True,5892.58,5,22,453.80,2,0,3,129,True,Responded
1,2,65,Male,79.51,False,9025.47,13,77,22.90,2,2,3,227,False,Responded
2,3,18,Male,29.19,True,618.83,13,71,50.53,5,2,2,283,False,Responded
3,4,21,Other,79.63,True,9110.30,3,33,411.83,5,3,5,226,True,Ignored
4,5,21,Other,77.66,False,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,54,Male,143.72,False,1089.09,2,29,77.75,0,3,2,88,True,Ignored
996,997,19,Male,164.19,True,3700.24,9,90,34.45,6,4,4,352,False,Responded
997,998,47,Female,113.31,False,705.85,17,69,187.37,7,3,1,172,True,Unsubscribed
998,999,23,Male,72.98,True,3891.60,7,31,483.80,1,2,5,55,False,Responded


In [None]:
df.head()

Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Target_Churn,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response
0,1,62,Other,45.15,True,5892.58,5,22,453.8,2,0,3,129,True,Responded
1,2,65,Male,79.51,False,9025.47,13,77,22.9,2,2,3,227,False,Responded
2,3,18,Male,29.19,True,618.83,13,71,50.53,5,2,2,283,False,Responded
3,4,21,Other,79.63,True,9110.3,3,33,411.83,5,3,5,226,True,Ignored
4,5,21,Other,77.66,False,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed


In [None]:
df['Target_Churn']

0       True
1      False
2       True
3       True
4      False
       ...  
995    False
996     True
997    False
998     True
999     True
Name: Target_Churn, Length: 1000, dtype: bool

In [None]:
df["Target_Churn"]

In [None]:
# ============================================================
# 4. Data Understanding & Cleaning
# ============================================================
# Cek missing values
print(df.isnull().sum())

# Tangani missing values jika ada
df.fillna({
    "Satisfaction_Score": df["Satisfaction_Score"].median(),
    "Annual_Income": df["Annual_Income"].median()
}, inplace=True)

# Ubah target ke biner (0=Tidak churn, 1=Churn)
df["Target_Churn"] = df["Target_Churn"].map({"True": 1, "False": 0})
df

Customer_ID                   0
Age                           0
Gender                        0
Annual_Income                 0
Target_Churn                  0
Total_Spend                   0
Years_as_Customer             0
Num_of_Purchases              0
Average_Transaction_Amount    0
Num_of_Returns                0
Num_of_Support_Contacts       0
Satisfaction_Score            0
Last_Purchase_Days_Ago        0
Email_Opt_In                  0
Promotion_Response            0
dtype: int64


Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Target_Churn,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response
0,1,62,Other,45.15,,5892.58,5,22,453.80,2,0,3,129,True,Responded
1,2,65,Male,79.51,,9025.47,13,77,22.90,2,2,3,227,False,Responded
2,3,18,Male,29.19,,618.83,13,71,50.53,5,2,2,283,False,Responded
3,4,21,Other,79.63,,9110.30,3,33,411.83,5,3,5,226,True,Ignored
4,5,21,Other,77.66,,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,54,Male,143.72,,1089.09,2,29,77.75,0,3,2,88,True,Ignored
996,997,19,Male,164.19,,3700.24,9,90,34.45,6,4,4,352,False,Responded
997,998,47,Female,113.31,,705.85,17,69,187.37,7,3,1,172,True,Unsubscribed
998,999,23,Male,72.98,,3891.60,7,31,483.80,1,2,5,55,False,Responded


In [22]:
# ============================================================
# 5. Feature Selection
# ============================================================
X = df.drop(columns=["Customer_ID", "Target_Churn"])
y = df["Target_Churn"]

# Identifikasi kolom kategorikal dan numerikal
cat_features = ["Gender", "Email_Opt_In", "Promotion_Response"]
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


In [23]:
# ============================================================
# 6. Preprocessing Pipeline
# ============================================================
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

In [24]:
df

Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Target_Churn,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response
0,1,62,Other,45.15,,5892.58,5,22,453.80,2,0,3,129,True,Responded
1,2,65,Male,79.51,,9025.47,13,77,22.90,2,2,3,227,False,Responded
2,3,18,Male,29.19,,618.83,13,71,50.53,5,2,2,283,False,Responded
3,4,21,Other,79.63,,9110.30,3,33,411.83,5,3,5,226,True,Ignored
4,5,21,Other,77.66,,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,54,Male,143.72,,1089.09,2,29,77.75,0,3,2,88,True,Ignored
996,997,19,Male,164.19,,3700.24,9,90,34.45,6,4,4,352,False,Responded
997,998,47,Female,113.31,,705.85,17,69,187.37,7,3,1,172,True,Unsubscribed
998,999,23,Male,72.98,,3891.60,7,31,483.80,1,2,5,55,False,Responded


In [19]:
# ============================================================
# 7. Train-Test Split
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ValueError: Input y contains NaN.

In [None]:


# ============================================================
# 9. Evaluation
# ============================================================
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ============================================================
# 10. (Opsional) Random Forest Model
# ============================================================
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

print("Random Forest ROC-AUC:", roc_auc_score(y_test, y_pred_rf))

# ============================================================
# 11. Feature Importance (Jika pakai Random Forest)
# ============================================================
rf_model = model_rf.named_steps['classifier']
importances = rf_model.feature_importances_

# Gabungkan feature name hasil encoding
feature_names = (
    num_features +
    list(model_rf.named_steps['preprocessor']
         .transformers_[1][1]
         .named_steps['encoder']
         .get_feature_names_out(cat_features))
)

feature_importance = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x="Importance", y="Feature", data=feature_importance.head(10))
plt.title("Top 10 Important Features for Churn Prediction")
plt.show()
