<a href="https://colab.research.google.com/github/alvintnw/Advanced_ML_Analysis_for_Business_Insights/blob/main/Advanced_ML_Analysis_for_Business_Insights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Menginstal library tambahan jika diperlukan (misalnya XGBoost, SHAP untuk interpretasi model)
!pip install pandas numpy scikit-learn matplotlib seaborn xgboost shap
!pip install tensorflow # Atau !pip install torch torchvision torchaudio jika menggunakan PyTorch

# Mengimpor library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

import xgboost as xgb
import tensorflow as tf # Atau import torch as th jika menggunakan PyTorch

# Untuk interpretasi model
import shap

# Default look for every figure produced below: seaborn's white grid style
# and a 10x6 inch canvas.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Build the demonstration dataset: a synthetic binary-classification problem
# that stands in for customer-churn data (1000 rows, 10 features of which 5
# are informative, 10% label noise, fixed seed for reproducibility).
#
# The previous version first downloaded the UCI iris CSV into `df` and then
# immediately overwrote it with the synthetic frame, so the download was never
# used and only added a network dependency. To work with a real public dataset
# instead, replace the block below with a `pd.read_csv(...)` call and make
# sure the label column is named 'target_churn'.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=0,
                           n_clusters_per_class=1, flip_y=0.1, random_state=42)
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df['target_churn'] = y

# Quick sanity check of what was loaded: first rows plus dtypes/null counts.
print("Data berhasil dimuat. Beberapa baris pertama:")
print(df.head())
print("\nInformasi Data:")
df.info()

In [None]:
# Exploratory data analysis: summary statistics, class balance, feature
# correlations and per-feature distributions split by the churn label.

# Descriptive statistics
print("\nStatistik Deskriptif:")
print(df.describe())

# Distribution of the target variable
print("\nDistribusi Variabel Target (Churn):")
print(df['target_churn'].value_counts())
sns.countplot(x='target_churn', data=df)
plt.title('Distribusi Pelanggan Churn vs. Non-Churn')
plt.show()

# Correlation between features. The correlation is computed on the numeric
# columns only, i.e. on the same selection the guard checks; calling
# `df.corr()` on the whole frame raises on non-numeric columns in pandas >= 2.0.
numeric_df = df.select_dtypes(include=np.number)
if numeric_df.shape[1] > 1:
    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Matriks Korelasi Fitur')
    plt.show()

# Distribution of every feature, coloured by the churn label. The target is
# excluded by name so the loop does not depend on it being the last column.
for col in df.columns.drop('target_churn'):
    plt.figure(figsize=(8, 5))
    sns.histplot(df, x=col, hue='target_churn', kde=True)
    plt.title(f'Distribusi {col} berdasarkan Churn')
    plt.show()

In [None]:
# Feature matrix (everything except the label) and the churn label itself
y = df['target_churn']
X = df.drop(columns='target_churn')

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same churn ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# This demo treats every feature as numeric, so all columns are standardised.
# For data with categorical columns, select them separately, e.g.:
#   categorical_features = X.select_dtypes(include=['object', 'category']).columns
#   numerical_features = X.select_dtypes(include=np.number).columns
# and add a second transformer:
#   ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
numerical_features = X.columns
scaling_step = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

print("Data telah dipisah dan siap untuk pra-pemrosesan.")

In [None]:
print("\nMelatih Model XGBoost...")
# Pipeline that scales the features and then fits a gradient-boosted tree
# classifier. `use_label_encoder` is intentionally not passed: the option is
# deprecated and no longer recognised by current XGBoost releases, where it
# only triggers an "unused parameter" warning.
xgboost_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', xgb.XGBClassifier(eval_metric='logloss', random_state=42))])

xgboost_pipeline.fit(X_train, y_train)

# Hard class predictions and the predicted probability of the positive class
# (column 1 of predict_proba) on the held-out test set.
y_pred_xgb = xgboost_pipeline.predict(X_test)
y_prob_xgb = xgboost_pipeline.predict_proba(X_test)[:, 1]

print("\nEvaluasi Model XGBoost:")
print(classification_report(y_test, y_pred_xgb))
print("\nConfusion Matrix XGBoost:")
print(confusion_matrix(y_test, y_pred_xgb))

# ROC curve and the area under it, computed from the positive-class scores.
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb, tpr_xgb, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_xgb:.2f})')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - XGBoost')
plt.legend(loc="lower right")
plt.show()

In [None]:
print("\nMelatih Model Deep Learning (TensorFlow)...")

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation, dropout and batch shuffling are repeatable across
# Restart & Run All. Without this the reported metrics change on every run.
tf.keras.utils.set_random_seed(42)

# Scale the features outside of a sklearn Pipeline for the Keras model.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Small fully connected network: two hidden ReLU layers with dropout and a
# single sigmoid unit for the binary output. The input shape is declared with
# an explicit Input layer rather than the deprecated `input_shape=` argument
# on the first Dense layer.
model_nn = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')  # binary classification output
])

model_nn.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

# Train; 20% of the training data is held back by Keras for validation.
history = model_nn.fit(X_train_processed, y_train,
                       epochs=50,
                       batch_size=32,
                       validation_split=0.2,
                       verbose=0)  # verbose=0 keeps the cell output clean

# Evaluate on the test split. Probabilities above 0.5 are labelled as churn.
loss_nn, accuracy_nn = model_nn.evaluate(X_test_processed, y_test, verbose=0)
y_prob_nn = model_nn.predict(X_test_processed, verbose=0).flatten()
y_pred_nn = (y_prob_nn > 0.5).astype(int)

print(f"\nEvaluasi Model Neural Network - Akurasi: {accuracy_nn:.4f}, Loss: {loss_nn:.4f}")
print("\nClassification Report Neural Network:")
print(classification_report(y_test, y_pred_nn))

# Training history: accuracy (left panel) and loss (right panel) per epoch,
# for both the training and the validation portion.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Akurasi Model Neural Network selama Pelatihan')
plt.xlabel('Epoch')
plt.ylabel('Akurasi')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Model Neural Network selama Pelatihan')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
print("\nInterpretasi Model Menggunakan SHAP...")

# Transform the test set once, using the preprocessor step that lives inside
# the fitted XGBoost pipeline, so the explainer sees the same scaled inputs
# the classifier was trained on. The result is reused for every plot below
# instead of re-transforming X_test on each call.
fitted_preprocessor = xgboost_pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(X_test)
shap_feature_names = list(X.columns)

# TreeExplainer needs the bare classifier, not the whole pipeline.
explainer = shap.TreeExplainer(xgboost_pipeline.named_steps['classifier'])
shap_values = explainer.shap_values(X_test_transformed)

# Global summary of the SHAP values. `show=False` keeps the figure open so the
# title is applied to the summary plot itself; by default summary_plot renders
# immediately and a later plt.title() would land on a new, empty figure.
shap.summary_plot(shap_values, X_test_transformed, feature_names=shap_feature_names, show=False)
plt.title('Ringkasan Pentingnya Fitur Global (SHAP)')
plt.show()

# Per-observation explanations for the first rows of the test set (at most 5).
# The matplotlib backend is used because the default JavaScript force plot is
# returned as an object and is not rendered when the call sits inside a loop
# and its return value is discarded.
print("\nDetail Interpretasi untuk Beberapa Prediksi:")
for i in range(min(5, X_test_transformed.shape[0])):
    print(f"Interpretasi untuk Observasi {i+1} (Prediksi: {'Churn' if y_pred_xgb[i] == 1 else 'Non-Churn'}):")
    shap.force_plot(explainer.expected_value, shap_values[i], X_test_transformed[i],
                    feature_names=shap_feature_names, matplotlib=True, show=True)

In [None]:
print("\nVisualisasi Wawasan Tambahan...")

# Example: box plot comparing the distribution of one feature between the
# churn and non-churn groups. Swap 'feature_0' for whichever feature the SHAP
# analysis flags as relevant in your own data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='target_churn', y='feature_0', ax=ax)
ax.set_title('Distribusi Feature_0 berdasarkan Churn')
ax.set_xlabel('Churn (0: No, 1: Yes)')
ax.set_ylabel('Feature_0 Value')
plt.show()

# Example (disabled): scatter plot of two features coloured by churn, useful
# when a relationship between the two features is suspected.
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x='feature_1', y='feature_2', hue='target_churn', data=df)
# plt.title('Hubungan antara Feature_1 dan Feature_2')
# plt.show()