# Debug: Validasi Random Forest Match dengan Colab
Notebook untuk membandingkan hasil RF sistem dengan Colab dan mengidentifikasi penyebab perbedaan.


In [2]:
import pandas as pd
import numpy as np
from statistics import mode
from sklearn.metrics import classification_report, accuracy_score, f1_score
import sys
import os

# Pastikan bisa import dari local models
sys.path.insert(0, r'd:\jep\PCA PDAM\GUI')
from models.random_forest import (
    apply_rf_manual, 
    evaluate_predictions, 
    DEFAULT_AMBANG,
    split_train_test_colab_like
)

print("‚úÖ Library berhasil diimport")


‚úÖ Library berhasil diimport


## Section 1: Setup Paths dan Load Data KMeans
Tentukan lokasi file output dari KMeans untuk digunakan dalam RF test.


In [3]:
# UBAH PATH INI SESUAI LOKASI FILE KMEANS OUTPUT ANDA
kmeans_output_path = r"d:\jep\PCA PDAM\GUI\outputs\clustering\<nama_file_kmeans>.xlsx"

# Cari file terakhir di outputs/clustering jika pathnya belum tahu
import glob
clustering_outputs = glob.glob(r"d:\jep\PCA PDAM\GUI\outputs\clustering\*.xlsx")
if clustering_outputs:
    kmeans_output_path = max(clustering_outputs, key=os.path.getctime)
    print(f"‚úÖ Menggunakan file KMeans terbaru: {kmeans_output_path}")
else:
    print("‚ö†Ô∏è  Tidak ada file KMeans ditemukan! Silakan run KMeans terlebih dahulu.")

# Load data
try:
    df_raw = pd.read_excel(kmeans_output_path, sheet_name="Data+Cluster")
    print(f"‚úÖ Data loaded: {len(df_raw)} baris, {df_raw.shape[1]} kolom")
    print(f"Kolom: {list(df_raw.columns)}")
    print(f"\nData preview:")
    print(df_raw.head())
except Exception as e:
    print(f"‚ùå Error loading data: {e}")


‚ö†Ô∏è  Tidak ada file KMeans ditemukan! Silakan run KMeans terlebih dahulu.
‚ùå Error loading data: [Errno 2] No such file or directory: 'd:\\jep\\PCA PDAM\\GUI\\outputs\\clustering\\<nama_file_kmeans>.xlsx'


## Section 2: Persiapan Data - Rename Kolom Cluster & Ambang
Standarisasi nama kolom dan gunakan ambang Colab.


In [None]:
# Standardisasi nama kolom
if "Cluster Aktual" not in df_raw.columns and "Cluster" in df_raw.columns:
    df_raw.rename(columns={"Cluster": "Cluster Aktual"}, inplace=True)
    print("‚úÖ Kolom 'Cluster' direname ke 'Cluster Aktual'")

# Pastikan kolom PCA ada
pca_cols = ["PCA1", "PCA2", "PCA3", "PCA4"]
missing_cols = [c for c in pca_cols if c not in df_raw.columns]
if missing_cols:
    print(f"‚ùå Kolom PCA tidak lengkap. Missing: {missing_cols}")
else:
    print(f"‚úÖ Semua kolom PCA ada")

# Ambang Colab
ambang_colab = {
    'PCA1': {'low': -0.471222, 'mid': 0.740738},
    'PCA2': {'low': -0.496107, 'mid': 0.799846},
    'PCA3': {'low': -0.143652, 'mid': 0.300361},
    'PCA4': {'low': -0.206872, 'mid': 0.633076},
}

print(f"\n‚úÖ Ambang Colab siap: {ambang_colab}")


## Section 3: Split Data Train/Test dengan Metode Colab-like


In [None]:
train_ratio = 0.8
random_state = 42

# Split menggunakan fungsi Colab-like dari RF module
df_train, df_test = split_train_test_colab_like(
    df_raw,
    train_ratio=train_ratio,
    random_state=random_state
)

print(f"‚úÖ Data split berhasil:")
print(f"  - Training: {len(df_train)} baris ({len(df_train)/len(df_raw)*100:.1f}%)")
print(f"  - Testing:  {len(df_test)} baris ({len(df_test)/len(df_raw)*100:.1f}%)")
print(f"  - Total:    {len(df_train) + len(df_test)} baris")

# Cek distribusi cluster
print(f"\nüìä Distribusi Cluster di Training:")
print(df_train["Cluster Aktual"].value_counts().sort_index())
print(f"\nüìä Distribusi Cluster di Testing:")
print(df_test["Cluster Aktual"].value_counts().sort_index())


## Section 4: Apply RF Manual ke Training Data


In [None]:
try:
    df_train_pred = apply_rf_manual(df_train.copy(), ambang_colab)
    print("‚úÖ RF Manual applied ke training data")
    print(f"Kolom hasil: {[c for c in df_train_pred.columns if 'Tree' in c or 'Voting' in c]}")
    print(f"\nSample predictions (training):")
    print(df_train_pred[["PCA1", "Cluster Aktual", "Tree1", "Tree2", "Tree3", "Tree4", "Tree5", "Voting Mayoritas"]].head(10))
except Exception as e:
    print(f"‚ùå Error apply RF: {e}")
    import traceback
    traceback.print_exc()


## Section 5: Evaluasi Training Data


In [None]:
labels = ["C1", "C2", "C3"]

try:
    report_train, acc_train, macro_f1_train = evaluate_predictions(
        df_train_pred,
        actual_col="Cluster Aktual",
        pred_col="Voting Mayoritas",
        labels=labels
    )
    
    print("="*60)
    print("üìä EVALUASI TRAINING DATA")
    print("="*60)
    print(f"\n‚úÖ Accuracy:  {acc_train:.4f}")
    print(f"‚úÖ Macro F1:  {macro_f1_train:.4f}")
    
    print(f"\nüìã Classification Report:")
    report_df = pd.DataFrame(report_train).transpose()
    print(report_df[["precision", "recall", "f1-score", "support"]])
    
except Exception as e:
    print(f"‚ùå Error evaluasi: {e}")
    import traceback
    traceback.print_exc()


## Section 6: Apply RF Manual ke Testing Data


In [None]:
try:
    df_test_pred = apply_rf_manual(df_test.copy(), ambang_colab)
    print("‚úÖ RF Manual applied ke testing data")
    print(f"\nSample predictions (testing):")
    print(df_test_pred[["PCA1", "Cluster Aktual", "Tree1", "Tree2", "Tree3", "Tree4", "Tree5", "Voting Mayoritas"]].head(10))
except Exception as e:
    print(f"‚ùå Error apply RF ke test data: {e}")
    import traceback
    traceback.print_exc()


## Section 7: Evaluasi Testing Data


In [None]:
try:
    report_test, acc_test, macro_f1_test = evaluate_predictions(
        df_test_pred,
        actual_col="Cluster Aktual",
        pred_col="Voting Mayoritas",
        labels=labels
    )
    
    print("="*60)
    print("üìä EVALUASI TESTING DATA")
    print("="*60)
    print(f"\n‚úÖ Accuracy:  {acc_test:.4f}")
    print(f"‚úÖ Macro F1:  {macro_f1_test:.4f}")
    
    print(f"\nüìã Classification Report:")
    report_df_test = pd.DataFrame(report_test).transpose()
    print(report_df_test[["precision", "recall", "f1-score", "support"]])
    
except Exception as e:
    print(f"‚ùå Error evaluasi: {e}")
    import traceback
    traceback.print_exc()


## Section 8: Perbandingan Sistem vs Colab
Bandingkan hasil sistem dengan hasil yang Anda dapatkan dari Colab.


In [None]:
print("="*70)
print("üîç RINGKASAN PERBANDINGAN HASIL")
print("="*70)
print("\nüìå TRAINING DATA:")
print(f"   Sistem  ‚Üí Accuracy: {acc_train:.4f}, Macro F1: {macro_f1_train:.4f}")
print(f"   Colab   ‚Üí Accuracy: ???, Macro F1: ???")
print(f"   ‚Üí MASUKKAN HASIL COLAB DI ATAS UNTUK PERBANDINGAN")

print("\nüìå TESTING DATA:")
print(f"   Sistem  ‚Üí Accuracy: {acc_test:.4f}, Macro F1: {macro_f1_test:.4f}")
print(f"   Colab   ‚Üí Accuracy: ???, Macro F1: ???")
print(f"   ‚Üí MASUKKAN HASIL COLAB DI ATAS UNTUK PERBANDINGAN")

print("\n" + "="*70)
print("üí° CATATAN PENTING:")
print("="*70)
print("""
1. Jika hasil masih berbeda, kemungkinan penyebab:
   a) Data input KMeans berbeda (label cluster 0/1/2 vs C1/C2/C3)
   b) Urutan data split berbeda (random_state/seed berbeda)
   c) Nilai PCA memiliki tanda yang berbeda (+/- flip)
   
2. Untuk debugging lebih lanjut:
   - Bandingkan nilai PCA1-PCA4 di kolom tertentu
   - Cek apakah label cluster sama (0/1/2 vs C1/C2/C3)
   - Validasi ambang PCA sudah benar
   
3. Fix yang dapat dicoba:
   - Konfigurasi UI: ubah "Alignment" ke "Off (tanpa auto)"
   - Pastikan split_method = "colab_like"
   - Gunakan data yang sama dengan Colab (file input)
""")
