Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

Dataset overview

In [2]:
# Directory holding the labeled CSV exports; kept in one place so it only has to be edited once.
# NOTE(review): this is an absolute, machine-specific Windows path — the notebook will not run
# elsewhere until it is pointed at a local copy of the data.
PROCESSED_DATA_DIR = r'C:\Users\afifn\Documents\TES MULTIMEDIA SOLUSI PRIMA\Dataset\processed'

# Load the labeled Panel A dataset
panel_a = pd.read_csv(rf'{PROCESSED_DATA_DIR}\panelAs_labeled.csv')
print(f"Panel A loaded - Shape: {panel_a.shape}")

# Load the labeled Panel B dataset
panel_b = pd.read_csv(rf'{PROCESSED_DATA_DIR}\panelBs_labeled.csv')
print(f"Panel B loaded - Shape: {panel_b.shape}")

print("\n" + "="*80)
print("DATASET OVERVIEW")
print("="*80)

print("\n[Panel A] First 5 rows:")
print(panel_a.head())
print("\n[Panel A] Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
panel_a.info()
print("\n[Panel A] Missing Values:")
print(panel_a.isnull().sum())

print("\n[Panel B] First 5 rows:")
print(panel_b.head())
print("\n[Panel B] Info:")
panel_b.info()
print("\n[Panel B] Missing Values:")
print(panel_b.isnull().sum())

print(f"\n[Panel A] Total samples: {len(panel_a)}")
print(f"[Panel B] Total samples: {len(panel_b)}")

# Class balance of the target column in each panel
print("\nDistribusi Label Panel A:")
print(panel_a['quality_label'].value_counts())
print("\nDistribusi Label Panel B:")
print(panel_b['quality_label'].value_counts())


Panel A loaded - Shape: (1485410, 9)
Panel B loaded - Shape: (1993401, 10)

DATASET OVERVIEW

[Panel A] First 5 rows:
       id                      timestamp  flow1  turbidity    ph       tds  \
0  392388  1970-01-01 07:34:52.135 +0700    0.0   6.682701  7.85  154.5025   
1  392389  1970-01-01 07:34:57.134 +0700    0.0   6.687761  7.85  154.5198   
2  392390  1970-01-01 07:35:06.931 +0700    0.0   6.691935  7.85  154.5198   
3  392391  1970-01-01 07:35:12.283 +0700    0.0   6.685376  7.85  154.4954   
4  392392  1970-01-01 07:35:17.283 +0700    0.0   6.679331  7.85  154.5081   

                       createdAt                      updatedAt quality_label  
0  2025-02-17 16:13:52.755 +0700  2025-02-17 16:13:52.755 +0700     Tidak Ada  
1  2025-02-17 16:13:57.747 +0700  2025-02-17 16:13:57.747 +0700     Tidak Ada  
2  2025-02-17 16:14:07.523 +0700  2025-02-17 16:14:07.523 +0700     Tidak Ada  
3  2025-02-17 16:14:12.883 +0700  2025-02-17 16:14:12.883 +0700     Tidak Ada  
4  2025-02-17

Preprocessing: Handle Missing Values

In [3]:
# Panel A: flow1 has missing values (108591 rows).
# Strategy: forward-fill first, then backward-fill whatever is still missing at the start.
# NOTE(review): forward-fill only makes sense if the rows are in chronological order —
# confirm the CSV is sorted by timestamp before relying on this.
print(f"\nPanel A - Missing flow1 sebelum: {panel_a['flow1'].isna().sum()}")
# Series.ffill()/bfill() replace fillna(method=...), which pandas has deprecated.
panel_a['flow1'] = panel_a['flow1'].ffill().bfill()
print(f"Panel A - Missing flow1 sesudah: {panel_a['flow1'].isna().sum()}")

# Panel B: flow2 has a very large number of missing values (1562061 rows).
# Strategy: forward-fill, then backward-fill; dropping the column is the alternative
# if the imputed share turns out to be too high.
print(f"\nPanel B - Missing flow2 sebelum: {panel_b['flow2'].isna().sum()}")
panel_b['flow2'] = panel_b['flow2'].ffill().bfill()
# If the column was entirely NaN nothing can be propagated; fall back to 0 (assumption: no flow).
panel_b['flow2'] = panel_b['flow2'].fillna(0)
print(f"Panel B - Missing flow2 sesudah: {panel_b['flow2'].isna().sum()}")


Panel A - Missing flow1 sebelum: 108591
Panel A - Missing flow1 sesudah: 0

Panel B - Missing flow2 sebelum: 1562061
Panel B - Missing flow2 sesudah: 0


Feature Selection

In [4]:
# Drop every row whose label is "Tidak Ada" before building the modelling tables.
panel_a = panel_a.loc[panel_a['quality_label'].ne("Tidak Ada")]
panel_b = panel_b.loc[panel_b['quality_label'].ne("Tidak Ada")]

# Input columns per panel; Panel B uses the same four sensors plus flow2.
features_a = ['flow1', 'turbidity', 'ph', 'tds']
features_b = features_a + ['flow2']

# Feature matrix and target vector for Panel A
X_a, y_a = panel_a[features_a], panel_a['quality_label']

# Feature matrix and target vector for Panel B
X_b, y_b = panel_b[features_b], panel_b['quality_label']

print(f"\nPanel A Features: {features_a}")
print(f"Panel B Features: {features_b}")



Panel A Features: ['flow1', 'turbidity', 'ph', 'tds']
Panel B Features: ['flow1', 'turbidity', 'ph', 'tds', 'flow2']


Label encoding

In [5]:
# Encode the string quality labels as integers, with one independent encoder per panel
# so each panel's class list can be recovered (and saved) separately.
le_a, le_b = LabelEncoder(), LabelEncoder()

y_a_encoded = le_a.fit_transform(y_a)
y_b_encoded = le_b.fit_transform(y_b)

# classes_ lists the original labels in the order of their integer codes.
print(f"\nPanel A - Label classes: {le_a.classes_}")
print(f"Panel B - Label classes: {le_b.classes_}")



Panel A - Label classes: ['Biru' 'Coklat' 'Orange' 'Putih']
Panel B - Label classes: ['Biru' 'Coklat' 'Orange' 'Putih']


STRATIFIED SPLIT

In [6]:
# Settings shared by both panels: 80/20 split with a fixed seed for reproducibility.
split_options = dict(test_size=0.2, random_state=42)

# Stratify on the encoded target so train and test keep the same class proportions.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_a, y_a_encoded, stratify=y_a_encoded, **split_options
)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b_encoded, stratify=y_b_encoded, **split_options
)

print(f"\nPanel A - Train: {X_train_a.shape}, Test: {X_test_a.shape}")
print(f"Panel B - Train: {X_train_b.shape}, Test: {X_test_b.shape}")

# Per-class sample counts in each split, as (class ids, counts) tuples
print(f"\nPanel A - Train labels: {np.unique(y_train_a, return_counts=True)}")
print(f"Panel A - Test labels: {np.unique(y_test_a, return_counts=True)}")
print(f"\nPanel B - Train labels: {np.unique(y_train_b, return_counts=True)}")
print(f"Panel B - Test labels: {np.unique(y_test_b, return_counts=True)}")



Panel A - Train: (385400, 4), Test: (96350, 4)
Panel B - Train: (1493848, 5), Test: (373462, 5)

Panel A - Train labels: (array([0, 1, 2, 3]), array([  8295, 136083,   4972, 236050], dtype=int64))
Panel A - Test labels: (array([0, 1, 2, 3]), array([ 2074, 34021,  1243, 59012], dtype=int64))

Panel B - Train labels: (array([0, 1, 2, 3]), array([  3731, 539233,   3786, 947098], dtype=int64))
Panel B - Test labels: (array([0, 1, 2, 3]), array([   933, 134808,    946, 236775], dtype=int64))


Feature Scaling

In [None]:
# Panel A: learn the scaling statistics from the training split only,
# then apply that same transformation to both splits.
scaler_a = StandardScaler().fit(X_train_a)
X_train_a_scaled = scaler_a.transform(X_train_a)
X_test_a_scaled = scaler_a.transform(X_test_a)
print("Panel A - Features scaled")

# Panel B: same procedure with its own scaler.
scaler_b = StandardScaler().fit(X_train_b)
X_train_b_scaled = scaler_b.transform(X_train_b)
X_test_b_scaled = scaler_b.transform(X_test_b)
print("Panel B - Features scaled")


Panel A - Features scaled
Panel B - Features scaled


Model training & Evaluation

In [14]:
def _save_confusion_matrix_plot(cm, class_names, model_name, panel_name, accuracy):
    """Draw a test confusion matrix as a heatmap, save it as a PNG and return the file name."""
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        fig.suptitle(f'{model_name} - {panel_name}', fontsize=16, fontweight='bold')
        sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax)
        ax.set_title(f"Test Confusion Matrix\nAccuracy: {accuracy:.4f}")
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

        fig.tight_layout()
        cm_filename = f'cm_test_{panel_name.replace(" ", "_")}_{model_name.replace(" ", "_")}.png'
        fig.savefig(cm_filename, dpi=300, bbox_inches='tight')
    finally:
        # Close this exact figure (not whatever happens to be "current"), even if saving failed,
        # so figures do not pile up in memory across models.
        plt.close(fig)
    return cm_filename


def train_and_evaluate(X_train, y_train, X_test, y_test, label_encoder, panel_name, scaler):
    """Train three classifiers on one panel, report test metrics and persist the artifacts.

    Fits Logistic Regression, Random Forest and XGBoost on the training data, prints
    train/test accuracy, the test confusion matrix and the classification report for
    each, and saves a confusion-matrix PNG plus a pickled model per classifier.
    The model with the highest test accuracy (first one wins on ties) is saved again
    as the "best" model, together with ``scaler`` and ``label_encoder``.

    All files are written with relative names, i.e. into the current working directory.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed straight to ``fit`` / ``predict``.
    y_train, y_test : array-like of int
        Integer-encoded targets; code ``i`` must correspond to
        ``label_encoder.classes_[i]``.
    label_encoder : fitted LabelEncoder
        Supplies the class names for the report and the plot; also pickled.
    panel_name : str
        Used in printed headings and (with spaces replaced by ``_``) in file names.
    scaler : object
        Only pickled next to the best model; it is not applied inside this function.

    Returns
    -------
    tuple
        ``(results, best_model_name, best_test_accuracy)`` where ``results`` maps each
        model name to a dict with keys ``model``, ``train_accuracy``, ``test_accuracy``,
        ``cm_test`` and ``y_test_pred``.
    """
    print("\n" + "="*60)
    print(f"TRAINING MODELS - {panel_name}")
    print("="*60)

    class_names = label_encoder.classes_
    # Explicit label ids keep the confusion matrix and the report aligned with class_names
    # even when some class never appears in y_test or in the predictions.
    class_ids = list(range(len(class_names)))

    # Models with minimal hyperparameters.
    # `use_label_encoder` is intentionally not passed to XGBClassifier: it is a deprecated
    # argument and the targets are expected to be integer-encoded already.
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced'),
        'XGBoost': XGBClassifier(random_state=42, n_estimators=100, eval_metric='mlogloss')
    }

    results = {}

    for name, model in models.items():
        print(f"\n{'='*50}")
        print(f"MODEL: {name}")
        print(f"{'='*50}")

        # Training
        model.fit(X_train, y_train)

        # Training-set predictions are only used for the accuracy figure
        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)

        # Test-set predictions
        y_test_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # Confusion matrix is computed for the test split only
        cm_test = confusion_matrix(y_test, y_test_pred, labels=class_ids)

        results[name] = {
            'model': model,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'cm_test': cm_test,
            'y_test_pred': y_test_pred
        }

        print(f"\nTrain Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy:  {test_accuracy:.4f}")
        print(f"\nTest Confusion Matrix:\n{cm_test}")
        print(f"\nClassification Report (Test):")
        print(classification_report(y_test, y_test_pred, labels=class_ids, target_names=class_names))

        # Save the test confusion matrix as an image
        cm_filename = _save_confusion_matrix_plot(cm_test, class_names, name, panel_name, test_accuracy)
        print(f"\n✓ Confusion matrix saved: {cm_filename}")

        # Persist every trained model, not just the best one
        model_filename = f'model_{panel_name.replace(" ", "_")}_{name.replace(" ", "_")}.pkl'
        joblib.dump(model, model_filename)
        print(f"✓ Model saved: {model_filename}")

    # Pick the best model after the loop: max() returns the first maximal entry, which matches
    # a strict ">" comparison on ties, and it always yields a name even if every accuracy is 0.
    # NOTE(review): selecting on test accuracy makes the reported "best" accuracy optimistic,
    # and accuracy is dominated by the majority classes on imbalanced data — consider a
    # validation split and a macro-averaged metric.
    best_model_name = max(results, key=lambda model_name: results[model_name]['test_accuracy'])
    best_test_accuracy = results[best_model_name]['test_accuracy']

    # Summary
    print("\n" + "="*60)
    print(f"SUMMARY - {panel_name}")
    print("="*60)
    print(f"\n{'Model':<25} {'Train Acc':<15} {'Test Acc':<15}")
    print("-" * 55)
    for name, result in results.items():
        marker = " ← BEST" if name == best_model_name else ""
        print(f"{name:<25} {result['train_accuracy']:<15.4f} {result['test_accuracy']:<15.4f}{marker}")

    # Save the best model together with the scaler and label encoder needed at inference time
    best_model = results[best_model_name]['model']
    best_model_filename = f'best_model_{panel_name.replace(" ", "_")}.pkl'
    scaler_filename = f'scaler_{panel_name.replace(" ", "_")}.pkl'
    label_encoder_filename = f'label_encoder_{panel_name.replace(" ", "_")}.pkl'

    joblib.dump(best_model, best_model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(label_encoder, label_encoder_filename)

    print(f"\n{'='*60}")
    print(f"BEST MODEL SAVED - {panel_name}")
    print(f"{'='*60}")
    print(f"✓ Best Model: {best_model_name}")
    print(f"✓ Test Accuracy: {best_test_accuracy:.4f}")
    print(f"\nFiles saved:")
    print(f"  - {best_model_filename}")
    print(f"  - {scaler_filename}")
    print(f"  - {label_encoder_filename}")

    return results, best_model_name, best_test_accuracy

# Train and evaluate all models on Panel A (scaled features, encoded labels)
results_a, best_model_a, best_acc_a = train_and_evaluate(
    X_train=X_train_a_scaled,
    y_train=y_train_a,
    X_test=X_test_a_scaled,
    y_test=y_test_a,
    label_encoder=le_a,
    panel_name="Panel A",
    scaler=scaler_a,
)

# Train and evaluate all models on Panel B
results_b, best_model_b, best_acc_b = train_and_evaluate(
    X_train=X_train_b_scaled,
    y_train=y_train_b,
    X_test=X_test_b_scaled,
    y_test=y_test_b,
    label_encoder=le_b,
    panel_name="Panel B",
    scaler=scaler_b,
)

# Final recap of the winning model per panel
banner = "=" * 60
print("\n" + banner)
print("TRAINING SELESAI!")
print(banner)
print(f"\nPanel A Best Model: {best_model_a} (Accuracy: {best_acc_a:.4f})")
print(f"Panel B Best Model: {best_model_b} (Accuracy: {best_acc_b:.4f})")


TRAINING MODELS - Panel A

MODEL: Logistic Regression

Train Accuracy: 0.9032
Test Accuracy:  0.9017

Test Confusion Matrix:
[[ 1810   239    25     0]
 [ 3199 26828   129  3865]
 [  100    30  1099    14]
 [    0     0  1867 57145]]

Classification Report (Test):
              precision    recall  f1-score   support

        Biru       0.35      0.87      0.50      2074
      Coklat       0.99      0.79      0.88     34021
      Orange       0.35      0.88      0.50      1243
       Putih       0.94      0.97      0.95     59012

    accuracy                           0.90     96350
   macro avg       0.66      0.88      0.71     96350
weighted avg       0.94      0.90      0.91     96350


✓ Confusion matrix saved: cm_test_Panel_A_Logistic_Regression.png
✓ Model saved: model_Panel_A_Logistic_Regression.pkl

MODEL: Random Forest

Train Accuracy: 1.0000
Test Accuracy:  1.0000

Test Confusion Matrix:
[[ 2074     0     0     0]
 [    0 34021     0     0]
 [    0     0  1243     0]
 [   