In [12]:
# %% [markdown]
# # Model Evaluation - ripenessVision
# 
# Evaluasi mendalam model yang sudah ditraining

# %%
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle
import cv2


In [13]:
# %%
# Setup
BASE_DIR = Path('..')
MODELS_DIR = BASE_DIR / 'models'
RESULTS_DIR = BASE_DIR / 'results'
PROCESSED_DIR = BASE_DIR / 'data' / 'processed'

# %%
# Load model and data
model = keras.models.load_model(str(MODELS_DIR / 'best_model.h5'))

test_df = pd.read_csv(PROCESSED_DIR / 'test_metadata.csv')

with open(PROCESSED_DIR / 'data_splits.json', 'r') as f:
    split_info = json.load(f)

classes = split_info['classes']

In [14]:
# %%
# Load model and data dengan error handling
try:
    # Load model dengan compile=False untuk menghindari warning
    model = keras.models.load_model(
        str(MODELS_DIR / 'best_model.h5'), 
        compile=False  # TAMBAH INI
    )
    
    # Recompile model setelah load
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    print("✅ Model loaded and recompiled successfully!")
    
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Trying alternative loading method...")
    
    # Alternative loading method
    try:
        model = keras.models.load_model(str(MODELS_DIR / 'best_model.h5'))
        print("✅ Model loaded with default method!")
    except:
        print("❌ Still failed. Checking available models...")
        
        # List semua model yang available
        model_files = list(MODELS_DIR.glob('*.h5'))
        print("Available model files:")
        for model_file in model_files:
            print(f"  - {model_file.name}")
        
        # Load model lain jika best_model tidak ada
        if model_files:
            model = keras.models.load_model(str(model_files[0]))
            print(f"✅ Loaded alternative model: {model_files[0].name}")
        else:
            raise FileNotFoundError("No model files found!")

# Load test data
try:
    test_df = pd.read_csv(PROCESSED_DIR / 'test_metadata.csv')
    print(f"✅ Test data loaded: {len(test_df)} samples")
except:
    print("❌ test_metadata.csv not found, trying filtered version...")
    test_df = pd.read_csv(PROCESSED_DIR / 'test_metadata_filtered.csv')
    print(f"✅ Filtered test data loaded: {len(test_df)} samples")

# Load class information
try:
    with open(PROCESSED_DIR / 'data_splits.json', 'r') as f:
        split_info = json.load(f)
    classes = split_info['classes']
    print(f"✅ Classes loaded: {classes}")
except:
    print("❌ data_splits.json not found, extracting from dataframe...")
    classes = sorted(test_df['class'].unique().tolist())
    print(f"✅ Classes extracted from dataframe: {classes}")

# Verifikasi model dan data compatibility
print(f"\n📊 MODEL-DATA COMPATIBILITY CHECK:")
print(f"Model input shape: {model.input_shape}")
print(f"Model output shape: {model.output_shape}")
print(f"Number of classes in data: {len(classes)}")
print(f"Model output classes: {model.output_shape[1]}")

if model.output_shape[1] != len(classes):
    print("⚠️  WARNING: Model output dimension doesn't match number of classes!")
    print("This might cause issues during evaluation.")
else:
    print("✅ Model and data classes are compatible!")

✅ Model loaded and recompiled successfully!
❌ test_metadata.csv not found, trying filtered version...


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\processed\\test_metadata_filtered.csv'

In [None]:
# %%
# Create test data generator
test_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='file_path',
    y_col='class',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    shuffle=False,
    classes=classes
)


Found 0 validated image filenames belonging to 9 classes.




In [None]:
# %%
# Comprehensive evaluation dengan error handling yang diperbaiki
print("=== COMPREHENSIVE MODEL EVALUATION ===")

def safe_evaluate_model(model, test_df, classes, batch_size=16):
    """
    Evaluate model dengan safety mechanism - FIXED VERSION
    """
    print("🔍 Filtering test data for corrupt images...")
    
    # Filter test data untuk hanya include valid images
    valid_test_indices = []
    for idx, row in test_df.iterrows():
        try:
            with Image.open(row['file_path']) as img:
                img.verify()
            if os.path.getsize(row['file_path']) > 0:
                valid_test_indices.append(idx)
        except:
            print(f"Removing corrupt test image: {row['file_path']}")
            continue
    
    valid_test_df = test_df.loc[valid_test_indices]
    print(f"✅ Valid test images: {len(valid_test_df)}/{len(test_df)}")
    
    # CHECK: Pastikan kolom 'class' ada di DataFrame
    print("📋 Checking DataFrame columns...")
    print(f"Available columns: {valid_test_df.columns.tolist()}")
    
    # Jika kolom 'class' tidak ada, coba kolom alternatif
    if 'class' not in valid_test_df.columns:
        print("⚠️  Column 'class' not found. Looking for alternative columns...")
        
        # Coba kolom alternatif yang mungkin ada
        possible_class_columns = ['class', 'label', 'category', 'ripeness_class']
        class_column_found = None
        
        for col in possible_class_columns:
            if col in valid_test_df.columns:
                class_column_found = col
                print(f"✅ Found alternative class column: '{col}'")
                break
        
        if class_column_found is None:
            # Jika tidak ada kolom class, buat dari kombinasi fruit dan ripeness
            print("🔄 Creating 'class' column from fruit and ripeness...")
            if 'fruit' in valid_test_df.columns and 'ripeness' in valid_test_df.columns:
                valid_test_df['class'] = valid_test_df['fruit'] + '_' + valid_test_df['ripeness']
                class_column_found = 'class'
                print("✅ Created 'class' column successfully")
            else:
                raise KeyError("No class column found and cannot create one from available columns")
    else:
        class_column_found = 'class'
    
    # Create safe test generator dengan kolom yang benar
    test_datagen = ImageDataGenerator(rescale=1./255)
    
    print(f"📊 Creating generator with class column: '{class_column_found}'")
    
    test_generator = test_datagen.flow_from_dataframe(
        valid_test_df,
        x_col='file_path',
        y_col=class_column_found,  # Gunakan kolom yang ditemukan
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False,
        classes=classes
    )
    
    # Evaluate model
    print("📊 Evaluating model...")
    if len(valid_test_df) > 0:
        test_loss, test_accuracy = model.evaluate(test_generator, verbose=1)
    else:
        print("❌ No valid test images available for evaluation!")
        test_loss, test_accuracy = 0, 0
    
    return test_loss, test_accuracy, valid_test_df

# Jalankan safe evaluation
try:
    test_loss, test_accuracy, valid_test_df = safe_evaluate_model(model, test_df, classes)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f}")
except Exception as e:
    print(f"❌ Evaluation failed: {e}")
    print("Trying alternative evaluation method...")

=== COMPREHENSIVE MODEL EVALUATION ===
🔍 Filtering test data for corrupt images...
Removing corrupt test image: ..\data\raw\mango\unripe\unripe_mango_high_quality_photo_86.jpg
Removing corrupt test image: ..\data\raw\tomato\overripe\overripe_tomato_high_quality_photo_75.jpg
Removing corrupt test image: ..\data\raw\banana\overripe\overripe_banana_high_quality_photo_58.jpg
Removing corrupt test image: ..\data\raw\tomato\overripe\overripe_tomato_high_quality_photo_20.jpg
Removing corrupt test image: ..\data\raw\mango\unripe\unripe_mango_high_quality_photo_30.jpg
Removing corrupt test image: ..\data\raw\mango\unripe\unripe_mango_high_quality_photo_23.jpg
Removing corrupt test image: ..\data\raw\tomato\overripe\overripe_tomato_high_quality_photo_31.jpg
Removing corrupt test image: ..\data\raw\banana\unripe\unripe_banana_high_quality_photo_0.jpg
Removing corrupt test image: ..\data\raw\mango\unripe\unripe_mango_high_quality_photo_62.jpg
Removing corrupt test image: ..\data\raw\mango\ripe\rip

In [None]:
# %%
# Predictions
y_true = test_generator.classes
y_pred_proba = model.predict(test_generator)
y_pred = np.argmax(y_pred_proba, axis=1)

  self._warn_if_super_not_called()


ValueError: The PyDataset has length 0

In [None]:
# %%
# Detailed classification report
print("Detailed Classification Report:")
print(classification_report(y_true, y_pred, target_names=classes))

In [None]:
# %%
# Enhanced confusion matrix
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=classes, yticklabels=classes,
            cbar_kws={'shrink': 0.8})
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# %%
# Normalized confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(12, 10))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=classes, yticklabels=classes,
            cbar_kws={'shrink': 0.8})
plt.title('Normalized Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# %%
# ROC Curve for multiclass
def plot_multiclass_roc(y_true, y_pred_proba, classes):
    # Binarize the output
    y_true_bin = label_binarize(y_true, classes=range(len(classes)))
    
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Plot all ROC curves
    plt.figure(figsize=(10, 8))
    colors = cycle(['blue', 'red', 'green', 'yellow', 'purple', 'orange', 'pink', 'brown', 'gray'])
    
    for i, color in zip(range(len(classes)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                label='ROC curve of class {0} (area = {1:0.2f})'
                ''.format(classes[i], roc_auc[i]))
    
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multiclass ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
    
    return roc_auc

In [None]:
# %%
# Plot ROC curve
roc_auc = plot_multiclass_roc(y_true, y_pred_proba, classes)
print("AUC Scores for each class:")
for i, class_name in enumerate(classes):
    print(f"  {class_name}: {roc_auc[i]:.4f}")


In [None]:
# %%
# Per-class accuracy
class_accuracy = {}
for i, class_name in enumerate(classes):
    class_mask = y_true == i
    if np.sum(class_mask) > 0:
        class_acc = np.mean(y_pred[class_mask] == y_true[class_mask])
        class_accuracy[class_name] = class_acc

In [None]:
# %%
# Plot per-class accuracy
plt.figure(figsize=(12, 6))
bars = plt.bar(range(len(class_accuracy)), list(class_accuracy.values()), 
               color='skyblue', edgecolor='black')
plt.xticks(range(len(class_accuracy)), list(class_accuracy.keys()), rotation=45)
plt.ylabel('Accuracy')
plt.title('Per-Class Accuracy')
plt.grid(True, alpha=0.3)

# Add value labels on bars
for bar, acc in zip(bars, class_accuracy.values()):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f'{acc:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# %%
# Error analysis: misclassified samples
misclassified_mask = y_pred != y_true
misclassified_indices = np.where(misclassified_mask)[0]

print(f"Total misclassified samples: {len(misclassified_indices)}")
print(f"Error rate: {len(misclassified_indices)/len(y_true):.4f}")

In [None]:
# %%
# Analyze misclassifications
misclassification_analysis = []
for idx in misclassified_indices[:20]:  # Show first 20
    true_class = classes[y_true[idx]]
    pred_class = classes[y_pred[idx]]
    confidence = np.max(y_pred_proba[idx])
    
    misclassification_analysis.append({
        'true_class': true_class,
        'pred_class': pred_class,
        'confidence': confidence,
        'file_path': test_df.iloc[idx]['file_path']
    })

misclassification_df = pd.DataFrame(misclassification_analysis)
print("Sample misclassifications:")
print(misclassification_df.head(10))

In [None]:
# %%
# Visualize misclassified samples
def visualize_misclassifications(misclassification_df, num_samples=8):
    fig, axes = plt.subplots(2, 4, figsize=(15, 8))
    axes = axes.ravel()
    
    for i in range(min(num_samples, len(misclassification_df))):
        sample = misclassification_df.iloc[i]
        img = keras.preprocessing.image.load_img(sample['file_path'])
        img_array = keras.preprocessing.image.img_to_array(img)
        
        axes[i].imshow(img_array / 255.0)
        axes[i].set_title(f"True: {sample['true_class']}\nPred: {sample['pred_class']}\nConf: {sample['confidence']:.3f}")
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.show()

visualize_misclassifications(misclassification_df)

In [None]:
# %%
# Confidence analysis
correct_confidences = []
incorrect_confidences = []

for i in range(len(y_true)):
    confidence = np.max(y_pred_proba[i])
    if y_pred[i] == y_true[i]:
        correct_confidences.append(confidence)
    else:
        incorrect_confidences.append(confidence)


In [None]:
# %%
# Plot confidence distributions
plt.figure(figsize=(10, 6))
plt.hist(correct_confidences, bins=30, alpha=0.7, label='Correct', color='green')
plt.hist(incorrect_confidences, bins=30, alpha=0.7, label='Incorrect', color='red')
plt.xlabel('Prediction Confidence')
plt.ylabel('Frequency')
plt.title('Prediction Confidence Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Average confidence - Correct: {np.mean(correct_confidences):.4f}")
print(f"Average confidence - Incorrect: {np.mean(incorrect_confidences):.4f}")

In [None]:
# %%
# Save evaluation results
evaluation_results = {
    'test_accuracy': float(test_accuracy),
    'test_loss': float(test_loss),
    'per_class_accuracy': class_accuracy,
    'auc_scores': {classes[i]: float(roc_auc[i]) for i in range(len(classes))},
    'confusion_matrix': cm.tolist(),
    'misclassification_count': int(len(misclassified_indices)),
    'error_rate': float(len(misclassified_indices)/len(y_true))
}

with open(RESULTS_DIR / 'evaluation_results.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)

# Save misclassification analysis
misclassification_df.to_csv(RESULTS_DIR / 'misclassification_analysis.csv', index=False)

print("Evaluation completed! Results saved.")