In [154]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import label_binarize
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [155]:
def train_random_forest(features_file, output_folder):
    """
    Tune and evaluate a Random Forest on the feature table with the color features removed.

    Reads the CSV, drops the columns listed in excluded_columns, uses "Variety" as the
    target, runs a 5-fold grid search on the inner training split, prints validation,
    cross-validation and test metrics, and writes the grid-search table and the plots
    into output_folder.

    Parameters:
    - features_file: Path to the feature CSV. It must contain "Variety" and every
      column named in excluded_columns.
    - output_folder: Directory for the CSV and PNG outputs; created if missing.

    Raises:
    - FileNotFoundError: If features_file does not exist.
    """
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"File not found: {features_file}")
    # exist_ok replaces the separate "exists?" check and its check-then-create race.
    os.makedirs(output_folder, exist_ok=True)

    # Columns that never enter the feature matrix in this experiment:
    # the target, the non-feature columns and every color statistic.
    excluded_columns = [
        "Tree", "Apple", "Variety", "Augmented",
        "Mean_Color_0", "Mean_Color_1", "Mean_Color_2",
        "Std_Color_0", "Std_Color_1", "Std_Color_2",
        "Color_Range_0", "Color_Range_1", "Color_Range_2",
    ]

    # Load data
    df = pd.read_csv(features_file)
    X = df.drop(columns=excluded_columns)
    y = df["Variety"]

    # Split data: 30% test, then 30% of the remainder as validation
    # (roughly 49% train / 21% validation / 30% test), both stratified.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, stratify=y_train_full, random_state=42
    )

    # Random Forest hyperparameter grid
    param_grid = {
        "n_estimators": [50, 100, 200, 400, 600, 800],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None],
    }

    # The search is fitted on the inner training split only; X_val and X_test stay unseen.
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best model and parameters
    best_rf = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    # Validation Accuracy
    val_accuracy = accuracy_score(y_val, best_rf.predict(X_val))
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Re-score the selected configuration with 5-fold CV on the same training split
    # that the grid search used.
    cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring="accuracy")
    cross_val_mean = np.mean(cross_val_scores)
    cross_val_std = np.std(cross_val_scores)
    print(f"Cross-Validation Accuracy: Mean = {cross_val_mean:.4f}, Std = {cross_val_std:.4f}")

    # Test Metrics
    y_test_pred = best_rf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average="weighted")
    recall = recall_score(y_test, y_test_pred, average="weighted")
    f1 = f1_score(y_test, y_test_pred, average="weighted")
    # Print Test Metrics
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Save results
    results = pd.DataFrame(grid_search.cv_results_)
    results_file = f"{output_folder}/random_forest_results.csv"
    results.to_csv(results_file, index=False)

    # Generate plots
    plot_feature_importance(best_rf, X.columns, output_folder)
    plot_confusion_matrix(y_test, y_test_pred, output_folder)
    plot_roc(
        model=best_rf,
        X_test=X_test,
        y_test=y_test,
        # predict_proba columns follow best_rf.classes_ (sorted labels). y.unique() is in
        # order of appearance and would pair ROC curves with the wrong probability column.
        class_names=best_rf.classes_,
        output_folder=output_folder,
    )
    # NOTE(review): plot_line_plot is not defined in the visible part of this notebook;
    # confirm it is defined in an earlier cell.
    plot_line_plot(results, output_folder, param="param_max_features")
    plot_n_estimators(results, output_folder)


In [None]:
def plot_feature_importance(model, feature_names, output_folder):
    """
    Save the model's feature importances as a ranked CSV table and a bar chart.

    Parameters:
    - model: Fitted estimator exposing feature_importances_.
    - feature_names: Names matching the entries of feature_importances_.
    - output_folder: Folder that receives the CSV and the PNG.
    """
    ranked = (
        pd.DataFrame({"Feature": feature_names, "Importance": model.feature_importances_})
        .sort_values(by="Importance", ascending=False)
    )
    ranked.to_csv(f"{output_folder}/random_forest_feature_importance.csv", index=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(ranked["Feature"], ranked["Importance"])
    # Flip the axis so the highest-ranked feature sits at the top of the chart.
    ax.invert_yaxis()
    ax.set_title("Feature Importance (Random Forest)")
    ax.set_xlabel("Importance")
    ax.set_ylabel("Features")
    fig.tight_layout()
    fig.savefig(f"{output_folder}/random_forest_feature_importance.png")
    plt.close(fig)

def plot_confusion_matrix(y_test, y_test_pred, output_folder):
    """
    Generate and save the confusion matrix heatmap.

    Parameters:
    - y_test: True labels.
    - y_test_pred: Predicted labels, aligned with y_test.
    - output_folder: Folder that receives rf_confusion_matrix.png.
    """
    # Use one explicit label list for both the matrix and the tick labels. Without it,
    # confusion_matrix orders rows/columns by the sorted union of true and predicted
    # labels, so ticks taken from y_test alone can be misaligned when a class is
    # predicted that never occurs in y_test.
    labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_test_pred).ravel()])
    )
    cm = confusion_matrix(y_test, y_test_pred, labels=labels)
    plt.figure(figsize=(18, 14))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title("RF Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(f"{output_folder}/rf_confusion_matrix.png", dpi=300)
    plt.close()

def plot_roc_curves(model, X_test, y_test, class_names, output_folder):
    """
    Generate and save one-vs-rest ROC curves for every class in class_names.

    Parameters:
    - model: Fitted classifier exposing classes_ and predict_proba.
    - X_test: Test set features.
    - y_test: Test set true labels.
    - class_names: Class labels to plot, in legend order. Any order is accepted.
    - output_folder: Folder that receives roc_curves.png.

    Raises:
    - ValueError: If class_names contains a label the model was not trained on.
    """
    class_labels = list(class_names)
    # predict_proba columns are ordered like model.classes_, not like class_names, so
    # every label is mapped to its own probability column explicitly.
    proba_column = {label: idx for idx, label in enumerate(model.classes_)}
    missing = [label for label in class_labels if label not in proba_column]
    if missing:
        raise ValueError(f"class_names contains labels unknown to the model: {missing}")

    y_true = np.asarray(y_test)
    y_pred_proba = model.predict_proba(X_test)

    plt.figure(figsize=(10, 8))
    for label in class_labels:
        # One-vs-rest indicator for this class; also valid for two-class problems.
        is_positive = (y_true == label).astype(int)
        fpr, tpr, _ = roc_curve(is_positive, y_pred_proba[:, proba_column[label]])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"Class {label} (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], "k--", lw=2)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/roc_curves.png")
    plt.close()


def plot_roc(model, X_test, y_test, class_names, output_folder):
    """
    Generate and save a single ROC curves plot for the top 5 and bottom 5 classes based on AUC.

    Each class is scored one-vs-rest against its own predict_proba column. When there are
    fewer than 10 classes the two groups are kept disjoint, so no curve is drawn twice.

    Parameters:
    - model: Fitted classifier exposing classes_ and predict_proba.
    - X_test: Test set features.
    - y_test: Test set true labels.
    - class_names: Class labels to consider. Any order is accepted.
    - output_folder: Folder to save the plot.

    Raises:
    - ValueError: If class_names contains a label the model was not trained on.
    """
    class_labels = list(class_names)
    # predict_proba columns are ordered like model.classes_, not like class_names, so
    # every label is mapped to its own probability column explicitly.
    proba_column = {label: idx for idx, label in enumerate(model.classes_)}
    missing = [label for label in class_labels if label not in proba_column]
    if missing:
        raise ValueError(f"class_names contains labels unknown to the model: {missing}")

    y_true = np.asarray(y_test)
    y_pred_proba = model.predict_proba(X_test)

    # Compute AUC for each class
    roc_data = []
    for label in class_labels:
        is_positive = (y_true == label).astype(int)
        fpr, tpr, _ = roc_curve(is_positive, y_pred_proba[:, proba_column[label]])
        roc_data.append((label, fpr, tpr, auc(fpr, tpr)))

    # Sort classes by AUC, best first
    roc_data_sorted = sorted(roc_data, key=lambda entry: entry[3], reverse=True)
    best_roc_data = roc_data_sorted[:5]  # Top 5 classes
    # Bottom 5 classes. Starting no earlier than index 5 matches [-5:] for 10 or more
    # classes and prevents overlap with the top group for fewer.
    worst_roc_data = roc_data_sorted[max(5, len(roc_data_sorted) - 5):]

    # Plot both top 5 and bottom 5 ROC curves
    plt.figure(figsize=(12, 8))

    # Plot top 5 curves
    for class_name, fpr, tpr, roc_auc in best_roc_data:
        plt.plot(fpr, tpr, lw=2, label=f"Top 5: {class_name} (AUC = {roc_auc:.2f})")

    # Plot bottom 5 curves
    for class_name, fpr, tpr, roc_auc in worst_roc_data:
        plt.plot(fpr, tpr, lw=2, linestyle='--', label=f"Worst 5: {class_name} (AUC = {roc_auc:.2f})")

    # Plot diagonal for random classifier
    plt.plot([0, 1], [0, 1], "k--", lw=2, label="Random Classifier")

    # Add plot details
    plt.title("Top 5 and Bottom 5 ROC Curves")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="best", fontsize="small")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/top_and_bottom_5_roc_curves.png")
    plt.close()


def plot_n_estimators(results, output_folder, metric="mean_test_score"):
    """
    Plot the metric averaged per n_estimators value and save the figure.

    Parameters:
    - results: GridSearchCV results as a DataFrame; needs "param_n_estimators" and metric.
    - output_folder: Directory where the plot will be saved.
    - metric: Column to average and plot (default: "mean_test_score").
    """
    # Average the metric over all other hyperparameters for each forest size.
    mean_by_size = (
        results[["param_n_estimators", metric]]
        .groupby("param_n_estimators")[metric]
        .mean()
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(
        mean_by_size["param_n_estimators"],
        mean_by_size[metric],
        marker="o",
        linestyle="-",
        color="b",
    )
    ax.set_title(f"Effect of n_estimators on {metric}")
    ax.set_xlabel("n_estimators")
    ax.set_ylabel(metric)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f"{output_folder}/n_estimators_{metric}.png")
    plt.close(fig)

In [157]:
# Run the experiment without color features; outputs go to their own folder.
train_random_forest(
    features_file="C:/Daten/PA2/Code/Output/apple_features_balanced_FE_Final_REVISED_1012.csv",
    output_folder="C:/Daten/PA2/Code/Output/ModelComparison_RF_1112_NoColor_n800",
)

Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}
Validation Accuracy: 0.8947
Cross-Validation Accuracy: Mean = 0.8534, Std = 0.0202
Test Accuracy: 0.8798
Test Precision: 0.8853
Test Recall: 0.8798
Test F1 Score: 0.8791


Results without Color (recorded from a different run than the cell output above):

Best Parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}

Validation Accuracy: 0.8903

Cross-Validation Accuracy: Mean = 0.8466, Std = 0.0251

Test Accuracy: 0.8737

Test Precision: 0.8795

Test Recall: 0.8737

Test F1 Score: 0.8732

In [158]:
def train_random_forest_color(features_file, output_folder):
    """
    Tune and evaluate a Random Forest on the feature table with the color features kept.

    Reads the CSV, drops only the columns listed in excluded_columns, uses "Variety" as
    the target, runs a 5-fold grid search on the inner training split, prints validation,
    cross-validation and test metrics plus a per-class classification report, and writes
    the grid-search table and the plots into output_folder.

    Parameters:
    - features_file: Path to the feature CSV. It must contain "Variety" and every
      column named in excluded_columns.
    - output_folder: Directory for the CSV and PNG outputs; created if missing.

    Raises:
    - FileNotFoundError: If features_file does not exist.
    """
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"File not found: {features_file}")
    # exist_ok replaces the separate "exists?" check and its check-then-create race.
    os.makedirs(output_folder, exist_ok=True)

    # Only the target and the non-feature columns are removed; color statistics stay in.
    excluded_columns = ["Tree", "Apple", "Variety", "Augmented"]

    # Load data
    df = pd.read_csv(features_file)
    X = df.drop(columns=excluded_columns)
    y = df["Variety"]

    # Split data: 30% test, then 30% of the remainder as validation
    # (roughly 49% train / 21% validation / 30% test), both stratified.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, stratify=y_train_full, random_state=42
    )

    # Random Forest hyperparameter grid
    param_grid = {
        "n_estimators": [50, 100, 200, 400, 600, 800],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None],
    }

    # The search is fitted on the inner training split only; X_val and X_test stay unseen.
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best model and parameters
    best_rf = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    # Validation Accuracy
    val_accuracy = accuracy_score(y_val, best_rf.predict(X_val))
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Re-score the selected configuration with 5-fold CV on the same training split
    # that the grid search used.
    cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring="accuracy")
    cross_val_mean = np.mean(cross_val_scores)
    cross_val_std = np.std(cross_val_scores)
    print(f"Cross-Validation Accuracy: Mean = {cross_val_mean:.4f}, Std = {cross_val_std:.4f}")

    # Test Metrics
    y_test_pred = best_rf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average="weighted")
    recall = recall_score(y_test, y_test_pred, average="weighted")
    f1 = f1_score(y_test, y_test_pred, average="weighted")
    classification_report_str = classification_report(y_test, y_test_pred, output_dict=False)

    # Print Test Metrics
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report_str)

    # Save results
    results = pd.DataFrame(grid_search.cv_results_)
    results_file = f"{output_folder}/random_forest_results.csv"
    results.to_csv(results_file, index=False)

    # Generate plots
    plot_feature_importance(best_rf, X.columns, output_folder)
    plot_confusion_matrix(y_test, y_test_pred, output_folder)
    # NOTE(review): plot_roc_curves_top_5 and plot_line_plot are not defined in the visible
    # part of this notebook (the helper cell only defines plot_roc and plot_roc_curves);
    # confirm they are defined in an earlier cell, otherwise a fresh run raises NameError.
    plot_roc_curves_top_5(
        model=best_rf,
        X_test=X_test,
        y_test=y_test,
        # predict_proba columns follow best_rf.classes_ (sorted labels). y.unique() is in
        # order of appearance and would pair ROC curves with the wrong probability column.
        class_names=best_rf.classes_,
        output_folder=output_folder,
        param_label=best_params,
    )
    plot_line_plot(results, output_folder, param="param_max_features")
    plot_n_estimators(results, output_folder)


In [159]:
# Run the experiment with color features; outputs go to their own folder.
train_random_forest_color(
    features_file="C:/Daten/PA2/Code/Output/apple_features_balanced_FE_Final_REVISED_1012.csv",
    output_folder="C:/Daten/PA2/Code/Output/ModelComparison_RF_1112_WithColor_n800",
)

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Validation Accuracy: 0.9481
Cross-Validation Accuracy: Mean = 0.9604, Std = 0.0106
Test Accuracy: 0.9556
Test Precision: 0.9573
Test Recall: 0.9556
Test F1 Score: 0.9552

Classification Report:
               precision    recall  f1-score   support

 14-001-1296       1.00      0.83      0.91        30
 14-001-1300       1.00      1.00      1.00        30
 14-001-1353       0.91      0.97      0.94        30
 14-001-1496       1.00      1.00      1.00        30
 14-001-1504       0.88      0.73      0.80        30
 14-001-1516       0.97      0.97      0.97        30
 14-001-1529       1.00      1.00      1.00        30
 14-001-1572       0.92      0.80      0.86        30
 14-001-1579       1.00      1.00      1.00        30
 14-001-1873       0.93      0.87      0.90        30
 14-001-1878       0.97      0.93      0.95        30
 14-001-1953       1.00      

Results with Color (recorded from a different run than the cell output above):

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Validation Accuracy: 0.9637

Cross-Validation Accuracy: Mean = 0.9383, Std = 0.0140

Test Accuracy: 0.9546

Test Precision: 0.9586

Test Recall: 0.9546

Test F1 Score: 0.9539