In [161]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    auc
)
from sklearn.preprocessing import label_binarize


In [162]:
def train_svm(features_file, output_folder, drop_columns=None):
    """Tune an SVM with grid search, evaluate it, and save results and plots.

    By default the color statistics columns are excluded from the feature
    matrix, so this is the "no color" experiment.

    The data is split 70/30 into train+validation / test, and the
    train+validation part is split 70/30 again. That gives 49% train,
    21% validation and 30% test of the full dataset. Both splits are
    stratified on the target.

    Args:
        features_file: Path to the CSV file with the feature table. It must
            contain a "Variety" column, which is used as the target.
        output_folder: Directory for the CSV with the grid-search results and
            the PNG plots. Created if it does not exist.
        drop_columns: Optional iterable of column names to remove from the
            feature matrix. ``None`` (default) drops the bookkeeping columns
            and all color statistics columns. "Variety" is always removed from
            the features, even if it is not listed.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
        KeyError: If a column listed in ``drop_columns`` or the "Variety"
            column is missing from the CSV.
    """
    # Ensure the input file and output directory exist
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"File not found: {features_file}")
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_folder, exist_ok=True)

    if drop_columns is None:
        drop_columns = [
            "Tree", "Apple", "Variety", "Augmented",
            "Mean_Color_0", "Mean_Color_1", "Mean_Color_2",
            "Std_Color_0", "Std_Color_1", "Std_Color_2",
            "Color_Range_0", "Color_Range_1", "Color_Range_2",
        ]

    # Load dataset
    df = pd.read_csv(features_file)
    y = df["Variety"]
    # Drop the requested columns, then make sure the target can never leak into X
    X = df.drop(columns=list(drop_columns)).drop(columns=["Variety"], errors="ignore")

    # Split data into train, validation, and test sets
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, stratify=y_train_full, random_state=42
    )  # Validation set is 30% of the train+validation data (21% of all rows)

    # SVM hyperparameter grid
    param_grid = {
        "C": [0.01, 0.1, 1, 10, 20, 30, 40, 50, 100, 200, 400, 600, 800, 1000],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
    }

    # Perform GridSearchCV with Cross-Validation on Training Data
    # probability=True is required later for predict_proba in the ROC plot
    grid_search = GridSearchCV(
        SVC(random_state=42, probability=True),
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Evaluate on Validation Set
    best_svm = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    y_val_pred = best_svm.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")

    # Cross-Validation Score
    # The grid search already ran the same 5-fold CV on (X_train, y_train) for
    # the winning candidate, so read its mean/std instead of refitting 5 models.
    best_index = grid_search.best_index_
    cross_val_mean = grid_search.cv_results_["mean_test_score"][best_index]
    cross_val_std = grid_search.cv_results_["std_test_score"][best_index]
    print(f"Cross-Validation Accuracy: Mean = {cross_val_mean}, Std = {cross_val_std}")

    # Evaluate on Test Set
    y_test_pred = best_svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average="weighted")
    recall = recall_score(y_test, y_test_pred, average="weighted")
    f1 = f1_score(y_test, y_test_pred, average="weighted")
    classification_report_str = classification_report(y_test, y_test_pred, output_dict=False)

    print(f"Test Accuracy: {accuracy:.3f}")
    print(f"Test Precision: {precision:.3f}")
    print(f"Test Recall: {recall:.3f}")
    print(f"Test F1 Score: {f1:.3f}")
    print("\nClassification Report:\n", classification_report_str)

    # Save results
    # Named after the model actually trained here; the previous name
    # "random_forest_results.csv" mislabeled the SVM grid-search results.
    results = pd.DataFrame(grid_search.cv_results_)
    results_file = os.path.join(output_folder, "svm_results.csv")
    results.to_csv(results_file, index=False)

    # Generate plots
    plot_confusion_matrix(y_test, y_test_pred, output_folder)
    plot_svm_metrics(accuracy, precision, recall, f1, output_folder)
    plot_svm_hyperparameter_results(grid_search.cv_results_, output_folder)
    # Use the fitted model's class order: predict_proba columns follow
    # best_svm.classes_, whereas y.unique() is in order of first appearance.
    plot_roc_curves_combined(best_svm, X_test, y_test, best_svm.classes_, output_folder)

    print(f"All SVM metrics and visualizations saved to {output_folder}.")





In [163]:
def plot_confusion_matrix(y_test, y_test_pred, output_folder):
    """Save a heatmap of the confusion matrix as ``svm_confusion_matrix.png``.

    Args:
        y_test: True labels.
        y_test_pred: Predicted labels, same length as ``y_test``.
        output_folder: Existing directory the PNG is written to.
    """
    # Build the label set from BOTH arrays and pass it explicitly, so the tick
    # labels always match the matrix rows/columns. Using only np.unique(y_test)
    # mislabels the axes if a class is predicted but absent from y_test.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_test_pred)]))
    cm = confusion_matrix(y_test, y_test_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(18, 14))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title("SVM Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(f"{output_folder}/svm_confusion_matrix.png", dpi=300)
    # Close explicitly so repeated calls do not accumulate open figures
    plt.close(fig)


def plot_svm_metrics(accuracy, precision, recall, f1, output_folder):
    """Save a bar chart of the four summary scores as ``svm_metrics.png``.

    Args:
        accuracy: Accuracy score to plot.
        precision: Precision score to plot.
        recall: Recall score to plot.
        f1: F1 score to plot.
        output_folder: Existing directory the PNG is written to.
    """
    bar_labels = ["Accuracy", "Precision", "Recall", "F1 Score"]
    bar_heights = [accuracy, precision, recall, f1]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(bar_labels, bar_heights, color="skyblue")
    ax.set_title("SVM Metrics")
    # The y-axis is fixed to [0, 1]
    ax.set_ylim(0, 1)
    ax.set_ylabel("Score")
    fig.savefig(f"{output_folder}/svm_metrics.png")
    plt.close(fig)


def plot_svm_hyperparameter_results(cv_results, output_folder):
    """Plot mean CV accuracy against C and save it as a PNG.

    One line is drawn per (kernel, gamma) combination. Grouping by kernel
    alone would put every gamma setting on the same line, giving several
    points per C value and a zig-zag curve.

    Args:
        cv_results: ``cv_results_`` mapping of a fitted ``GridSearchCV``. It
            must contain "param_kernel", "param_C" and "mean_test_score";
            "param_gamma" is used when present.
        output_folder: Existing directory the PNG
            ``svm_accuracy_vs_hyperparameters.png`` is written to.
    """
    results = pd.DataFrame(cv_results)
    # param_* columns hold generic Python objects; the log-scaled x-axis needs numbers
    results = results.assign(C_value=results["param_C"].astype(float))

    group_columns = ["param_kernel"]
    if "param_gamma" in results.columns:
        group_columns.append("param_gamma")

    fig, ax = plt.subplots(figsize=(10, 6))
    # dropna=False keeps candidates for which a grouped parameter was not set
    for group_key, group in results.groupby(group_columns, sort=False, dropna=False):
        # Depending on the pandas version a single-column key may be a scalar
        if not isinstance(group_key, tuple):
            group_key = (group_key,)
        kernel = group_key[0]
        label = str(kernel)
        if len(group_key) > 1:
            label = f"{kernel} (gamma={group_key[1]})"
        # Sort by C so each line runs left to right
        group = group.sort_values("C_value")
        ax.plot(group["C_value"], group["mean_test_score"], label=label, marker="o", linestyle="-")

    ax.set_xscale("log")
    ax.set_xlabel("C")
    ax.set_ylabel("Mean Test Accuracy")
    ax.set_title("SVM Accuracy vs. Hyperparameters")
    ax.legend(title="Kernel")
    ax.grid(True)
    fig.savefig(f"{output_folder}/svm_accuracy_vs_hyperparameters.png")
    plt.close(fig)


def plot_roc_curves_combined(model, X_test, y_test, class_names, output_folder):
    """Plot one-vs-rest ROC curves for the best and worst classes by AUC.

    The five classes with the highest AUC are drawn as solid lines and the
    five with the lowest AUC as dashed lines. With fewer than ten classes the
    two groups do not overlap; each class is drawn at most once.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels for ``X_test``.
        class_names: Labels to evaluate. Their order does not matter; each is
            matched to its probability column through ``model.classes_``.
        output_folder: Existing directory the PNG
            ``svm_top_and_worst_roc_curves.png`` is written to.
    """
    y_true = np.asarray(y_test)
    y_pred_proba = model.predict_proba(X_test)

    # predict_proba columns follow model.classes_, which need not match the
    # order of class_names (e.g. Series.unique() keeps order of appearance).
    # Look each class up explicitly so scores and labels cannot be misaligned.
    proba_column = {label: idx for idx, label in enumerate(model.classes_)}

    roc_data = []
    for class_name in class_names:
        if class_name not in proba_column:
            continue  # the model has no probability column for this label
        is_positive = y_true == class_name
        if is_positive.all() or not is_positive.any():
            continue  # ROC is undefined without both positives and negatives
        fpr, tpr, _ = roc_curve(is_positive.astype(int), y_pred_proba[:, proba_column[class_name]])
        roc_auc = auc(fpr, tpr)
        roc_data.append((class_name, fpr, tpr, roc_auc))

    top_n = 5
    roc_data_sorted = sorted(roc_data, key=lambda item: item[3], reverse=True)
    best_roc_data = roc_data_sorted[:top_n]
    # Start the "worst" slice after the "best" one so no class is drawn twice
    worst_roc_data = roc_data_sorted[max(top_n, len(roc_data_sorted) - top_n):]

    fig, ax = plt.subplots(figsize=(12, 8))
    for class_name, fpr, tpr, roc_auc in best_roc_data:
        ax.plot(fpr, tpr, lw=2, label=f"Top 5: {class_name} (AUC = {roc_auc:.2f})")
    for class_name, fpr, tpr, roc_auc in worst_roc_data:
        ax.plot(fpr, tpr, lw=2, linestyle="--", label=f"Worst 5: {class_name} (AUC = {roc_auc:.2f})")

    ax.plot([0, 1], [0, 1], "k--", lw=2, label="Random Classifier")
    ax.set_title("Top 5 and Worst 5 ROC Curves")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc="best", fontsize="small")
    fig.tight_layout()
    fig.savefig(f"{output_folder}/svm_top_and_worst_roc_curves.png")
    plt.close(fig)

In [164]:
# Run the SVM experiment on the feature table with the color columns excluded.
# NOTE(review): both paths are absolute and machine-specific; adjust before re-running elsewhere.
train_svm(
    features_file="C:/Daten/PA2/Code/Output/apple_features_balanced_FE_Final_REVISED_1012.csv",
    output_folder="C:/Daten/PA2/Code/Output/ModelComparison_1112_NoColor",
)

Best Parameters: {'C': 200, 'gamma': 'scale', 'kernel': 'rbf'}
Validation Accuracy: 0.9321789321789322
Cross-Validation Accuracy: Mean = 0.8726044413866912, Std = 0.010947237302930683
Test Accuracy: 0.926
Test Precision: 0.929
Test Recall: 0.926
Test F1 Score: 0.925

Classification Report:
               precision    recall  f1-score   support

 14-001-1296       0.93      0.90      0.92        30
 14-001-1300       0.94      1.00      0.97        30
 14-001-1353       0.94      0.97      0.95        30
 14-001-1496       0.93      0.90      0.92        30
 14-001-1504       0.91      1.00      0.95        30
 14-001-1516       0.94      1.00      0.97        30
 14-001-1529       1.00      1.00      1.00        30
 14-001-1572       0.88      0.77      0.82        30
 14-001-1579       0.88      0.97      0.92        30
 14-001-1873       0.90      0.87      0.88        30
 14-001-1878       1.00      0.87      0.93        30
 14-001-1953       1.00      1.00      1.00        30
 14-0

Results WITHOUT Color:

Best Parameters: {'C': 200, 'gamma': 'scale', 'kernel': 'rbf'}

Validation Accuracy: 0.9321789321789322

Cross-Validation Accuracy: Mean = 0.8726044413866912, Std = 0.010947237302930683

Test Accuracy: 0.926

Test Precision: 0.929

Test Recall: 0.926

Test F1 Score: 0.925

In [165]:
def train_svm_color(features_file, output_folder, drop_columns=None):
    """Tune an SVM with grid search on all features, including color.

    Same procedure as ``train_svm``, but by default only the bookkeeping
    columns are dropped, so the color statistics stay in the feature matrix.

    The data is split 70/30 into train+validation / test, and the
    train+validation part is split 70/30 again. That gives 49% train,
    21% validation and 30% test of the full dataset. Both splits are
    stratified on the target.

    Args:
        features_file: Path to the CSV file with the feature table. It must
            contain a "Variety" column, which is used as the target.
        output_folder: Directory for the CSV with the grid-search results and
            the PNG plots. Created if it does not exist.
        drop_columns: Optional iterable of column names to remove from the
            feature matrix. ``None`` (default) drops "Tree", "Apple",
            "Variety" and "Augmented". "Variety" is always removed from the
            features, even if it is not listed.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
        KeyError: If a column listed in ``drop_columns`` or the "Variety"
            column is missing from the CSV.
    """
    # Ensure the input file and output directory exist
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"File not found: {features_file}")
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_folder, exist_ok=True)

    if drop_columns is None:
        drop_columns = ["Tree", "Apple", "Variety", "Augmented"]

    # Load dataset
    df = pd.read_csv(features_file)
    y = df["Variety"]
    # Drop the requested columns, then make sure the target can never leak into X
    X = df.drop(columns=list(drop_columns)).drop(columns=["Variety"], errors="ignore")

    # Split data into train, validation, and test sets
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, stratify=y_train_full, random_state=42
    )  # Validation set is 30% of the train+validation data (21% of all rows)

    # SVM hyperparameter grid
    param_grid = {
        "C": [0.01, 0.1, 1, 10, 20, 30, 40, 50, 100, 200, 400, 600, 800, 1000],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
    }

    # Perform GridSearchCV with Cross-Validation on Training Data
    # probability=True is required later for predict_proba in the ROC plot
    grid_search = GridSearchCV(
        SVC(random_state=42, probability=True),
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Evaluate on Validation Set
    best_svm = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    y_val_pred = best_svm.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")

    # Cross-Validation Score
    # The grid search already ran the same 5-fold CV on (X_train, y_train) for
    # the winning candidate, so read its mean/std instead of refitting 5 models.
    best_index = grid_search.best_index_
    cross_val_mean = grid_search.cv_results_["mean_test_score"][best_index]
    cross_val_std = grid_search.cv_results_["std_test_score"][best_index]
    print(f"Cross-Validation Accuracy: Mean = {cross_val_mean}, Std = {cross_val_std}")

    # Evaluate on Test Set
    y_test_pred = best_svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average="weighted")
    recall = recall_score(y_test, y_test_pred, average="weighted")
    f1 = f1_score(y_test, y_test_pred, average="weighted")
    classification_report_str = classification_report(y_test, y_test_pred, output_dict=False)

    print(f"Test Accuracy: {accuracy:.3f}")
    print(f"Test Precision: {precision:.3f}")
    print(f"Test Recall: {recall:.3f}")
    print(f"Test F1 Score: {f1:.3f}")
    print("\nClassification Report:\n", classification_report_str)

    # Save results
    # Named after the model actually trained here; the previous name
    # "random_forest_results.csv" mislabeled the SVM grid-search results.
    results = pd.DataFrame(grid_search.cv_results_)
    results_file = os.path.join(output_folder, "svm_results.csv")
    results.to_csv(results_file, index=False)

    # Generate plots
    plot_confusion_matrix(y_test, y_test_pred, output_folder)
    plot_svm_metrics(accuracy, precision, recall, f1, output_folder)
    plot_svm_hyperparameter_results(grid_search.cv_results_, output_folder)
    # Use the fitted model's class order: predict_proba columns follow
    # best_svm.classes_, whereas y.unique() is in order of first appearance.
    plot_roc_curves_combined(best_svm, X_test, y_test, best_svm.classes_, output_folder)

    print(f"All SVM metrics and visualizations saved to {output_folder}.")





In [166]:
# Run the SVM experiment on the same feature table with the color columns kept.
# NOTE(review): both paths are absolute and machine-specific; adjust before re-running elsewhere.
train_svm_color(
    features_file="C:/Daten/PA2/Code/Output/apple_features_balanced_FE_Final_REVISED_1012.csv",
    output_folder="C:/Daten/PA2/Code/Output/ModelComparison_1112_WithColor",
)

Best Parameters: {'C': 20, 'gamma': 'scale', 'kernel': 'rbf'}
Validation Accuracy: 0.976911976911977
Cross-Validation Accuracy: Mean = 0.9653690326032948, Std = 0.012287226178542632
Test Accuracy: 0.983
Test Precision: 0.984
Test Recall: 0.983
Test F1 Score: 0.983

Classification Report:
               precision    recall  f1-score   support

 14-001-1296       1.00      0.90      0.95        30
 14-001-1300       1.00      1.00      1.00        30
 14-001-1353       0.97      1.00      0.98        30
 14-001-1496       1.00      0.93      0.97        30
 14-001-1504       0.94      1.00      0.97        30
 14-001-1516       1.00      1.00      1.00        30
 14-001-1529       1.00      1.00      1.00        30
 14-001-1572       0.97      0.93      0.95        30
 14-001-1579       0.94      1.00      0.97        30
 14-001-1873       0.85      0.93      0.89        30
 14-001-1878       1.00      0.87      0.93        30
 14-001-1953       1.00      1.00      1.00        30
 14-001

Results WITH Color:

Best Parameters: {'C': 20, 'gamma': 'scale', 'kernel': 'rbf'}

Validation Accuracy: 0.976911976911977

Cross-Validation Accuracy: Mean = 0.9653690326032948, Std = 0.012287226178542632

Test Accuracy: 0.983

Test Precision: 0.984

Test Recall: 0.983

Test F1 Score: 0.983