In [25]:
# Setup data paths and configuration
import os
from pathlib import Path

# Set up paths
base_path = Path("..")  # Go up one level from notebooks folder
data_path = base_path / "data"
test_csv_path = data_path / "test.csv"

print("FPFH + RBF SVM Pipeline for Tree Species Classification")
print("=" * 60)
print(f"Base path: {base_path.absolute()}")
print(f"Data path: {data_path.absolute()}")
print(f"Test CSV path: {test_csv_path.absolute()}")

# Check if paths exist
if not data_path.exists():
    print("❌ Data path not found!")
else:
    print("✓ Data path found")

if not test_csv_path.exists():
    print("❌ Test CSV not found!")
else:
    print("✓ Test CSV found")

FPFH + RBF SVM Pipeline for Tree Species Classification
Base path: /Users/ayoub/work/prjt/notebooks/..
Data path: /Users/ayoub/work/prjt/notebooks/../data
Test CSV path: /Users/ayoub/work/prjt/notebooks/../data/test.csv
✓ Data path found
✓ Test CSV found


In [26]:
import numpy as np
import sys
from pathlib import Path
import time
from tqdm import tqdm
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Add the src directory to Python path
sys.path.append(str(Path("..").absolute()))

from src.point_cloud_processor import load_point_cloud, extract_fpfh_features

print("All libraries imported successfully!")

All libraries imported successfully!


In [27]:
def load_dataset_paths(data_path, test_csv_path):
    """Load and split dataset paths based on test.csv file"""
    try:
        test_files_df = np.genfromtxt(test_csv_path, delimiter=',', dtype=str, skip_header=1)
        test_filenames = set(test_files_df[:, 0])
    except:
        print("Warning: Could not load test.csv, using train/test folders directly")
        return load_from_folders(data_path)
    
    X_train_paths, y_train, X_test_paths, y_test = [], [], [], []

    # Check if we have train folder or species folders directly
    train_root = data_path / "train"
    if not train_root.exists():
        # Data might be directly in folders by species
        train_root = data_path

    for species_dir in train_root.iterdir():
        if not species_dir.is_dir():
            continue
        species_name = species_dir.name
        for file_path in species_dir.iterdir():
            if file_path.is_file():  # Make sure it's a file
                if file_path.name in test_filenames:
                    X_test_paths.append(file_path)
                    y_test.append(species_name)
                else:
                    X_train_paths.append(file_path)
                    y_train.append(species_name)
    
    return X_train_paths, y_train, X_test_paths, y_test

def load_from_folders(data_path):
    """Alternative method: load from existing train/test folder structure"""
    X_train_paths, y_train, X_test_paths, y_test = [], [], [], []
    
    train_root = data_path.parent / "train"  # Go back to project root
    test_root = data_path.parent / "test"
    
    if train_root.exists():
        for species_dir in train_root.iterdir():
            if species_dir.is_dir():
                species_name = species_dir.name
                for file_path in species_dir.iterdir():
                    if file_path.is_file():
                        X_train_paths.append(file_path)
                        y_train.append(species_name)
    
    if test_root.exists():
        for species_dir in test_root.iterdir():
            if species_dir.is_dir():
                species_name = species_dir.name
                for file_path in species_dir.iterdir():
                    if file_path.is_file():
                        X_test_paths.append(file_path)
                        y_test.append(species_name)
    
    return X_train_paths, y_train, X_test_paths, y_test

# Load the dataset paths
print("Loading dataset paths...")
X_train_paths, y_train, X_test_paths, y_test = load_dataset_paths(data_path, test_csv_path)

print(f"✓ Training samples: {len(X_train_paths)}")
print(f"✓ Test samples: {len(X_test_paths)}")
print(f"✓ Species found: {sorted(set(y_train))}")

if len(X_train_paths) == 0 or len(X_test_paths) == 0:
    print("❌ No data found! Please check your data structure.")

Loading dataset paths...
✓ Training samples: 557
✓ Test samples: 134
✓ Species found: ['Ash', 'Beech', 'Douglas Fir', 'Oak', 'Pine', 'Red Oak', 'Spruce']


In [28]:
VOXEL_SIZE = 0.2  

def process_files_for_features(file_paths, feature_dim=33, voxel_size=0.2):
    """Extract FPFH features from point cloud files"""
    features_list = []
    failed_files = 0
    
    # Use tqdm to create a nice progress bar
    for path in tqdm(file_paths, desc="Extracting FPFH Features"):
        try:
            pcd = load_point_cloud(path)
            if pcd and len(np.asarray(pcd.points)) > 0:
                # Extract FPFH features with provided voxel size
                features = extract_fpfh_features(pcd, voxel_size=voxel_size)
                if features is not None and len(features) == feature_dim:
                    features_list.append(features)
                else:
                    # Handle cases where features are None or wrong dimension
                    features_list.append(np.zeros(feature_dim))
                    failed_files += 1
            else:
                # Handle cases where a file fails to load
                features_list.append(np.zeros(feature_dim))
                failed_files += 1
        except Exception as e:
            print(f"Error processing {path.name}: {e}")
            features_list.append(np.zeros(feature_dim))
            failed_files += 1
    
    if failed_files > 0:
        print(f"Warning: {failed_files} files failed to process properly")
            
    return np.array(features_list)

# Process training and test data
print("Processing training data...")
X_train_features = process_files_for_features(X_train_paths, voxel_size=VOXEL_SIZE)

print("Processing testing data...")
X_test_features = process_files_for_features(X_test_paths, voxel_size=VOXEL_SIZE)

print("Feature extraction complete.")
print(f"Shape of training features: {X_train_features.shape}")
print(f"Shape of test features: {X_test_features.shape}")

# Check for any issues
if X_train_features.shape[0] == 0:
    print("❌ No training features extracted!")
if X_test_features.shape[0] == 0:
    print("❌ No test features extracted!")

Processing training data...


Extracting FPFH Features:   0%|          | 0/557 [00:00<?, ?it/s]



Extracting FPFH Features:   0%|          | 1/557 [00:00<01:41,  5.50it/s]



Extracting FPFH Features:   0%|          | 2/557 [00:00<01:32,  6.00it/s]



Extracting FPFH Features:   1%|          | 3/557 [00:00<01:29,  6.19it/s]



Extracting FPFH Features:   1%|          | 4/557 [00:00<01:20,  6.91it/s]



Extracting FPFH Features:   1%|          | 5/557 [00:00<01:17,  7.09it/s]



Extracting FPFH Features:   1%|▏         | 8/557 [00:01<01:51,  4.92it/s]



Extracting FPFH Features:   2%|▏         | 9/557 [00:01<02:05,  4.35it/s]



Extracting FPFH Features:   2%|▏         | 11/557 [00:02<02:01,  4.49it/s]



Extracting FPFH Features:   2%|▏         | 12/557 [00:02<01:44,  5.23it/s]



Extracting FPFH Features:   2%|▏         | 13/557 [00:02<01:46,  5.12it/s]



Extracting FPFH Features:   3%|▎         | 15/557 [00:02<01:33,  5.77it/s]



Extracting FPFH Features:   3%|▎         | 16/557 [00:02<01:27,  6.20it/s]



Extracting FPFH Features:   3%|▎         | 17/557 [00:03<02:04,  4.33it/s]



Extracting FPFH Features:   5%|▍         | 26/557 [00:04<00:52, 10.04it/s]



Extracting FPFH Features:   5%|▌         | 28/557 [00:04<01:04,  8.24it/s]



Extracting FPFH Features:   6%|▌         | 33/557 [00:05<00:57,  9.15it/s]



Extracting FPFH Features:   7%|▋         | 40/557 [00:05<00:36, 14.22it/s]



Extracting FPFH Features:   8%|▊         | 43/557 [00:06<00:50, 10.10it/s]



Extracting FPFH Features:   8%|▊         | 45/557 [00:06<00:51,  9.86it/s]



Extracting FPFH Features:   9%|▉         | 51/557 [00:06<00:41, 12.14it/s]



Extracting FPFH Features:  10%|█         | 56/557 [00:07<00:35, 14.06it/s]



Extracting FPFH Features:  11%|█         | 62/557 [00:07<00:27, 18.18it/s]



Extracting FPFH Features:  12%|█▏        | 65/557 [00:07<00:27, 18.10it/s]



Extracting FPFH Features:  13%|█▎        | 72/557 [00:08<00:35, 13.61it/s]



Extracting FPFH Features:  14%|█▎        | 76/557 [00:08<00:28, 16.65it/s]



Extracting FPFH Features:  14%|█▍        | 79/557 [00:08<00:27, 17.16it/s]



Extracting FPFH Features:  15%|█▌        | 86/557 [00:09<00:36, 12.99it/s]



Extracting FPFH Features:  17%|█▋        | 92/557 [00:10<00:52,  8.91it/s]



Extracting FPFH Features:  17%|█▋        | 97/557 [00:10<00:49,  9.23it/s]



Extracting FPFH Features:  18%|█▊        | 100/557 [00:10<00:41, 11.10it/s]



Extracting FPFH Features:  18%|█▊        | 102/557 [00:10<00:38, 11.93it/s]



Extracting FPFH Features:  19%|█▉        | 107/557 [00:11<00:45,  9.89it/s]



Extracting FPFH Features:  20%|██        | 114/557 [00:12<00:35, 12.54it/s]



Extracting FPFH Features:  21%|██        | 117/557 [00:12<00:30, 14.39it/s]



Extracting FPFH Features:  21%|██▏       | 119/557 [00:12<00:39, 11.03it/s]



Extracting FPFH Features:  22%|██▏       | 124/557 [00:12<00:31, 13.75it/s]



Extracting FPFH Features:  23%|██▎       | 127/557 [00:12<00:26, 16.43it/s]



Extracting FPFH Features:  24%|██▎       | 131/557 [00:13<00:40, 10.48it/s]



Extracting FPFH Features:  24%|██▍       | 135/557 [00:13<00:35, 11.79it/s]



Extracting FPFH Features:  26%|██▌       | 143/557 [00:14<00:25, 15.98it/s]



Extracting FPFH Features:  26%|██▌       | 146/557 [00:14<00:23, 17.40it/s]



Extracting FPFH Features:  27%|██▋       | 149/557 [00:14<00:27, 14.74it/s]



Extracting FPFH Features:  27%|██▋       | 152/557 [00:14<00:28, 14.20it/s]



Extracting FPFH Features:  28%|██▊       | 155/557 [00:15<00:28, 14.05it/s]



Extracting FPFH Features:  29%|██▉       | 161/557 [00:15<00:32, 12.03it/s]



Extracting FPFH Features:  30%|███       | 169/557 [00:15<00:21, 18.36it/s]



Extracting FPFH Features:  31%|███       | 173/557 [00:16<00:17, 22.03it/s]



Extracting FPFH Features:  32%|███▏      | 180/557 [00:16<00:17, 22.09it/s]



Extracting FPFH Features:  33%|███▎      | 183/557 [00:16<00:19, 19.44it/s]



Extracting FPFH Features:  33%|███▎      | 186/557 [00:16<00:18, 19.55it/s]



Extracting FPFH Features:  35%|███▍      | 194/557 [00:17<00:14, 24.55it/s]



Extracting FPFH Features:  36%|███▌      | 200/557 [00:17<00:14, 24.24it/s]



Extracting FPFH Features:  37%|███▋      | 207/557 [00:17<00:13, 25.66it/s]



Extracting FPFH Features:  38%|███▊      | 210/557 [00:17<00:13, 26.19it/s]



Extracting FPFH Features:  39%|███▉      | 217/557 [00:17<00:13, 24.86it/s]



Extracting FPFH Features:  41%|████      | 226/557 [00:18<00:11, 28.08it/s]



Extracting FPFH Features:  41%|████      | 229/557 [00:18<00:13, 25.01it/s]



Extracting FPFH Features:  42%|████▏     | 236/557 [00:18<00:11, 28.16it/s]



Extracting FPFH Features:  44%|████▍     | 244/557 [00:18<00:09, 31.40it/s]



Extracting FPFH Features:  45%|████▌     | 251/557 [00:19<00:15, 20.01it/s]



Extracting FPFH Features:  46%|████▌     | 257/557 [00:19<00:13, 22.52it/s]



Extracting FPFH Features:  47%|████▋     | 260/557 [00:19<00:13, 21.45it/s]



Extracting FPFH Features:  48%|████▊     | 267/557 [00:20<00:12, 23.38it/s]



Extracting FPFH Features:  49%|████▉     | 273/557 [00:20<00:12, 22.66it/s]



Extracting FPFH Features:  50%|█████     | 279/557 [00:20<00:11, 24.56it/s]



Extracting FPFH Features:  51%|█████     | 282/557 [00:20<00:11, 24.40it/s]



Extracting FPFH Features:  52%|█████▏    | 288/557 [00:21<00:13, 20.03it/s]



Extracting FPFH Features:  52%|█████▏    | 291/557 [00:21<00:15, 17.51it/s]



Extracting FPFH Features:  53%|█████▎    | 294/557 [00:21<00:19, 13.67it/s]



Extracting FPFH Features:  54%|█████▍    | 303/557 [00:22<00:21, 11.95it/s]



Extracting FPFH Features:  55%|█████▍    | 305/557 [00:22<00:22, 11.30it/s]



Extracting FPFH Features:  56%|█████▌    | 311/557 [00:23<00:19, 12.44it/s]



Extracting FPFH Features:  57%|█████▋    | 319/557 [00:23<00:10, 22.22it/s]



Extracting FPFH Features:  58%|█████▊    | 325/557 [00:23<00:11, 19.82it/s]



Extracting FPFH Features:  62%|██████▏   | 344/557 [00:23<00:05, 37.36it/s]



Extracting FPFH Features:  63%|██████▎   | 349/557 [00:24<00:08, 23.51it/s]



Extracting FPFH Features:  63%|██████▎   | 353/557 [00:24<00:10, 18.68it/s]



Extracting FPFH Features:  64%|██████▍   | 356/557 [00:24<00:12, 16.53it/s]



Extracting FPFH Features:  64%|██████▍   | 359/557 [00:25<00:13, 14.58it/s]



Extracting FPFH Features:  65%|██████▍   | 361/557 [00:25<00:14, 13.27it/s]



Extracting FPFH Features:  66%|██████▌   | 365/557 [00:25<00:15, 12.79it/s]



Extracting FPFH Features:  66%|██████▌   | 367/557 [00:25<00:14, 12.75it/s]



Extracting FPFH Features:  67%|██████▋   | 371/557 [00:26<00:15, 11.99it/s]



Extracting FPFH Features:  67%|██████▋   | 373/557 [00:26<00:15, 11.50it/s]



Extracting FPFH Features:  68%|██████▊   | 377/557 [00:26<00:13, 13.14it/s]



Extracting FPFH Features:  68%|██████▊   | 379/557 [00:26<00:13, 13.05it/s]



Extracting FPFH Features:  69%|██████▉   | 383/557 [00:27<00:13, 12.49it/s]



Extracting FPFH Features:  69%|██████▉   | 387/557 [00:27<00:12, 13.85it/s]



Extracting FPFH Features:  70%|██████▉   | 389/557 [00:27<00:11, 14.18it/s]



Extracting FPFH Features:  71%|███████   | 393/557 [00:27<00:11, 14.84it/s]



Extracting FPFH Features:  71%|███████   | 395/557 [00:28<00:12, 12.99it/s]



Extracting FPFH Features:  72%|███████▏  | 399/557 [00:28<00:12, 12.18it/s]



Extracting FPFH Features:  72%|███████▏  | 403/557 [00:28<00:11, 13.60it/s]



Extracting FPFH Features:  73%|███████▎  | 405/557 [00:28<00:10, 14.67it/s]



Extracting FPFH Features:  73%|███████▎  | 409/557 [00:29<00:10, 13.99it/s]



Extracting FPFH Features:  74%|███████▍  | 413/557 [00:29<00:10, 13.36it/s]



Extracting FPFH Features:  75%|███████▍  | 415/557 [00:29<00:10, 13.66it/s]



Extracting FPFH Features:  75%|███████▌  | 420/557 [00:29<00:09, 14.67it/s]



Extracting FPFH Features:  76%|███████▌  | 422/557 [00:30<00:09, 14.70it/s]



Extracting FPFH Features:  77%|███████▋  | 427/557 [00:30<00:08, 15.86it/s]



Extracting FPFH Features:  78%|███████▊  | 435/557 [00:30<00:05, 22.29it/s]



Extracting FPFH Features:  81%|████████  | 449/557 [00:31<00:06, 17.65it/s]



Extracting FPFH Features:  81%|████████▏ | 453/557 [00:31<00:04, 21.88it/s]



Extracting FPFH Features:  83%|████████▎ | 463/557 [00:32<00:04, 19.57it/s]



Extracting FPFH Features:  84%|████████▍ | 469/557 [00:32<00:05, 16.35it/s]



Extracting FPFH Features:  85%|████████▍ | 471/557 [00:32<00:06, 14.17it/s]



Extracting FPFH Features:  87%|████████▋ | 483/557 [00:33<00:03, 20.34it/s]



Extracting FPFH Features:  88%|████████▊ | 490/557 [00:34<00:06, 11.13it/s]



Extracting FPFH Features:  89%|████████▉ | 498/557 [00:34<00:04, 13.65it/s]



Extracting FPFH Features:  91%|█████████▏| 509/557 [00:35<00:02, 17.48it/s]



Extracting FPFH Features:  94%|█████████▎| 521/557 [00:36<00:01, 21.17it/s]



Extracting FPFH Features:  99%|█████████▊| 549/557 [00:38<00:00, 13.38it/s]



Extracting FPFH Features: 100%|██████████| 557/557 [00:38<00:00, 14.43it/s]


Processing testing data...


Extracting FPFH Features:   0%|          | 0/134 [00:00<?, ?it/s]



Extracting FPFH Features:   1%|          | 1/134 [00:00<00:46,  2.85it/s]



Extracting FPFH Features:   2%|▏         | 3/134 [00:00<00:30,  4.26it/s]



Extracting FPFH Features:   6%|▌         | 8/134 [00:01<00:10, 11.87it/s]



Extracting FPFH Features:   9%|▉         | 12/134 [00:01<00:07, 16.42it/s]



Extracting FPFH Features:  11%|█         | 15/134 [00:01<00:10, 11.08it/s]



Extracting FPFH Features:  17%|█▋        | 23/134 [00:02<00:12,  9.11it/s]



Extracting FPFH Features:  19%|█▊        | 25/134 [00:02<00:12,  8.54it/s]



Extracting FPFH Features:  23%|██▎       | 31/134 [00:03<00:08, 11.91it/s]



Extracting FPFH Features:  29%|██▉       | 39/134 [00:03<00:05, 16.21it/s]



Extracting FPFH Features:  34%|███▍      | 46/134 [00:04<00:04, 18.04it/s]



Extracting FPFH Features:  39%|███▉      | 52/134 [00:04<00:04, 19.24it/s]



Extracting FPFH Features:  41%|████      | 55/134 [00:04<00:04, 16.71it/s]



Extracting FPFH Features:  44%|████▍     | 59/134 [00:04<00:03, 18.89it/s]



Extracting FPFH Features:  47%|████▋     | 63/134 [00:04<00:03, 22.04it/s]



Extracting FPFH Features:  49%|████▉     | 66/134 [00:05<00:04, 16.63it/s]



Extracting FPFH Features:  54%|█████▎    | 72/134 [00:05<00:03, 18.88it/s]



Extracting FPFH Features:  60%|█████▉    | 80/134 [00:05<00:02, 23.74it/s]



Extracting FPFH Features:  63%|██████▎   | 84/134 [00:05<00:01, 25.14it/s]



Extracting FPFH Features:  65%|██████▍   | 87/134 [00:06<00:02, 21.67it/s]



Extracting FPFH Features:  69%|██████▉   | 93/134 [00:06<00:02, 17.99it/s]



Extracting FPFH Features:  71%|███████   | 95/134 [00:06<00:02, 16.87it/s]



Extracting FPFH Features:  74%|███████▍  | 99/134 [00:07<00:02, 14.22it/s]



Extracting FPFH Features:  75%|███████▌  | 101/134 [00:07<00:02, 13.69it/s]



Extracting FPFH Features:  79%|███████▉  | 106/134 [00:07<00:01, 14.29it/s]



Extracting FPFH Features:  85%|████████▌ | 114/134 [00:08<00:01, 12.61it/s]



Extracting FPFH Features:  87%|████████▋ | 116/134 [00:08<00:01, 11.55it/s]



Extracting FPFH Features:  90%|█████████ | 121/134 [00:08<00:01, 12.15it/s]



Extracting FPFH Features: 100%|██████████| 134/134 [00:09<00:00, 13.93it/s]

Feature extraction complete.
Shape of training features: (557, 33)
Shape of test features: (134, 33)





In [29]:
print("Evaluating the optimized model...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

svm_classifier = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', random_state=42)
best_svm = svm_classifier
#X_test_scaled = scaler.transform(X_test_features)

y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

Evaluating the optimized model...


NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

Evaluating the optimized model...

          FPFH + RBF SVM Classification Results (OPTIMIZED)
Optimized Model Accuracy: 0.7910 (79.10%)
Default Model Accuracy:   0.6866 (68.66%)
Improvement: +10.45 percentage points

Number of test samples: 134
Number of species: 7
Species: Ash, Beech, Douglas Fir, Oak, Pine, Red Oak, Spruce

Detailed Classification Report (Optimized Model):
----------------------------------------------------------------------
              precision    recall  f1-score   support

         Ash       0.25      0.29      0.27         7
       Beech       0.87      0.84      0.86        32
 Douglas Fir       0.82      0.86      0.84        36
         Oak       0.67      0.50      0.57         4
        Pine       0.67      0.80      0.73         5
     Red Oak       0.68      0.79      0.73        19
      Spruce       0.96      0.81      0.88        31

    accuracy                           0.79       134
   macro avg       0.70      0.70      0.70       134
weighted

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

Evaluating the optimized model...

          FPFH + RBF SVM Classification Results (OPTIMIZED)
Optimized Model Accuracy: 0.7910 (79.10%)
Default Model Accuracy:   0.6866 (68.66%)
Improvement: +10.45 percentage points

Number of test samples: 134
Number of species: 7
Species: Ash, Beech, Douglas Fir, Oak, Pine, Red Oak, Spruce

Detailed Classification Report (Optimized Model):
----------------------------------------------------------------------
              precision    recall  f1-score   support

         Ash       0.25      0.29      0.27         7
       Beech       0.87      0.84      0.86        32
 Douglas Fir       0.82      0.86      0.84        36
         Oak       0.67      0.50      0.57         4
        Pine       0.67      0.80      0.73         5
     Red Oak       0.68      0.79      0.73        19
      Spruce       0.96      0.81      0.88        31

    accuracy                           0.79       134
   macro avg       0.70      0.70      0.70       134
weighted

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

Evaluating the optimized model...

          FPFH + RBF SVM Classification Results (OPTIMIZED)
Optimized Model Accuracy: 0.7910 (79.10%)
Default Model Accuracy:   0.6866 (68.66%)
Improvement: +10.45 percentage points

Number of test samples: 134
Number of species: 7
Species: Ash, Beech, Douglas Fir, Oak, Pine, Red Oak, Spruce

Detailed Classification Report (Optimized Model):
----------------------------------------------------------------------
              precision    recall  f1-score   support

         Ash       0.25      0.29      0.27         7
       Beech       0.87      0.84      0.86        32
 Douglas Fir       0.82      0.86      0.84        36
         Oak       0.67      0.50      0.57         4
        Pine       0.67      0.80      0.73         5
     Red Oak       0.68      0.79      0.73        19
      Spruce       0.96      0.81      0.88        31

    accuracy                           0.79       134
   macro avg       0.70      0.70      0.70       134
weighted

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

Evaluating the optimized model...

          FPFH + RBF SVM Classification Results (OPTIMIZED)
Optimized Model Accuracy: 0.7910 (79.10%)
Default Model Accuracy:   0.6866 (68.66%)
Improvement: +10.45 percentage points

Number of test samples: 134
Number of species: 7
Species: Ash, Beech, Douglas Fir, Oak, Pine, Red Oak, Spruce

Detailed Classification Report (Optimized Model):
----------------------------------------------------------------------
              precision    recall  f1-score   support

         Ash       0.25      0.29      0.27         7
       Beech       0.87      0.84      0.86        32
 Douglas Fir       0.82      0.86      0.84        36
         Oak       0.67      0.50      0.57         4
        Pine       0.67      0.80      0.73         5
     Red Oak       0.68      0.79      0.73        19
      Spruce       0.96      0.81      0.88        31

    accuracy                           0.79       134
   macro avg       0.70      0.70      0.70       134
weighted

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

In [None]:
print("Evaluating the model...")

# Pick model: use best_svm if available and fitted; else fall back to baseline svm_classifier
from sklearn.utils.validation import check_is_fitted
try:
    check_is_fitted(best_svm)
    model_for_eval = best_svm
    used_label = "Optimized"
except Exception:
    model_for_eval = svm_classifier
    used_label = "Baseline"

# Predictions
y_pred = model_for_eval.predict(X_test_scaled)
# Always compute baseline predictions for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Classes
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print(f"          FPFH + RBF SVM Classification Results ({used_label})")
print("="*70)

# Accuracies
accuracy_used = accuracy_score(y_test, y_pred)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"{used_label} Model Accuracy: {accuracy_used:.4f} ({accuracy_used*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_used - accuracy_default)*100:+.2f} percentage points")

# Report
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report ({used_label} Model):")
print("-" * 70)
report_used = classification_report(y_test, y_pred, target_names=all_classes, zero_division=0)
print(report_used)

# Confusion matrix
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Used Model):")
print("-" * 40)
cm_used = confusion_matrix(y_test, y_pred, labels=all_classes)
cm_df_used = pd.DataFrame(cm_used, index=all_classes, columns=all_classes)
print(cm_df_used)

# Per-class performance
print(f"\nPer-class Performance ({used_label} Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_used = [y_pred[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        species_accuracy_used = sum(1 for pred in species_predictions_used if pred == species) / len(species_predictions_used)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_used - species_accuracy_def
        print(f"{species:<15}: {species_accuracy_used:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

# Model parameters (print what we used)
print(f"\nBest/Baseline Model Parameters Used:")
print(f"C = {model_for_eval.C}")
print(f"gamma = {model_for_eval.gamma}")
print(f"kernel = {model_for_eval.kernel}")

print(f"\n{'='*70}")
print("Evaluation completed successfully!")

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")

In [None]:
# Crucial step for SVMs: feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features) # Use the same scaler

# Create and train the RBF SVM
print("Training the RBF SVM classifier...")

svm_classifier = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', random_state=42)
svm_classifier.fit(X_train_scaled, y_train)
print("Training complete.")

# Ensure best_svm exists even if GridSearchCV hasn't been run yet
best_svm = svm_classifier

In [None]:
# Voxel-size sweep with optional SMOTE balancing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

try:
    from imblearn.over_sampling import SMOTE
    HAS_SMOTE = True
except Exception:
    HAS_SMOTE = False

voxel_sizes = [0.1, 0.15, 0.2, 0.3]
use_smote = True  # toggle

results = []
for vs in voxel_sizes:
    print(f"\n=== Voxel size: {vs} ===")
    X_tr = process_files_for_features(X_train_paths, voxel_size=vs)
    X_te = process_files_for_features(X_test_paths, voxel_size=vs)

    # Optional SMOTE balancing
    if use_smote and HAS_SMOTE:
        smote = SMOTE(random_state=42)
        X_tr_bal, y_tr_bal = smote.fit_resample(X_tr, y_train)
    else:
        X_tr_bal, y_tr_bal = X_tr, y_train

    # Scale
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr_bal)
    X_te_scaled = scaler.transform(X_te)

    # Grid search focused (smaller for speed)
    param_grid = {
        'C': [1, 3, 10, 30],
        'gamma': ['scale', 0.03, 0.1, 0.3],
        'class_weight': [None, 'balanced'],
        'kernel': ['rbf']
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        SVC(random_state=42),
        param_grid,
        cv=cv,
        verbose=0,
        n_jobs=-1,
        scoring='balanced_accuracy'
    )
    grid_search.fit(X_tr_scaled, y_tr_bal)

    best_svm_vs = grid_search.best_estimator_
    y_pred = best_svm_vs.predict(X_te_scaled)
    bal_acc = balanced_accuracy_score(y_test, y_pred)

    results.append({
        'voxel_size': vs,
        'best_params': grid_search.best_params_,
        'cv_bal_acc': grid_search.best_score_,
        'test_bal_acc': bal_acc
    })
    print(f"Best params @ vs={vs}: {grid_search.best_params_}")
    print(f"CV balanced acc: {grid_search.best_score_:.4f}")
    print(f"Test balanced acc: {bal_acc:.4f}")

# Display summary
import pandas as pd
res_df = pd.DataFrame(results)
print("\nVoxel sweep results:")
print(res_df.sort_values('test_bal_acc', ascending=False))


In [None]:
# Hyperparameter Optimization with Grid Search
print("Performing hyperparameter optimization with Grid Search...")
print("=" * 60)

from sklearn.model_selection import StratifiedKFold

# Define parameter grid (tune class_weight as well)
param_grid = {
    'C': [0.1, 1, 3, 10, 30, 100],
    'gamma': ['scale', 'auto', 0.03, 0.1, 0.3, 1],
    'class_weight': [None, 'balanced'],
    'kernel': ['rbf']
}

# Use stratified CV and balanced accuracy to reward minority-class performance
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    SVC(random_state=42),
    param_grid,
    cv=cv,
    verbose=2,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

# Fit the grid search
start_time = time.time()
print(f"Testing {len(param_grid['C']) * len(param_grid['gamma']) * len(param_grid['class_weight'])} parameter combinations...")
grid_search.fit(X_train_scaled, y_train)
end_time = time.time()

# Results
print(f"\nGrid Search completed in {end_time - start_time:.2f} seconds")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation balanced accuracy: {grid_search.best_score_:.4f}")

# Get the best estimator
best_svm = grid_search.best_estimator_
print(f"Best SVM model: {best_svm}")

# Show top 5 parameter combinations
print(f"\nTop 5 parameter combinations (by balanced accuracy):")
print("-" * 50)
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]
for idx, row in top_5.iterrows():
    params = row['params']
    mean_score = row['mean_test_score']
    std_score = row['std_test_score']
    print(f"C={params['C']}, gamma={params['gamma']}, class_weight={params.get('class_weight')}: {mean_score:.4f} (±{std_score:.4f})")

print(f"\n{'='*60}")
print("Hyperparameter optimization completed!")

In [None]:
print("Evaluating the optimized model...")
y_pred_optimized = best_svm.predict(X_test_scaled)

# Also evaluate the default model for comparison
y_pred_default = svm_classifier.predict(X_test_scaled)

# Get the unique class names from both train and test sets
train_classes = set(y_train)
test_classes = set(y_test)
all_classes = sorted(list(train_classes | test_classes))

print("\n" + "="*70)
print("          FPFH + RBF SVM Classification Results (OPTIMIZED)")
print("="*70)

# Compare accuracies
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Optimized Model Accuracy: {accuracy_optimized:.4f} ({accuracy_optimized*100:.2f}%)")
print(f"Default Model Accuracy:   {accuracy_default:.4f} ({accuracy_default*100:.2f}%)")
print(f"Improvement: {(accuracy_optimized - accuracy_default)*100:+.2f} percentage points")

# Detailed classification report for optimized model
print(f"\nNumber of test samples: {len(y_test)}")
print(f"Number of species: {len(all_classes)}")
print(f"Species: {', '.join(all_classes)}")

print(f"\nDetailed Classification Report (Optimized Model):")
print("-" * 70)
report_optimized = classification_report(y_test, y_pred_optimized, target_names=all_classes, zero_division=0)
print(report_optimized)

# Additional metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

print("\nConfusion Matrix (Optimized Model):")
print("-" * 40)
cm_optimized = confusion_matrix(y_test, y_pred_optimized, labels=all_classes)
cm_df_optimized = pd.DataFrame(cm_optimized, index=all_classes, columns=all_classes)
print(cm_df_optimized)

# Per-class accuracy breakdown for optimized model
print(f"\nPer-class Performance (Optimized Model):")
print("-" * 50)
for i, species in enumerate(all_classes):
    if species in set(y_test):
        # Get indices for this species in test set
        species_indices = [j for j, label in enumerate(y_test) if label == species]
        species_predictions_opt = [y_pred_optimized[j] for j in species_indices]
        species_predictions_def = [y_pred_default[j] for j in species_indices]
        
        species_accuracy_opt = sum(1 for pred in species_predictions_opt if pred == species) / len(species_predictions_opt)
        species_accuracy_def = sum(1 for pred in species_predictions_def if pred == species) / len(species_predictions_def)
        improvement = species_accuracy_opt - species_accuracy_def
        
        print(f"{species:<15}: {species_accuracy_opt:.4f} (vs {species_accuracy_def:.4f}, {improvement:+.4f}) | {len(species_indices):3d} samples")

print(f"\nBest Model Parameters:")
print(f"C = {best_svm.C}")
print(f"gamma = {best_svm.gamma}")
print(f"kernel = {best_svm.kernel}")

print(f"\n{'='*70}")
print("Optimized pipeline completed successfully!")