In [8]:
import os
import sys
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE

sys.path.append(str(Path("..").absolute()))
from src.point_cloud_processor import load_point_cloud, extract_fpfh_features

base_path = Path("..")
data_path = base_path / "data"
print("Base:", base_path.absolute())
print("Data:", data_path.absolute())


Base: /Users/ayoub/work/prjt/notebooks/..
Data: /Users/ayoub/work/prjt/notebooks/../data


In [9]:
# Dataset loading helpers (same logic as baseline)

def load_from_folders(data_path: Path):
	X_train_paths, y_train, X_test_paths, y_test = [], [], [], []
	train_root = data_path.parent / "train"
	test_root = data_path.parent / "test"
	if train_root.exists():
		for species_dir in train_root.iterdir():
			if species_dir.is_dir():
				species = species_dir.name
				for f in species_dir.iterdir():
					if f.is_file():
						X_train_paths.append(f)
						y_train.append(species)
	if test_root.exists():
		for species_dir in test_root.iterdir():
			if species_dir.is_dir():
				species = species_dir.name
				for f in species_dir.iterdir():
					if f.is_file():
						X_test_paths.append(f)
						y_test.append(species)
	return X_train_paths, y_train, X_test_paths, y_test

X_train_paths, y_train, X_test_paths, y_test = load_from_folders(data_path)
print(f"Train: {len(X_train_paths)} | Test: {len(X_test_paths)} | Classes: {sorted(set(y_train))}")


Train: 557 | Test: 134 | Classes: ['Ash', 'Beech', 'Douglas Fir', 'Oak', 'Pine', 'Red Oak', 'Spruce']


In [10]:
# Feature extraction with voxel_size as a parameter

def extract_features(file_paths, voxel_size=0.2, feature_dim=33):
	features = []
	fails = 0
	for p in tqdm(file_paths, desc=f"FPFH v={voxel_size}"):
		try:
			pcd = load_point_cloud(p)
			if pcd and len(np.asarray(pcd.points)) > 0:
				f = extract_fpfh_features(pcd, voxel_size=voxel_size)
				if f is not None and len(f) == feature_dim:
					features.append(f)
				else:
					features.append(np.zeros(feature_dim))
					fails += 1
			else:
				features.append(np.zeros(feature_dim))
				fails += 1
		except Exception as e:
			features.append(np.zeros(feature_dim))
			fails += 1
	return np.array(features), fails

# voxel_grid = [0.10, 0.15, 0.20, 0.30]
voxel_grid = [0.10]
print("Voxel candidates:", voxel_grid)



Voxel candidates: [0.1]


In [11]:
# Search over voxel_size: extract features, then SMOTE + scale, then SVM grid search
best_summary = None
summaries = []

for vox in voxel_grid:
	print("\n" + "="*60)
	print(f"Voxel size = {vox}")
	print("="*60)
	
	Xtr, fail_tr = extract_features(X_train_paths, voxel_size=vox)
	Xte, fail_te = extract_features(X_test_paths, voxel_size=vox)
	print(f"Fails: train={fail_tr}, test={fail_te}")
	
	# SMOTE to balance classes (on raw features)
	smote = SMOTE(random_state=42)
	Xtr_bal, ytr_bal = smote.fit_resample(Xtr, y_train)
	print(f"After SMOTE: {Xtr.shape} -> {Xtr_bal.shape}")
	
	# Scale after balancing
	scaler = StandardScaler()
	Xtr_scaled = scaler.fit_transform(Xtr_bal)
	Xte_scaled = scaler.transform(Xte)
	
	# Grid search emphasizing minority recall
	param_grid = {
		'C': [0.1, 1, 3, 10, 30, 100],
		'gamma': ['scale', 'auto', 0.03, 0.1, 0.3, 1],
		'class_weight': [None, 'balanced'],
		'kernel': ['rbf']
	}
	cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
	grid = GridSearchCV(SVC(random_state=42), param_grid, cv=cv, n_jobs=-1, verbose=0, scoring='balanced_accuracy')
	start = time.time()
	grid.fit(Xtr_scaled, ytr_bal)
	dur = time.time() - start
	print(f"Grid done in {dur:.1f}s | best BA={grid.best_score_:.4f} | params={grid.best_params_}")
	
	best_svm = grid.best_estimator_
	y_pred = best_svm.predict(Xte_scaled)
	acc = accuracy_score(y_test, y_pred)
	report = classification_report(y_test, y_pred, zero_division=0)
	cm = confusion_matrix(y_test, y_pred, labels=sorted(set(y_train)))
	
	summary = {
		'voxel_size': vox,
		'fails_train': fail_tr,
		'fails_test': fail_te,
		'best_params': grid.best_params_,
		'balanced_accuracy_cv': grid.best_score_,
		'test_accuracy': acc,
		'classification_report': report,
		'confusion_matrix': cm
	}
	summaries.append(summary)
	
	# Track best by balanced accuracy CV, tiebreak by test acc
	if best_summary is None:
		best_summary = summary
	else:
		if summary['balanced_accuracy_cv'] > best_summary['balanced_accuracy_cv'] or \
		  (summary['balanced_accuracy_cv'] == best_summary['balanced_accuracy_cv'] and summary['test_accuracy'] > best_summary['test_accuracy']):
			best_summary = summary

print("\nBest setting:")
print(best_summary['voxel_size'], best_summary['best_params'])
print(f"CV balanced acc: {best_summary['balanced_accuracy_cv']:.4f}")
print(f"Test accuracy: {best_summary['test_accuracy']:.4f}")



Voxel size = 0.1


FPFH v=0.1: 100%|██████████| 557/557 [01:04<00:00,  8.68it/s]
FPFH v=0.1: 100%|██████████| 134/134 [00:16<00:00,  8.06it/s]


Fails: train=0, test=0
After SMOTE: (557, 33) -> (1029, 33)
Grid done in 2.5s | best BA=0.9543 | params={'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}

Best setting:
0.1 {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}
CV balanced acc: 0.9543
Test accuracy: 0.8657


In [12]:
# Pretty-print final report for the best configuration
print("\n" + "="*80)
print("Best FPFH + SVM (with SMOTE) Configuration")
print("="*80)
print(f"Voxel size: {best_summary['voxel_size']}")
print(f"Best params: {best_summary['best_params']}")
print(f"CV balanced accuracy: {best_summary['balanced_accuracy_cv']:.4f}")
print(f"Test accuracy: {best_summary['test_accuracy']:.4f}")
print("\nClassification report (test):\n")
print(best_summary['classification_report'])

classes = sorted(set(y_train))
cm_df = pd.DataFrame(best_summary['confusion_matrix'], index=classes, columns=classes)
print("Confusion Matrix (test):\n")
print(cm_df)
print("\nNote: We used SMOTE to balance classes and searched voxel_size values to stabilize FPFH.")



Best FPFH + SVM (with SMOTE) Configuration
Voxel size: 0.1
Best params: {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}
CV balanced accuracy: 0.9543
Test accuracy: 0.8657

Classification report (test):

              precision    recall  f1-score   support

         Ash       0.56      0.71      0.62         7
       Beech       0.93      0.88      0.90        32
 Douglas Fir       0.97      0.83      0.90        36
         Oak       0.75      0.75      0.75         4
        Pine       0.50      0.80      0.62         5
     Red Oak       0.89      0.89      0.89        19
      Spruce       0.88      0.94      0.91        31

    accuracy                           0.87       134
   macro avg       0.78      0.83      0.80       134
weighted avg       0.88      0.87      0.87       134

Confusion Matrix (test):

             Ash  Beech  Douglas Fir  Oak  Pine  Red Oak  Spruce
Ash            5      2            0    0     0        0       0
Beech          3     28 

In [15]:
print(accuracy_score(y_test, best_svm.predict(Xte_scaled)))

0.8656716417910447


In [16]:
print(best_svm)

SVC(C=10, random_state=42)
