# Tree Species Classification using LBP Features + RBF SVM
# Complete Pipeline Implementation

In [13]:
import numpy as np
import cv2
import os
from pathlib import Path
from collections import defaultdict
import re

# Scikit-learn imports
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Scikit-image imports
from skimage.feature import local_binary_pattern

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


In [14]:

def group_images_by_tree(data_dir):
    """
    Groups image files by their base tree name.
    
    Args:
        data_dir (str): Path to directory containing species folders
        
    Returns:
        dict: Nested dictionary {species: {tree_name: [list_of_image_paths]}}
    """
    tree_groups = defaultdict(lambda: defaultdict(list))
    data_path = Path(data_dir)
    
    if not data_path.exists():
        print(f"Warning: Directory {data_dir} does not exist!")
        return {}
    
    # Iterate through each species folder
    for species_folder in data_path.iterdir():
        if not species_folder.is_dir():
            continue
            
        species_name = species_folder.name
        print(f"Processing species: {species_name}")
        
        # Get all image files in the species folder
        image_files = []
        for ext in ['*.png', '*.jpg', '*.jpeg']:
            image_files.extend(species_folder.glob(ext))
        
        # Group images by tree name
        for image_path in image_files:
            # Extract tree name from filename (everything before the first underscore followed by view info)
            filename = image_path.stem
            
            # Pattern to match: tree_XXX_viewinfo
            # We want to extract "tree_XXX" as the base name
            match = re.match(r'(tree_\d+)', filename)
            if match:
                tree_name = match.group(1)
                tree_groups[species_name][tree_name].append(str(image_path))
            else:
                print(f"Warning: Could not extract tree name from {filename}")
    
    # Print summary
    total_trees = 0
    total_images = 0
    for species, trees in tree_groups.items():
        num_trees = len(trees)
        num_images = sum(len(images) for images in trees.values())
        total_trees += num_trees
        total_images += num_images
        print(f"  {species}: {num_trees} trees, {num_images} images")
    
    print(f"Total: {total_trees} trees, {total_images} images")
    return dict(tree_groups)


In [15]:

# =============================================================================
# CELL 2: LBP Feature Extraction Function
# =============================================================================

def extract_lbp_features(image_path, P=24, R=3, method='uniform'):
    """
    Extract LBP features from a single image.
    
    Args:
        image_path (str): Path to the image file
        P (int): Number of circularly symmetric neighbor points
        R (int): Radius of circle
        method (str): Method for LBP computation
        
    Returns:
        np.ndarray: Normalized LBP histogram (feature vector)
    """
    try:
        # Load image in grayscale
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        
        if image is None:
            print(f"Error: Could not load image {image_path}")
            return None
        
        # Compute LBP
        lbp = local_binary_pattern(image, P, R, method=method)
        
        # Calculate histogram
        if method == 'uniform':
            # For uniform LBPs, we have P+2 bins (uniform patterns + non-uniform)
            n_bins = P + 2
        else:
            # For other methods, we have 2^P possible patterns
            n_bins = 2 ** P
        
        hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
        
        # Normalize histogram
        hist = hist.astype(float)
        hist /= (hist.sum() + 1e-10)  # Add small epsilon to avoid division by zero
        
        return hist
        
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None


In [16]:
# =============================================================================
# CELL 3: Function to Process All Images and Create Feature Vectors
# =============================================================================

def create_feature_vectors(tree_groups, P=24, R=3, method='uniform'):
    """
    Create feature vectors by combining LBP features from multiple views of each tree.
    
    Args:
        tree_groups (dict): Dictionary from group_images_by_tree function
        P, R, method: LBP parameters
        
    Returns:
        tuple: (X, y, tree_names) where X is feature matrix, y is labels, tree_names is list of identifiers
    """
    X = []
    y = []
    tree_names = []
    
    print("Extracting features from images...")
    
    for species_name, trees in tree_groups.items():
        print(f"\nProcessing {species_name}...")
        
        for tree_name, image_paths in tqdm(trees.items(), desc=f"{species_name}"):
            # Extract LBP features from all views of this tree
            tree_features = []
            
            for image_path in image_paths:
                lbp_hist = extract_lbp_features(image_path, P, R, method)
                
                if lbp_hist is not None:
                    tree_features.append(lbp_hist)
                else:
                    print(f"Skipping {image_path} due to extraction error")
            
            # If we successfully extracted features from at least one image
            if tree_features:
                # Average the histograms across all views
                averaged_features = np.mean(tree_features, axis=0)
                
                X.append(averaged_features)
                y.append(species_name)
                tree_names.append(f"{species_name}_{tree_name}")
            else:
                print(f"Warning: No features extracted for {species_name}_{tree_name}")
    
    return np.array(X), np.array(y), tree_names


In [17]:
# =============================================================================
# CELL 4: Main Processing - Load and Prepare Data
# =============================================================================

# Set your data paths
TRAIN_DIR = "data/multi_view_images/train"
TEST_DIR = "data/multi_view_images/test"

print("=" * 60)
print("STEP 1: GROUPING IMAGES BY TREE NAME")
print("=" * 60)

# Group training images
print("\nGrouping training images:")
train_groups = group_images_by_tree(TRAIN_DIR)

# Group test images
print("\nGrouping test images:")
test_groups = group_images_by_tree(TEST_DIR)

print("\n" + "=" * 60)
print("STEP 2: EXTRACTING LBP FEATURES")
print("=" * 60)

# Extract features for training data
print("\nExtracting training features...")
X_train, y_train, train_names = create_feature_vectors(train_groups, P=24, R=3, method='uniform')

# Extract features for test data
print("\nExtracting test features...")
X_test, y_test, test_names = create_feature_vectors(test_groups, P=24, R=3, method='uniform')

print(f"\nFeature extraction complete!")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
# print(f"Feature vector length: {X_train.shape[1]}")
print(f"Number of classes: {len(np.unique(y_train))}")
print(f"Classes: {np.unique(y_train)}")

STEP 1: GROUPING IMAGES BY TREE NAME

Grouping training images:

Grouping test images:

STEP 2: EXTRACTING LBP FEATURES

Extracting training features...
Extracting features from images...

Extracting test features...
Extracting features from images...

Feature extraction complete!
Training data shape: (0,)
Test data shape: (0,)
Number of classes: 0
Classes: []


In [18]:

# =============================================================================
# CELL 5: Data Preprocessing - Scaling and Label Encoding
# =============================================================================

print("\n" + "=" * 60)
print("STEP 3: DATA PREPROCESSING")
print("=" * 60)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"Label mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name}: {i}")

# Scale features
print(f"\nScaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Scaling complete!")
print(f"Training features - Mean: {X_train_scaled.mean():.4f}, Std: {X_train_scaled.std():.4f}")
print(f"Test features - Mean: {X_test_scaled.mean():.4f}, Std: {X_test_scaled.std():.4f}")



STEP 3: DATA PREPROCESSING
Label mapping:

Scaling features...


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# =============================================================================
# CELL 6: Hyperparameter Tuning with GridSearchCV
# =============================================================================

print("\n" + "=" * 60)
print("STEP 4: HYPERPARAMETER TUNING")
print("=" * 60)

# Define parameter grid for RBF SVM
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

print(f"Parameter grid:")
print(f"  C: {param_grid['C']}")
print(f"  gamma: {param_grid['gamma']}")

# Create SVM classifier
svm = SVC(kernel='rbf', random_state=42)

# Create stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform grid search
print(f"\nPerforming grid search with 5-fold cross-validation...")
print(f"This may take several minutes depending on data size...")

grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,  # Use all available processors
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train_scaled, y_train_encoded)

print(f"\nGrid search complete!")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


In [None]:

# =============================================================================
# CELL 7: Train Final Model and Evaluate
# =============================================================================

print("\n" + "=" * 60)
print("STEP 5: FINAL MODEL TRAINING AND EVALUATION")
print("=" * 60)

# Get the best model
best_svm = grid_search.best_estimator_

# Make predictions on test set
print("Making predictions on test set...")
y_pred = best_svm.predict(X_test_scaled)

# Calculate accuracy
test_accuracy = accuracy_score(y_test_encoded, y_pred)

print(f"\n" + "=" * 50)
print(f"FINAL RESULTS")
print(f"=" * 50)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Cross-validation Score: {grid_search.best_score_:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:

# =============================================================================
# CELL 8: Detailed Evaluation - Classification Report and Confusion Matrix
# =============================================================================

print(f"\n" + "=" * 50)
print(f"DETAILED EVALUATION")
print(f"=" * 50)

# Classification report
print(f"\nClassification Report:")
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))

# Confusion matrix
print(f"\nConfusion Matrix:")
cm = confusion_matrix(y_test_encoded, y_pred)
print(cm)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
           xticklabels=label_encoder.classes_, 
           yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix - Tree Species Classification')
plt.xlabel('Predicted Species')
plt.ylabel('True Species')
plt.tight_layout()
plt.show()


In [None]:

# =============================================================================
# CELL 9: Additional Analysis - Per-Class Performance
# =============================================================================

print(f"\n" + "=" * 50)
print(f"PER-CLASS PERFORMANCE ANALYSIS")
print(f"=" * 50)

# Calculate per-class metrics
from sklearn.metrics import precision_score, recall_score, f1_score

precision_per_class = precision_score(y_test_encoded, y_pred, average=None)
recall_per_class = recall_score(y_test_encoded, y_pred, average=None)
f1_per_class = f1_score(y_test_encoded, y_pred, average=None)

print(f"\nPer-class performance:")
print(f"{'Species':<12} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
print("-" * 60)

for i, class_name in enumerate(label_encoder.classes_):
    support = np.sum(y_test_encoded == i)
    print(f"{class_name:<12} {precision_per_class[i]:<10.4f} {recall_per_class[i]:<10.4f} "
          f"{f1_per_class[i]:<10.4f} {support:<10}")

# Overall averages
print("-" * 60)
print(f"Macro avg    {np.mean(precision_per_class):<10.4f} {np.mean(recall_per_class):<10.4f} "
      f"{np.mean(f1_per_class):<10.4f} {len(y_test_encoded):<10}")
print(f"Accuracy: {test_accuracy:.4f}")

print(f"\n" + "=" * 50)
print(f"PIPELINE EXECUTION COMPLETE!")
print(f"=" * 50)