In [2]:
# Read in images
import glob
import numpy as np
import matplotlib.pyplot as plt
import cv2
from buck.analysis.basics import ingest_images

# Glob pattern selecting every "*_NDA.png" file in the squared-images folder.
# NOTE(review): the separators are Windows-style backslashes, so this pattern
# presumably only matches files when the notebook runs on Windows - confirm.
fpath = "..\\images\\squared\\*_NDA.png"

def read_images_convert_to_rgb(file_pattern):
    """Load every image matching a glob pattern as an RGB pixel array.

    Args:
        file_pattern: Pattern passed to ``glob.glob`` to select image files.

    Returns:
        A tuple ``(images, loaded_paths)`` of equal length in which
        ``images[i]`` is the RGB array read from ``loaded_paths[i]``.
        Files that cannot be read are reported and omitted from both lists,
        so the two lists always stay aligned.
    """
    # Sort so the image order (and every seeded step downstream) does not
    # depend on the order in which the file system lists the directory.
    file_paths = sorted(glob.glob(file_pattern))
    print(f"Found {len(file_paths)} image files")

    images = []
    loaded_paths = []

    for file_path in file_paths:
        try:
            # Load image in color (cv2 returns None when the file is unreadable)
            img = cv2.imread(file_path, cv2.IMREAD_COLOR)

            if img is None:
                print(f"Warning: Could not load {file_path}")
                continue

            # OpenCV delivers BGR channel order; convert to RGB
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Report images whose three channels are identical (grayscale content)
            if np.array_equal(img_rgb[:, :, 0], img_rgb[:, :, 1]) and np.array_equal(img_rgb[:, :, 1], img_rgb[:, :, 2]):
                print(f"Converted grayscale to RGB: {file_path}")

            # Record image and path together so a skipped file can never
            # shift the pairing between pixels and file names.
            images.append(img_rgb)
            loaded_paths.append(file_path)

        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

    print(f"Loaded {len(images)} images, all converted to RGB")

    return images, loaded_paths

# Load every image matched by `fpath`; `images` holds the RGB arrays and
# `paths` the file paths returned alongside them by the loader.
images, paths = read_images_convert_to_rgb(fpath)

Found 225 image files
Converted grayscale to RGB: ..\images\squared\241205_240927_TX_3p5_NDA.png
Converted grayscale to RGB: ..\images\squared\250501_241108_PA_8p5_NDA.png
Loaded 225 images, all converted to RGB


In [3]:
# Extract / combine dates
# X_train, X_val, X_test, y_train, y_val, y_test, mapping = split_images_with_ages(images, paths)

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras

def extract_age_from_path(file_path):
    """Parse the age encoded in an image file name.

    The age is read from the fourth underscore-separated field of the file
    name, with ``p`` standing in for the decimal point (``3p5`` -> ``3.5``).

    Args:
        file_path: Path to the image; both ``\\`` and ``/`` separators are
            accepted.

    Returns:
        The age as a float when it lies in ``[0.5, 15.5]``; ``None`` when the
        field is missing, is a placeholder containing ``x``, is shorter than
        two characters, is not numeric, or falls outside that range.
    """
    try:
        # Normalise separators first: splitting on "\\" alone made every
        # POSIX-style path fail and be silently treated as "no age".
        file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
        age_part = file_name.split(".")[0].split("_")[3]

        # Skip invalid age parts like "xpx"
        if 'x' in age_part or len(age_part) < 2:
            return None

        age_float = float(age_part.replace("p", "."))
    except (IndexError, ValueError):
        return None

    # Validate reasonable deer age
    if 0.5 <= age_float <= 15.5:
        return age_float
    return None

def split_images_with_ages(images, file_paths, target_size=(224, 224), test_size=0.2, val_size=0.2, random_state=42, mature_age=5.5):
    """Resize images, label them by age class and split into train/val/test.

    Args:
        images: Sequence of image arrays.
        file_paths: Paths paired one-to-one with ``images``; the age label is
            parsed from each path with ``extract_age_from_path``.
        target_size: Size handed to ``cv2.resize`` so all images share a shape.
        test_size: Fraction of all usable images placed in the test set.
        val_size: Fraction of all usable images placed in the validation set.
        random_state: Seed forwarded to both ``train_test_split`` calls.
        mature_age: Ages at or above this value are merged into one class
            labelled ``mature_age``.

    Returns:
        ``(X_train, X_val, X_test, y_train_onehot, y_val_onehot,
        y_test_onehot, label_mapping)`` where ``label_mapping`` maps each
        grouped age to its class index.

    Raises:
        ValueError: If no image has a usable age.
    """
    import cv2  # imported once here instead of once per image inside the loop
    from collections import Counter

    print("Extracting ages from filenames...")

    # Extract ages and resize images
    ages = []
    resized_images = []

    for image, file_path in zip(images, file_paths):
        try:
            age = extract_age_from_path(file_path)

            if age is None:
                print(f"Skipping invalid age in: {file_path}")
                continue

            # Resize image to target size for uniform array
            resized_img = cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
        except Exception as e:
            # Still best-effort, but say which image was dropped and why
            # instead of discarding it silently.
            print(f"Skipping {file_path}: {e}")
            continue

        ages.append(age)
        resized_images.append(resized_img)

    print(f"Successfully processed {len(ages)} images")

    if not ages:
        raise ValueError("No images with a valid age were found; nothing to split.")

    # Group ages: everything at or above mature_age collapses into one class
    print(f"Applying age grouping: ages {mature_age}+ -> {mature_age}")
    original_ages = list(ages)
    ages_grouped = [mature_age if age >= mature_age else age for age in ages]

    # Show original vs grouped distribution
    print("Original age distribution:")
    for age, count in sorted(Counter(original_ages).items()):
        print(f"  Age {age}: {count} images")

    print("Grouped age distribution:")
    grouped_counts = Counter(ages_grouped)
    unique_ages = sorted(grouped_counts)
    for age in unique_ages:
        print(f"  Age {age}: {grouped_counts[age]} images")

    # Use grouped ages for the rest of the process
    ages = ages_grouped

    # Convert to numpy arrays (now all images have same size)
    X = np.array(resized_images)
    y_raw = np.array(ages)

    print(f"Image array shape: {X.shape}")
    print(f"Ages array shape: {y_raw.shape}")

    # Create label mapping (age -> class index)
    label_mapping = {age: idx for idx, age in enumerate(unique_ages)}
    reverse_mapping = {idx: age for age, idx in label_mapping.items()}

    print(f"Label mapping: {label_mapping}")

    # Convert ages to class indices
    y_indices = np.array([label_mapping[age] for age in ages])

    # Check class counts for stratification
    unique_classes, class_counts = np.unique(y_indices, return_counts=True)
    min_class_count = np.min(class_counts)

    print(f"Minimum class count: {min_class_count}")

    # Stratification needs at least 2 samples in every class
    use_stratify = min_class_count >= 2
    if use_stratify:
        print("Using stratified splitting to maintain class balance.")
    else:
        print("Warning: Some classes have only 1 sample. Cannot use stratified splitting.")
        print("Using random splitting instead.")

    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y_indices,
        test_size=test_size,
        random_state=random_state,
        stratify=y_indices if use_stratify else None
    )

    # Second split: separate train and validation. val_size is a fraction of
    # the full data set, so rescale it to the part left after the test split.
    val_size_adjusted = val_size / (1 - test_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=y_temp if use_stratify else None
    )

    # Convert to one-hot encoding
    num_classes = len(unique_ages)
    y_train_onehot = keras.utils.to_categorical(y_train, num_classes)
    y_val_onehot = keras.utils.to_categorical(y_val, num_classes)
    y_test_onehot = keras.utils.to_categorical(y_test, num_classes)

    print(f"\nData split completed:")
    print(f"  Training set: {X_train.shape[0]} images")
    print(f"  Validation set: {X_val.shape[0]} images")
    print(f"  Test set: {X_test.shape[0]} images")
    print(f"  Number of classes: {num_classes}")

    # Show class distribution in each set
    print(f"\nClass distribution:")
    for split_name, y_split in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
        print(f"  {split_name}:")
        for class_idx in range(num_classes):
            count = np.sum(y_split == class_idx)
            age = reverse_mapping[class_idx]
            print(f"    Age {age}: {count} images")

    return X_train, X_val, X_test, y_train_onehot, y_val_onehot, y_test_onehot, label_mapping

# Build the train / validation / test splits (one-hot labels) together with
# the age -> class-index mapping returned by split_images_with_ages.
X_train, X_val, X_test, y_train, y_val, y_test, mapping = split_images_with_ages(images, paths)

Extracting ages from filenames...
Skipping invalid age in: ..\images\squared\250522_241221_IN_xpx_NDA.png
Successfully processed 224 images
Applying age grouping: ages 5.5+ -> 5.5
Original age distribution:
  Age 1.5: 31 images
  Age 2.5: 39 images
  Age 3.5: 47 images
  Age 4.5: 54 images
  Age 5.5: 41 images
  Age 6.5: 6 images
  Age 8.5: 5 images
  Age 12.5: 1 images
Grouped age distribution:
  Age 1.5: 31 images
  Age 2.5: 39 images
  Age 3.5: 47 images
  Age 4.5: 54 images
  Age 5.5: 53 images
Image array shape: (224, 224, 224, 3)
Ages array shape: (224,)
Label mapping: {1.5: 0, 2.5: 1, 3.5: 2, 4.5: 3, 5.5: 4}
Minimum class count: 31
Using stratified splitting to maintain class balance.

Data split completed:
  Training set: 134 images
  Validation set: 45 images
  Test set: 45 images
  Number of classes: 5

Class distribution:
  Train:
    Age 1.5: 19 images
    Age 2.5: 23 images
    Age 3.5: 29 images
    Age 4.5: 32 images
    Age 5.5: 31 images
  Val:
    Age 1.5: 6 images
  

In [5]:
# Homogenize data

from buck.analysis.basics import homogenize_data

#augment_multiplier = 40
#X_train_pca, y_train_flat, X_test_pca, y_true, label_mapping, num_classes = homogenize_data(Xtr_og, ytr_og, Xte,yte_onehot, l_map, augment_multiplier)
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import gc

def augment_and_balance_data(X_train, y_train, target_samples_per_class=None, augmentation_factor=5, random_state=None):
    """Bring every class to the same size by augmenting or downsampling.

    Args:
        X_train: Image array indexed by sample on axis 0. Pixel values are
            expected on the 0-255 scale; augmented images are produced as
            uint8.
        y_train: One-hot label matrix of shape (n_samples, n_classes).
        target_samples_per_class: Desired number of images per class. When
            ``None`` it defaults to the largest class size multiplied by
            ``augmentation_factor``.
        augmentation_factor: Multiplier used only for that default.
        random_state: Optional seed. When given, NumPy's global random state
            is seeded so sampling and shuffling repeat between runs.
            NOTE(review): the Keras generator appears to draw its transforms
            from the same global state - confirm for the installed version.

    Returns:
        ``(X_train_balanced, y_train_balanced)``: the shuffled balanced
        images and their one-hot labels.

    Raises:
        ValueError: If ``y_train`` contains no samples.
    """
    print("Starting data augmentation and class balancing...")
    print(f"Input data shape: {X_train.shape}")
    print(f"Input labels shape: {y_train.shape}")

    if random_state is not None:
        np.random.seed(random_state)

    # Convert one-hot to class indices
    y_train_indices = np.argmax(y_train, axis=1)
    num_classes = y_train.shape[1]

    if len(y_train_indices) == 0:
        raise ValueError("Cannot balance an empty training set.")

    # Count samples per class
    unique_classes, class_counts = np.unique(y_train_indices, return_counts=True)

    print("Current class distribution:")
    for class_idx, count in zip(unique_classes, class_counts):
        print(f"  Class {class_idx}: {count} samples")

    # Determine target samples per class
    max_count = np.max(class_counts)
    if target_samples_per_class is None:
        target_samples_per_class = max_count * augmentation_factor

    print(f"Target samples per class: {target_samples_per_class}")

    # Create augmentation generator
    datagen = ImageDataGenerator(
        rotation_range=15,           # Rotate images up to 15 degrees
        width_shift_range=0.1,       # Shift horizontally up to 10%
        height_shift_range=0.1,      # Shift vertically up to 10%
        shear_range=0.1,            # Shear transformation
        zoom_range=0.1,             # Zoom in/out up to 10%
        horizontal_flip=True,       # Random horizontal flips
        brightness_range=[0.8, 1.2], # Brightness variation
        fill_mode='nearest'         # Fill strategy for new pixels
    )

    # Store augmented data
    X_balanced_list = []
    y_balanced_list = []

    # Process each class
    for class_idx in range(num_classes):
        # Get all samples for this class
        X_class = X_train[y_train_indices == class_idx]
        current_count = len(X_class)

        if current_count == 0:
            print(f"  Warning: No samples for class {class_idx}")
            continue

        # Originals come first; augmented batches are appended after them
        class_parts = [X_class]

        # Calculate how many more samples we need
        samples_needed = target_samples_per_class - current_count

        if samples_needed > 0:
            # Keep pixels on the 0-255 scale. The generator's brightness
            # shift goes through an 8-bit PIL image, so feeding it 0-1 floats
            # (and multiplying by 255 afterwards) corrupts the output.
            X_class_float = X_class.astype('float32')

            augmented_count = 0
            batch_size = min(32, current_count)  # Process in batches

            while augmented_count < samples_needed:
                # How many samples to generate in this batch
                batch_samples_needed = min(batch_size, samples_needed - augmented_count)

                # Randomly select source images for this batch
                source_indices = np.random.choice(current_count, size=batch_samples_needed, replace=True)
                X_batch = X_class_float[source_indices]

                # Generate augmented images
                aug_iter = datagen.flow(X_batch, batch_size=batch_samples_needed, shuffle=False)
                X_aug_batch = next(aug_iter)

                # Back to uint8, rounding and clipping instead of wrapping
                X_aug_batch = np.clip(np.rint(X_aug_batch), 0, 255).astype(np.uint8)

                class_parts.append(X_aug_batch)
                augmented_count += len(X_aug_batch)

                # Progress update
                if augmented_count % 100 == 0 or augmented_count >= samples_needed:
                    print(f"    Generated {augmented_count}/{samples_needed} samples")

        elif samples_needed < 0:
            # Randomly downsample if we have too many
            print(f"  Downsampling from {current_count} to {target_samples_per_class}")
            indices = np.random.choice(current_count, size=target_samples_per_class, replace=False)
            class_parts = [X_class[indices]]

        # Merge into one array and add to balanced dataset
        X_class_final = np.concatenate(class_parts, axis=0)
        y_class_final = np.full(len(X_class_final), class_idx)

        X_balanced_list.append(X_class_final)
        y_balanced_list.append(y_class_final)

        print(f"  Final samples for class {class_idx}: {len(X_class_final)}")

        # Clean up memory
        del class_parts, X_class_final
        gc.collect()

    # Combine all classes
    print("\nCombining all classes...")
    X_train_balanced = np.concatenate(X_balanced_list, axis=0)
    y_train_indices_balanced = np.concatenate(y_balanced_list, axis=0)

    # Convert back to one-hot encoding
    from tensorflow import keras
    y_train_balanced = keras.utils.to_categorical(y_train_indices_balanced, num_classes)

    # Shuffle the combined dataset
    print("Shuffling combined dataset...")
    shuffle_indices = np.random.permutation(len(X_train_balanced))
    X_train_balanced = X_train_balanced[shuffle_indices]
    y_train_balanced = y_train_balanced[shuffle_indices]

    print(f"\nFinal balanced dataset:")
    print(f"  Shape: {X_train_balanced.shape}")
    print(f"  Labels shape: {y_train_balanced.shape}")

    # Verify class balance
    final_indices = np.argmax(y_train_balanced, axis=1)
    final_unique, final_counts = np.unique(final_indices, return_counts=True)

    print("Final class distribution:")
    for class_idx, count in zip(final_unique, final_counts):
        print(f"  Class {class_idx}: {count} samples")

    # Clean up memory
    del X_balanced_list, y_balanced_list
    gc.collect()

    return X_train_balanced, y_train_balanced

def create_simple_augmentation(image):
    """
    Randomly perturb one image and return the result.

    Four transforms are tried in a fixed order, each applied only when a
    fresh random draw exceeds 0.5: rotation about the center by up to 10
    degrees either way, a horizontal flip, a brightness gain between 0.8 and
    1.2, and a zoom between 0.9x and 1.1x (center crop when enlarging, black
    padding when shrinking).
    """

    # Rotation about the image center
    if np.random.random() > 0.5:
        theta = np.random.uniform(-10, 10)
        rows, cols = image.shape[0], image.shape[1]
        rotation = cv2.getRotationMatrix2D((cols // 2, rows // 2), theta, 1.0)
        image = cv2.warpAffine(image, rotation, (cols, rows))

    # Mirror left to right
    if np.random.random() > 0.5:
        image = cv2.flip(image, 1)

    # Brightness gain, clipped back into the 8-bit range
    if np.random.random() > 0.5:
        gain = np.random.uniform(0.8, 1.2)
        image = np.clip(image * gain, 0, 255).astype(np.uint8)

    # Zoom while keeping the original frame size
    if np.random.random() > 0.5:
        factor = np.random.uniform(0.9, 1.1)
        rows, cols = image.shape[:2]
        zoom_rows, zoom_cols = int(rows * factor), int(cols * factor)
        zoomed = cv2.resize(image, (zoom_cols, zoom_rows))

        if factor > 1:
            # Enlarged: keep the central window of the original size
            top = (zoom_rows - rows) // 2
            left = (zoom_cols - cols) // 2
            image = zoomed[top:top + rows, left:left + cols]
        else:
            # Shrunk: paste into the middle of an all-zero frame
            padded = np.zeros_like(image)
            top = (rows - zoom_rows) // 2
            left = (cols - zoom_cols) // 2
            padded[top:top + zoom_rows, left:left + zoom_cols] = zoomed
            image = padded

    return image

# Balance the training split to 500 images per class, then print a reminder
# of the call signature for later reuse.
X_train_balanced, y_train_balanced = augment_and_balance_data(X_train, y_train, target_samples_per_class=500)
print("Data augmentation and balancing functions loaded!")
print("Usage:")
print("X_train_balanced, y_train_balanced = augment_and_balance_data(X_train, y_train, target_samples_per_class=500)")

Starting data augmentation and class balancing...
Input data shape: (134, 224, 224, 3)
Input labels shape: (134, 5)
Current class distribution:
  Class 0: 19 samples
  Class 1: 23 samples
  Class 2: 29 samples
  Class 3: 32 samples
  Class 4: 31 samples
Target samples per class: 500
    Generated 481/481 samples
  Final samples for class 0: 500
    Generated 477/477 samples
  Final samples for class 1: 500
    Generated 471/471 samples
  Final samples for class 2: 500
    Generated 468/468 samples
  Final samples for class 3: 500
    Generated 469/469 samples
  Final samples for class 4: 500

Combining all classes...
Shuffling combined dataset...

Final balanced dataset:
  Shape: (2500, 224, 224, 3)
  Labels shape: (2500, 5)
Final class distribution:
  Class 0: 500 samples
  Class 1: 500 samples
  Class 2: 500 samples
  Class 3: 500 samples
  Class 4: 500 samples
Data augmentation and balancing functions loaded!
Usage:
X_train_balanced, y_train_balanced = augment_and_balance_data(X_tra

In [7]:
import numpy as np

def prepare_images_for_bagging(X_train_balanced, y_train_balanced, X_test, y_test, 
                              convert_to_grayscale=False, normalize=True):
    """
    Flatten image arrays and decode one-hot labels for a scikit-learn style classifier.

    Args:
        X_train_balanced: Training images (N, H, W, 3)
        y_train_balanced: Training labels (one-hot encoded)
        X_test: Test images (N, H, W, 3)
        y_test: Test labels (one-hot encoded)
        convert_to_grayscale: Whether to convert RGB to grayscale first
        normalize: Whether to divide pixel values by 255 (result is float32)

    Returns:
        X_train_flat: Flattened training features (N, features)
        y_train_flat: Integer class labels
        X_test_flat: Flattened test features (N, features)
        y_test_flat: Integer class labels

    The returned feature arrays never share memory with the inputs.
    """

    print("=== PREPARING IMAGES FOR BAGGING CLASSIFIER ===")
    print(f"Input shapes: Train={X_train_balanced.shape}, Test={X_test.shape}")

    # Convert one-hot labels to integers
    y_train_flat = np.argmax(y_train_balanced, axis=1)
    y_test_flat = np.argmax(y_test, axis=1)

    print(f"Label shapes: Train={y_train_flat.shape}, Test={y_test_flat.shape}")
    print(f"Classes in training: {np.unique(y_train_flat)}")
    print(f"Classes in test: {np.unique(y_test_flat)}")

    # No up-front copy: the grayscale and normalisation steps below each
    # allocate a fresh array, so copying first only doubled peak memory.
    X_train_processed = X_train_balanced
    X_test_processed = X_test

    # Convert to grayscale if requested (reduces features by 3x)
    if convert_to_grayscale:
        print("Converting RGB to grayscale...")

        # RGB to grayscale conversion (standard weights)
        luma_weights = [0.2989, 0.5870, 0.1140]
        X_train_gray = np.dot(X_train_processed[..., :3], luma_weights)
        X_test_gray = np.dot(X_test_processed[..., :3], luma_weights)

        # Add channel dimension back
        X_train_processed = np.expand_dims(X_train_gray, axis=-1)
        X_test_processed = np.expand_dims(X_test_gray, axis=-1)

        print(f"Grayscale shapes: Train={X_train_processed.shape}, Test={X_test_processed.shape}")

    # Normalize if requested
    if normalize:
        print("Normalizing pixel values...")
        X_train_processed = X_train_processed.astype('float32') / 255.0
        X_test_processed = X_test_processed.astype('float32') / 255.0

    if not (convert_to_grayscale or normalize):
        # Nothing above produced a new array; copy now so the flattened
        # result is still independent of the caller's data.
        X_train_processed = X_train_processed.copy()
        X_test_processed = X_test_processed.copy()

    # Flatten images to 1D feature vectors
    print("Flattening images to feature vectors...")
    X_train_flat = X_train_processed.reshape(X_train_processed.shape[0], -1)
    X_test_flat = X_test_processed.reshape(X_test_processed.shape[0], -1)

    print(f"Final shapes: Train={X_train_flat.shape}, Test={X_test_flat.shape}")
    print(f"Features per image: {X_train_flat.shape[1]}")

    # Calculate memory usage
    train_memory_mb = X_train_flat.nbytes / (1024**2)
    test_memory_mb = X_test_flat.nbytes / (1024**2)
    total_memory_mb = train_memory_mb + test_memory_mb

    print(f"Memory usage: Train={train_memory_mb:.1f}MB, Test={test_memory_mb:.1f}MB, Total={total_memory_mb:.1f}MB")

    # Show sample statistics
    print(f"Pixel value range: [{X_train_flat.min():.3f}, {X_train_flat.max():.3f}]")

    return X_train_flat, y_train_flat, X_test_flat, y_test_flat

def reduce_image_features(X_train_flat, X_test_flat, method='subsample', factor=4):
    """
    Reduce the number of features if memory/computation becomes an issue

    Args:
        X_train_flat, X_test_flat: Flattened image arrays (samples x features)
        method: 'subsample' (every ``factor``-th column), 'variance' (columns
            with the highest training-set variance) or 'pca' (projection
            fitted on the training set only)
        factor: Reduction factor, must be >= 1

    Returns:
        X_train_reduced, X_test_reduced: Reduced feature arrays

    Raises:
        ValueError: If ``factor`` is below 1 or ``method`` is not recognised.
    """

    print(f"=== REDUCING FEATURES USING {method.upper()} ===")
    print(f"Original features: {X_train_flat.shape[1]}")

    if factor < 1:
        raise ValueError(f"factor must be >= 1, got {factor}")

    n_features = X_train_flat.shape[1]

    if method == 'subsample':
        # Simple subsampling (every Nth pixel)
        indices = np.arange(0, n_features, factor)
        X_train_reduced = X_train_flat[:, indices]
        X_test_reduced = X_test_flat[:, indices]
        print(f"Subsampled to every {factor}th pixel")

    elif method == 'variance':
        # Keep pixels with highest variance across training set
        variances = np.var(X_train_flat, axis=0)
        # Keep at least one column: with n_keep == 0 the slice [-0:] would
        # select every column and silently skip the reduction.
        n_keep = max(1, n_features // factor)
        top_indices = np.argsort(variances)[-n_keep:]

        X_train_reduced = X_train_flat[:, top_indices]
        X_test_reduced = X_test_flat[:, top_indices]
        print(f"Kept {n_keep} highest variance pixels")

    elif method == 'pca':
        # PCA dimensionality reduction
        from sklearn.decomposition import PCA
        n_components = max(1, n_features // factor)
        # Capped by the number of training samples
        n_components = min(n_components, X_train_flat.shape[0] - 1)

        pca = PCA(n_components=n_components, random_state=42)
        X_train_reduced = pca.fit_transform(X_train_flat)
        X_test_reduced = pca.transform(X_test_flat)

        explained_var = np.sum(pca.explained_variance_ratio_)
        print(f"PCA to {n_components} components, explained variance: {explained_var:.3f}")

    else:
        raise ValueError(f"Unknown method: {method}")

    print(f"Reduced features: {X_train_reduced.shape[1]} (reduction factor: {X_train_flat.shape[1] / X_train_reduced.shape[1]:.1f}x)")

    return X_train_reduced, X_test_reduced

# Example usage functions
def quick_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging):
    """
    Fast smoke run: grayscale, normalized, every 4th feature, one optimizer cycle.

    ``_optimize_bagging`` is called with the reduced train/test features and
    integer labels and must return five values, which are passed straight back.
    """
    print("=== QUICK BAGGING TEST (GRAYSCALE + SUBSAMPLED) ===")

    # Grayscale + flatten
    flat_train, labels_train, flat_test, labels_test = prepare_images_for_bagging(
        X_train_balanced,
        y_train_balanced,
        X_test,
        y_test,
        convert_to_grayscale=True,
        normalize=True,
    )

    # Thin out the feature columns to keep the run short
    small_train, small_test = reduce_image_features(
        flat_train, flat_test, method='subsample', factor=4
    )

    # Hand over to the supplied optimizer
    print("\nRunning bagging optimization...")
    opts, ma, f1, ma_vec, f1_vec = _optimize_bagging(
        small_train, labels_train, small_test, labels_test, cycles=1
    )

    print(f"\nQuick test results:")
    print(f"  Accuracy: {ma:.3f} ({ma:.1%})")
    print(f"  F1 Score: {f1:.3f}")

    return opts, ma, f1, ma_vec, f1_vec

def full_color_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging):
    """
    Color run: flatten normalized RGB images and optimize for one cycle.

    When an image yields more than 150000 features, the half with the highest
    training variance is kept before ``_optimize_bagging`` is called. The five
    values it returns are passed straight back.
    """
    print("=== FULL COLOR BAGGING TEST ===")

    # Flatten without dropping the color channels
    flat_train, labels_train, flat_test, labels_test = prepare_images_for_bagging(
        X_train_balanced,
        y_train_balanced,
        X_test,
        y_test,
        convert_to_grayscale=False,
        normalize=True,
    )

    # Default to the full feature set; shrink only above the size limit
    total_features = flat_train.shape[1]
    model_train, model_test = flat_train, flat_test
    if total_features > 150000:  # ~150k features might be too much
        print(f"Large feature count ({total_features}), applying variance-based reduction...")
        model_train, model_test = reduce_image_features(
            flat_train, flat_test, method='variance', factor=2
        )

    # Hand over to the supplied optimizer
    print("\nRunning bagging optimization...")
    opts, ma, f1, ma_vec, f1_vec = _optimize_bagging(
        model_train, labels_train, model_test, labels_test, cycles=1
    )

    print(f"\nFull color results:")
    print(f"  Accuracy: {ma:.3f} ({ma:.1%})")
    print(f"  F1 Score: {f1:.3f}")

    return opts, ma, f1, ma_vec, f1_vec

# Print a usage cheat-sheet for the helpers defined in this cell; the calls
# are shown as text only and nothing is executed here.
print("Image flattening functions loaded!")
print("\nUsage options:")
print("# Quick test (grayscale, subsampled):")
print("opts, ma, f1, ma_vec, f1_vec = quick_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging)")
print("\n# Full test (color, all pixels):")
print("opts, ma, f1, ma_vec, f1_vec = full_color_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging)")
print("\n# Manual control:")
print("X_train_flat, y_train_flat, X_test_flat, y_test_flat = prepare_images_for_bagging(X_train_balanced, y_train_balanced, X_test, y_test)")
print("opts, ma, f1, ma_vec, f1_vec = _optimize_bagging(X_train_flat, y_train_flat, X_test_flat, y_test_flat)")

Image flattening functions loaded!

Usage options:
# Quick test (grayscale, subsampled):
opts, ma, f1, ma_vec, f1_vec = quick_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging)

# Full test (color, all pixels):
opts, ma, f1, ma_vec, f1_vec = full_color_bagging_test(X_train_balanced, y_train_balanced, X_test, y_test, _optimize_bagging)

# Manual control:
X_train_flat, y_train_flat, X_test_flat, y_test_flat = prepare_images_for_bagging(X_train_balanced, y_train_balanced, X_test, y_test)
opts, ma, f1, ma_vec, f1_vec = _optimize_bagging(X_train_flat, y_train_flat, X_test_flat, y_test_flat)


In [10]:
from buck.classifiers.random_forest import _optimize_random_forest

# Flatten the balanced training images and the test images into grayscale,
# normalized feature vectors with integer class labels.
X_train_flat, y_train_flat, X_test_flat, y_test_flat = prepare_images_for_bagging(
    X_train_balanced, y_train_balanced, X_test, y_test,
    convert_to_grayscale=True,  # One value per pixel instead of three
    normalize=True
)

# Run a single optimization cycle on the flattened features.
# NOTE(review): `_optimize_bagging` is neither imported nor defined anywhere in
# the visible notebook (this cell only imports `_optimize_random_forest`, which
# is never used), so this line presumably raises NameError on a fresh kernel -
# confirm the intended function and import it explicitly.
results = _optimize_bagging(X_train_flat, y_train_flat, X_test_flat, y_test_flat, cycles=1)

=== PREPARING IMAGES FOR BAGGING CLASSIFIER ===
Input shapes: Train=(2500, 224, 224, 3), Test=(45, 224, 224, 3)
Label shapes: Train=(2500,), Test=(45,)
Classes in training: [0 1 2 3 4]
Classes in test: [0 1 2 3 4]
Converting RGB to grayscale...
Grayscale shapes: Train=(2500, 224, 224, 1), Test=(45, 224, 224, 1)
Normalizing pixel values...
Flattening images to feature vectors...
Final shapes: Train=(2500, 50176), Test=(45, 50176)
Features per image: 50176
Memory usage: Train=478.5MB, Test=8.6MB, Total=487.1MB
Pixel value range: [0.000, 1.000]



KeyboardInterrupt



## Homogenize data across classes

## Optimize all classifier models

In [None]:
# AdaBoost
'''
from buck.classifiers.ada_boost import (_optimize_rs, _optimize_nest, _optimize_lr)

# Shorten parameters
Xtr_pca = X_train_pca
ytr_flat = y_train_flat
Xte_pca = X_test_pca

# Define optimals
opts = {
    "random_state": None,
    "estimator": None,
    "n_estimators": 50,
    "learning_rate": 1.0,
}

# Adaboost
opts, ma, ab_rs = _optimize_rs(Xtr_pca, ytr_flat, Xte_pca, y_true, opts)
opts, ma, ab_ne = _optimize_nest(Xtr_pca, ytr_flat, Xte_pca, y_true, opts)
opts, ma, ab_lr = _optimize_lr(Xtr_pca, ytr_flat, Xte_pca, y_true, opts)
print(ma)
'''

In [None]:
# Random Forest
from buck.classifiers.random_forest import (
    _optimize_rs, _optimize_nest, _optimize_max_d, _optimize_crit, _optimize_cw, _optimize_mss, _optimize_msl, _optimize_mwfl, _optimize_mf, _optimize_mln, _optimize_mid
)

# Short aliases for the arrays passed to every optimizer call.
# NOTE(review): X_train_pca, X_test_pca and y_true are not assigned anywhere in
# the visible notebook (the homogenize_data call that would presumably create
# them is commented out) - confirm they exist before running on a fresh kernel.
Xtr_pca = X_train_pca
ytr_flat = y_train_flat
Xte_pca = X_test_pca

# Starting hyperparameters handed to the first optimizer
opts = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="sqrt",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
    monotonic_cst=None,
)

# Optimizers applied, in this order, on every pass. Each one receives the
# current `opts` and returns three values, the first being the updated options.
tuning_steps = (
    _optimize_rs,
    _optimize_nest,
    _optimize_max_d,
    _optimize_crit,
    _optimize_cw,
    _optimize_mss,
    _optimize_msl,
    _optimize_mwfl,
    _optimize_mf,
    _optimize_mln,
    _optimize_mid,
)

# Ten passes over the whole sequence
ma_vec, f1_vec = [], []
for c in np.arange(10):
    for tune in tuning_steps:
        opts, ma, f1 = tune(Xtr_pca, ytr_flat, Xte_pca, y_true, opts)
    # Only the two values reported by the last optimizer of the pass are kept
    ma_vec.append(ma)
    f1_vec.append(f1)
    print(ma, f1)

#Best: 0.7894736842105263
#{'n_estimators': np.int64(127),
# 'criterion': 'gini',
# 'max_depth': None,
# 'min_samples_split': np.int64(2),
# 'min_samples_leaf': np.int64(1),
# 'min_weight_fraction_leaf': np.float64(0.0),
# 'max_features': 'log2',
# 'max_leaf_nodes': None,
# 'min_impurity_decrease': np.float64(0.0),
# 'bootstrap': True,
# 'oob_score': False,
# 'n_jobs': -1,
# 'random_state': np.int64(405),
# 'verbose': 0,
# 'warm_start': False,
# 'class_weight': None,
# 'ccp_alpha': 0.0,
# 'max_samples': None,
# 'monotonic_cst': None}

In [None]:
# Neural Net
'''
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

# Shorten parameters
Xtr_pca = X_train_pca
ytr_flat = y_train_flat
Xte_pca = X_test_pca

# Define optimals
opts = {
    "hidden_layer_sizes": (100,),
    "activation": "relu",
    "solver": "adam",
    "alpha": 0.0001,
    "batch_size": "auto",
    "learning_rate": "constant",
    "learning_rate_init": 0.001,
    "power_t": 0.5,
    "max_iter": 20000,
    "shuffle": True,
    "random_state": None,
    "tol": 0.01,
    "verbose": False,
    "warm_start": False,
    "momentum": 0.9,
    "nesterovs_momentum": True,
    "early_stopping": False,
    "validation_fraction": 0.1,
    "beta_1": 0.9,
    "beta_2": 0.999,
    "epsilon": 1e-08,
    "n_iter_no_change": 10,
    "max_fun": 15000,
}

# Initialize variables
ac_vec = []
f1_vec = []
max_acc = -np.inf
max_idx = -1
variable_array = np.arange(150)
#best_val = variable_array[0]
for i in np.arange(len(variable_array)):
    v = variable_array[i]
    # Define classifiers to test
    classifier = MLPClassifier(
        random_state=v,
        hidden_layer_sizes=opts["hidden_layer_sizes"],
        activation=opts["activation"],
        solver=opts["solver"],
        alpha=opts["alpha"],
        batch_size=opts["batch_size"],
        learning_rate=opts["learning_rate"],
        learning_rate_init=opts["learning_rate_init"],
        power_t=opts["power_t"],
        max_iter=opts["max_iter"],
        shuffle=opts["shuffle"],
        tol=opts["tol"],
        verbose=opts["verbose"],
        warm_start=opts["warm_start"],
        momentum=opts["momentum"],
        nesterovs_momentum=opts["nesterovs_momentum"],
        early_stopping=opts["early_stopping"],
        validation_fraction=opts["validation_fraction"],
        beta_1=opts["beta_1"],
        beta_2=opts["beta_2"],
        epsilon=opts["epsilon"],
        n_iter_no_change=opts["n_iter_no_change"],
        max_fun=opts["max_fun"],
    )
    # Train the classifier
    classifier.fit(X_train_pca, y_train_flat)
    # Make predictions
    y_pred = classifier.predict(X_test_pca)
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    ac_vec.append(accuracy)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    f1_vec.append(f1)
    
    # Return index
    if accuracy >= max_acc:
        max_acc = accuracy
        print(max_acc)
        best_val = v
    
    # Store best value
    opts["random_state"] = best_val
'''

In [None]:
#from buck.classifiers.compare_models import compare_models
#
## Shorten parameters
#Xtr_pca = X_train_pca
#ytr_flat = y_train_flat
#Xte_pca = X_test_pca
#
#compare_models(Xtr_pca, ytr_flat, Xte_pca, y_true, num_classes, label_mapping)