# Simple Streaming Shallow Learning - No PCA

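This notebook extracts a small, fixed-length feature vector (21 values) from each image in batches and trains shallow classifiers on it directly, without a PCA step, to avoid dimensionality issues.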

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import os
import sys
import pickle
from pathlib import Path
import gc
from typing import Generator, List, Tuple

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

# Add parent directory to path
sys.path.append('../..')
from ml_models_core.src.base_classifier import BaseImageClassifier
from ml_models_core.src.utils import ModelUtils

# Seed NumPy's global RNG
np.random.seed(42)

print("Setup complete - simple streaming approach ready")

# Candidate dataset locations, tried in order; the first existing one is used
dataset_paths = [
    "../../data/downloads/combined_unified_classification",
    "./data/downloads/combined_unified_classification"
]

dataset_path = next(
    (candidate for candidate in dataset_paths if os.path.exists(candidate)),
    None,
)

if dataset_path is None:
    raise FileNotFoundError("Dataset not found")

print(f"Using dataset: {dataset_path}")

# Every non-hidden sub-directory of the dataset root is one class; the class
# index is the position of the directory name in alphabetical order
class_dirs = [
    entry
    for entry in Path(dataset_path).iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
]
class_names = sorted(entry.name for entry in class_dirs)
class_to_idx = dict(zip(class_names, range(len(class_names))))

print(f"Found {len(class_names)} classes")
print(f"First 5 classes: {class_names[:5]}")
print(f"Last 5 classes: {class_names[-5:]}")

print("Dataset path and class mapping defined successfully!")

In [None]:
# This cell requires that cell 1 has been run first to define dataset_path and class_to_idx

def extract_simple_features(image):
    """Build the hand-crafted feature vector for a single RGB image.

    The returned 1-D NumPy array holds, in this order:
      * mean and standard deviation of each of the three colour channels
        (6 values, interleaved mean/std per channel)
      * mean and standard deviation of the grayscale image (2 values)
      * a 4-bin histogram over [0, 256) for each colour channel, divided by
        its own total so the bins are fractions (12 values)
      * the fraction of pixels that Canny (thresholds 50 and 150) marks as
        edges (1 value)

    `image` is indexed as image[:, :, c] for c in 0..2 and converted with
    cv2.COLOR_RGB2GRAY, so it must be a 3-channel array in RGB order.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    planes = [image[:, :, idx] for idx in range(3)]

    values = []

    # Colour statistics: (mean, std) for channel 0, then 1, then 2
    for plane in planes:
        pixels = plane.flatten()
        values += [np.mean(pixels), np.std(pixels)]

    # Same two statistics for the grayscale version
    gray_pixels = gray.flatten()
    values += [np.mean(gray_pixels), np.std(gray_pixels)]

    # Coarse colour histograms; the small epsilon guards the division
    for plane in planes:
        counts, _bin_edges = np.histogram(plane, bins=4, range=(0, 256))
        values.extend(counts / (np.sum(counts) + 1e-8))

    # Edge density: share of non-zero pixels in the Canny edge map
    edge_map = cv2.Canny(gray, 50, 150)
    values.append(np.sum(edge_map > 0) / edge_map.size)

    return np.array(values)

# Smoke-test feature extraction on one image from the dataset
def test_features():
    """Run extract_simple_features on the first image found and print its length.

    Walks the non-hidden class directories under `dataset_path`, takes the
    first file with a .jpg/.jpeg/.png suffix, resizes it to 64x64 RGB and
    prints the actual feature count next to the expected breakdown.
    Prints nothing if no such image exists.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    # Lazy walk: directories are only listed until the first match is found
    matching_images = (
        img_path
        for class_dir in Path(dataset_path).iterdir()
        if class_dir.is_dir() and not class_dir.name.startswith('.')
        for img_path in class_dir.iterdir()
        if img_path.suffix.lower() in image_suffixes
    )
    first_image = next(matching_images, None)
    if first_image is None:
        return

    resized = Image.open(first_image).convert('RGB').resize((64, 64))
    features = extract_simple_features(np.array(resized))

    report = [
        f"Feature vector length: {len(features)}",
        "Feature breakdown:",
        "  RGB stats: 6 (2 per channel)",
        "  Gray stats: 2",
        "  Histograms: 12 (4 bins x 3 channels)",
        "  Edge density: 1",
        "  Total expected: 21",
        f"  Actual: {len(features)}",
    ]
    for line in report:
        print(line)

test_features()

def _extract_batch_features(images, labels):
    """Extract features for one batch, keeping features and labels aligned.

    An image whose feature extraction raises is reported and dropped together
    with its label, so one bad image cannot stall or abort the whole run.

    Returns:
        (features, labels): two lists of equal length.
    """
    kept_features = []
    kept_labels = []
    for image, label in zip(images, labels):
        try:
            vector = extract_simple_features(image)
        except Exception as e:  # best-effort: skip the image, keep going
            print(f"    Error extracting features: {e}")
            continue
        kept_features.append(vector)
        kept_labels.append(label)
    return kept_features, kept_labels


def process_dataset_streaming(batch_size=50, max_images_per_class=None):
    """Load every image under `dataset_path` and turn it into a feature vector.

    Images are loaded as 64x64 RGB uint8 arrays and collected into batches of
    `batch_size`; each full batch is converted to feature vectors and the raw
    pixels are released, so only feature vectors accumulate in memory.

    Directories and files are visited in sorted order so that the sample
    order (and the subset picked when `max_images_per_class` is set) is the
    same on every run and every filesystem.

    Args:
        batch_size: number of loaded images that triggers feature extraction.
        max_images_per_class: cap on images loaded per class directory;
            None means no cap.

    Returns:
        (X, y): NumPy array of feature vectors and NumPy array of class
        indices taken from `class_to_idx`, in matching order.

    Raises:
        KeyError: if a class directory name is missing from `class_to_idx`.

    Files that cannot be loaded are reported and skipped. "Total processed"
    counts successfully loaded images.
    """
    all_features = []
    all_labels = []

    valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}

    batch_images = []
    batch_labels = []
    total_processed = 0

    def flush_batch():
        """Convert the pending batch to features and empty it."""
        features, labels = _extract_batch_features(batch_images, batch_labels)
        all_features.extend(features)
        all_labels.extend(labels)
        # Always clear, even if some images were dropped, so a bad image is
        # never retried on the next flush
        batch_images.clear()
        batch_labels.clear()

    if max_images_per_class is None:
        print("Processing FULL dataset (no image limit per class)...")
    else:
        print(f"Processing dataset with max {max_images_per_class} images per class...")

    for class_dir in sorted(Path(dataset_path).iterdir()):
        if not class_dir.is_dir() or class_dir.name.startswith('.'):
            continue

        class_name = class_dir.name
        class_idx = class_to_idx[class_name]
        class_count = 0

        print(f"Processing class: {class_name}")

        for img_path in sorted(class_dir.iterdir()):
            if img_path.suffix.lower() not in valid_extensions:
                continue

            # Only check limit if it's specified
            if max_images_per_class is not None and class_count >= max_images_per_class:
                break

            # Keep the try narrow: only loading/decoding is covered here, so
            # a load error can never leave a half-processed batch behind
            try:
                with Image.open(img_path) as raw:
                    resized = raw.convert('RGB').resize((64, 64))
                    img_array = np.array(resized, dtype=np.uint8)
            except Exception as e:  # best-effort: unreadable files are skipped
                print(f"    Error loading {img_path}: {e}")
                continue

            batch_images.append(img_array)
            batch_labels.append(class_idx)
            class_count += 1
            total_processed += 1

            # Process batch when full
            if len(batch_images) >= batch_size:
                print(f"  Processing batch of {len(batch_images)} images... (Total so far: {total_processed})")
                flush_batch()
                gc.collect()

        print(f"  Completed {class_name}: {class_count} images")

    # Process remaining batch
    if batch_images:
        print(f"Processing final batch of {len(batch_images)} images...")
        flush_batch()

    print(f"\n=== PROCESSING COMPLETE ===")
    print(f"Total processed: {total_processed} images")
    print(f"Total classes: {len(np.unique(all_labels))}")
    return np.array(all_features), np.array(all_labels)

print("Functions defined. Ready to process dataset.")

In [None]:
# Extract features for the whole dataset (no per-class cap)
print("Starting streaming processing with FULL dataset...")
X, y = process_dataset_streaming(max_images_per_class=None, batch_size=50)

distinct_labels = np.unique(y)
print(f"\nFinal dataset shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Unique classes: {len(distinct_labels)}")

# Report the resident memory of this process, in MB
import psutil
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
memory_mb = rss_bytes / 1024 / 1024
print(f"Memory usage: {memory_mb:.1f} MB")

In [None]:
# Split data, then scale features.
# The scaler is fit on the training split only: fitting it on the full
# dataset would let validation/test statistics leak into training and make
# the reported accuracies optimistic.
print("Splitting data...")
# 80/20 split into (train+val)/test, then 75/25 of the remainder into
# train/val, i.e. 60/20/20 overall; both splits are stratified by class
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print("Scaling features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Validation and test data are only transformed with the training statistics
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Clean up the unscaled intermediates
del X, X_temp, y_temp
gc.collect()

print("Data scaling and splitting complete!")

In [None]:
# Fit both candidate models and keep the one with the better validation score
print("Training models...")

# Candidate 1: SGD-trained linear classifier
sgd_model = SGDClassifier(alpha=0.01, max_iter=1000, random_state=42)
sgd_model.fit(X_train, y_train)
sgd_val_predictions = sgd_model.predict(X_val)
sgd_val_acc = accuracy_score(y_val, sgd_val_predictions)
print(f"SGD Validation Accuracy: {sgd_val_acc:.4f}")

# Candidate 2: random forest, trained on all available cores
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train, y_train)
rf_val_predictions = rf_model.predict(X_val)
rf_val_acc = accuracy_score(y_val, rf_val_predictions)
print(f"Random Forest Validation Accuracy: {rf_val_acc:.4f}")

# The forest wins only when strictly better; a tie goes to the SGD model
best_model, best_name, best_val_acc = (
    (rf_model, "Random Forest", rf_val_acc)
    if rf_val_acc > sgd_val_acc
    else (sgd_model, "SGD Classifier", sgd_val_acc)
)

print(f"\nBest model: {best_name} ({best_val_acc:.4f})")

# Score the selected model once on the held-out test split
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test accuracy: {test_accuracy:.4f}")

# Summarise how many test images each class has
unique_test, counts_test = np.unique(y_test, return_counts=True)
print(f"\nTest set class distribution: {len(unique_test)} classes")
print(f"Images per class (mean): {counts_test.mean():.1f}")
print(f"Images per class (min/max): {counts_test.min()}/{counts_test.max()}")

print("Model training complete!")

In [None]:
# Simple classifier class
class SimpleShallowClassifier(BaseImageClassifier):
    """Inference wrapper around a pickled model, scaler and class-name list.

    The pickle file is a dict with the keys 'model', 'scaler' and
    'class_names' (see save_model / load_model). Prediction resizes the
    image to 64x64, extracts features with extract_simple_features, scales
    them with the stored scaler and asks the stored model for a prediction.
    """

    def __init__(self, model_name="simple-shallow-classifier", version="1.0.0"):
        super().__init__(model_name, version)
        self.model = None
        self.scaler = None
        self.class_names = None

    def load_model(self, model_path: str) -> None:
        """Load model, scaler and class names from the pickle at `model_path`.

        Raises whatever open() / pickle.load raise for a missing or corrupt
        file, and KeyError if one of the expected keys is absent.
        """
        # NOTE(security): unpickling executes arbitrary code from the file;
        # only load model files that come from a trusted source.
        with open(model_path, 'rb') as f:
            model_data = pickle.load(f)
        self.model = model_data['model']
        self.scaler = model_data['scaler']
        self.class_names = model_data['class_names']
        self._is_loaded = True

    def preprocess(self, image: np.ndarray) -> np.ndarray:
        """Resize to 64x64, drop an alpha channel, and rescale [0, 1] data to uint8."""
        image_resized = ModelUtils.resize_image(image, (64, 64))
        # A 4-channel image is passed through convert_to_rgb
        # (presumably RGBA -> RGB; confirm against ModelUtils)
        if len(image_resized.shape) == 3 and image_resized.shape[2] == 4:
            image_resized = ModelUtils.convert_to_rgb(image_resized)
        # Data whose maximum is <= 1.0 is treated as normalised to [0, 1]
        if image_resized.max() <= 1.0:
            image_resized = (image_resized * 255).astype(np.uint8)
        return image_resized

    def predict(self, image: np.ndarray) -> dict:
        """Return a {class_name: probability} dict covering every class name.

        Models with predict_proba contribute their probabilities; other
        models yield 1.0 for the predicted class and 0.0 elsewhere.

        Raises:
            ValueError: if no model has been loaded.
        """
        if not self.is_loaded:
            raise ValueError("Model not loaded")

        processed_image = self.preprocess(image)
        features = extract_simple_features(processed_image)
        features_scaled = self.scaler.transform([features])

        # Start every class at 0.0 so classes the model never saw still appear
        scores = {name: 0.0 for name in self.class_names}

        if hasattr(self.model, 'predict_proba'):
            probabilities = self.model.predict_proba(features_scaled)[0]
            # predict_proba columns follow model.classes_ (the labels seen
            # during fit), not 0..n-1, so map each column through its label
            # instead of assuming column i is class i.
            labels = getattr(self.model, 'classes_', range(len(probabilities)))
            for label, prob in zip(labels, probabilities):
                scores[self.class_names[int(label)]] = float(prob)
        else:
            prediction = self.model.predict(features_scaled)[0]
            scores[self.class_names[int(prediction)]] = 1.0

        return scores

    def get_metadata(self) -> dict:
        """Return a small description of the loaded model."""
        return {
            "model_type": "simple_shallow_learning",
            "algorithm": type(self.model).__name__,
            "feature_dimensions": 21,
            "classes": self.class_names,
            "version": self.version
        }

    def save_model(self, model_path: str, model, scaler, class_names):
        """Pickle `model`, `scaler` and `class_names` as one dict to `model_path`.

        The given objects are written as-is; this instance's own attributes
        are neither read nor updated.
        """
        model_data = {
            'model': model,
            'scaler': scaler,
            'class_names': class_names
        }
        with open(model_path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {model_path}")

# Persist the selected model together with its scaler and the class names
model_path = "../models/simple_shallow_classifier.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

classifier = SimpleShallowClassifier()
classifier.save_model(model_path, best_model, scaler, class_names)

# Final run summary
summary_lines = [
    "\n=== TRAINING COMPLETE ===",
    f"Model: {best_name}",
    f"Validation accuracy: {best_val_acc:.4f}",
    f"Test accuracy: {test_accuracy:.4f}",
    f"Classes: {len(class_names)}",
    "Features: 21",
    f"Model saved to: {model_path}",
]
for line in summary_lines:
    print(line)

# Final memory check (reuses the psutil process handle created earlier)
rss_bytes = process.memory_info().rss
memory_mb = rss_bytes / 1024 / 1024
print(f"Final memory usage: {memory_mb:.1f} MB")