# In [None]:  (notebook cell marker left over from export; commented out so the file parses as Python)
#!/usr/bin/env python3
# Surfer Detection - Training and Evaluation Pipeline
# This script trains YOLOv8 on a folder of labeled surfer images and evaluates performance

import os
import yaml
import shutil
from pathlib import Path
import random
import matplotlib.pyplot as plt
from ultralytics import YOLO
from datetime import datetime

class SurferDetectionTrainer:
    """Train several YOLOv8 variants on a labeled surfer dataset and keep the best.

    The pipeline copies labeled images from ``data_dir`` into a
    train/val/test layout under ``output_dir / "dataset"``, writes the
    ``data.yaml`` consumed by Ultralytics, trains every model listed in
    ``model_types`` and ranks them by mAP50-95 on the held-out split.
    """

    # Lower-case suffixes treated as images when scanning ``data_dir``.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self,
                 data_dir,
                 output_dir="surfer_detection_results",
                 train_val_split=0.8,
                 test_size=0.1,
                 seed=None):
        """
        Initialize the surfer detection training pipeline

        Args:
            data_dir: Directory containing images and labels in YOLO format
                (each ``name.jpg`` next to its ``name.txt``)
            output_dir: Directory to save results (created if missing)
            train_val_split: Fraction of the non-test images used for training;
                the remainder becomes the validation split
            test_size: Fraction of all labeled images held out for testing
            seed: Optional seed for a reproducible split. When None, the
                module-level ``random`` generator is used, as before.

        Raises:
            ValueError: If ``train_val_split`` or ``test_size`` is outside [0, 1].
        """
        # Reject nonsensical ratios before touching the filesystem; values
        # outside [0, 1] would otherwise produce negative split sizes.
        for label, ratio in (("train_val_split", train_val_split),
                             ("test_size", test_size)):
            if not 0.0 <= ratio <= 1.0:
                raise ValueError(f"{label} must be between 0 and 1, got {ratio!r}")

        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.train_val_split = train_val_split
        self.test_size = test_size
        self.seed = seed
        self.model_types = ["yolov8n", "yolov8s", "yolov8m"]  # Models to evaluate

        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

        # Root of the train/val/test copy of the dataset
        self.dataset_dir = self.output_dir / "dataset"

        # Results storage: model name -> {'metrics': ..., 'model_path': ...}
        self.results = {}

    def _collect_labeled_images(self):
        """Return the sorted image paths in ``data_dir`` that have a label file.

        Suffix matching is case-insensitive. Images without a matching
        ``<stem>.txt`` are reported and left out, so that split sizes are
        computed only over files that will actually be copied.

        Raises:
            FileNotFoundError: If ``data_dir`` is not an existing directory.
        """
        if not self.data_dir.is_dir():
            raise FileNotFoundError(f"Dataset directory not found: {self.data_dir}")

        labeled = []
        # Sorting makes the pre-shuffle order independent of the filesystem,
        # which is what lets ``seed`` give a reproducible split.
        for img_path in sorted(self.data_dir.iterdir()):
            if not img_path.is_file():
                continue
            if img_path.suffix.lower() not in self.IMAGE_EXTENSIONS:
                continue
            if (self.data_dir / f"{img_path.stem}.txt").exists():
                labeled.append(img_path)
            else:
                print(f"Warning: No label found for {img_path.name}, skipping")
        return labeled

    def prepare_dataset(self):
        """Prepare the dataset by organizing files and creating data.yaml

        Returns:
            Path to the generated ``data.yaml``.

        Raises:
            FileNotFoundError: If ``data_dir`` is not an existing directory.
            ValueError: If ``data_dir`` contains no image with a label file.
        """
        print("Preparing dataset...")

        # Create dataset directory structure
        for split in ("train", "val", "test"):
            os.makedirs(self.dataset_dir / "images" / split, exist_ok=True)
            os.makedirs(self.dataset_dir / "labels" / split, exist_ok=True)

        image_files = self._collect_labeled_images()
        if not image_files:
            raise ValueError(f"No labeled images found in {self.data_dir}")

        # Shuffle files for random split
        if self.seed is None:
            random.shuffle(image_files)
        else:
            random.Random(self.seed).shuffle(image_files)

        # Calculate split sizes
        total_files = len(image_files)
        test_count = int(total_files * self.test_size)
        train_count = int((total_files - test_count) * self.train_val_split)
        val_count = total_files - test_count - train_count

        print(f"Found {total_files} images. Split: {train_count} train, {val_count} validation, {test_count} test")

        splits = {
            "train": image_files[:train_count],
            "val": image_files[train_count:train_count + val_count],
            "test": image_files[train_count + val_count:],
        }

        # Copy each image together with its label into the split directories
        for split, files in splits.items():
            for img_path in files:
                label_path = self.data_dir / f"{img_path.stem}.txt"
                shutil.copy(img_path, self.dataset_dir / "images" / split / img_path.name)
                shutil.copy(label_path, self.dataset_dir / "labels" / split / label_path.name)

        # Create data.yaml file. The split entries are written relative to
        # 'path': Ultralytics joins them onto 'path', so repeating the
        # (possibly cwd-relative) dataset directory here would yield a
        # doubled, non-existent location.
        data_yaml = {
            'path': str(self.dataset_dir.resolve()),
            'train': 'images/train',
            'val': 'images/val',
            'test': 'images/test',
            'nc': 1,  # Number of classes
            'names': ['surfer']  # Class names
        }

        with open(self.dataset_dir / "data.yaml", 'w') as f:
            yaml.dump(data_yaml, f)

        print(f"Dataset prepared at {self.dataset_dir}")
        return self.dataset_dir / "data.yaml"

    def train_and_evaluate_models(self):
        """Train and evaluate different YOLO models

        Prepares the dataset, trains each entry of ``model_types``, evaluates
        it on the test split (or the validation split when the test split is
        empty) and records the outcome in ``self.results``.

        Returns:
            The dict produced by ``determine_best_model``.
        """
        print("Starting model training and evaluation...")
        data_yaml_path = self.prepare_dataset()

        # Small datasets can end up with zero test images; evaluating on an
        # empty split would fail, so fall back to the validation split.
        test_images_dir = self.dataset_dir / "images" / "test"
        eval_split = 'test' if any(test_images_dir.iterdir()) else 'val'
        if eval_split != 'test':
            print("Warning: test split is empty, evaluating on the validation split instead")

        # The device does not change between models, so probe it once.
        device = '0' if self.check_gpu_available() else 'cpu'

        for model_name in self.model_types:
            print(f"\n{'-'*50}")
            print(f"Training {model_name}...")
            print(f"{'-'*50}")

            # Create model directory (absolute, so the run location does not
            # depend on how the library resolves relative project paths)
            model_dir = (self.output_dir / model_name).resolve()
            os.makedirs(model_dir, exist_ok=True)

            # Initialize model
            model = YOLO(f"{model_name}.pt")

            # Train model
            model.train(
                data=str(data_yaml_path),
                epochs=50,  # Reduced for MVP, increase for production
                patience=10,
                batch=16,
                imgsz=640,
                save=True,
                project=str(model_dir),
                name="train",
                device=device,
                verbose=True
            )

            # Validate on the held-out split
            print(f"Validating {model_name} on {eval_split} set...")
            val_results = model.val(
                data=str(data_yaml_path),
                split=eval_split,
                project=str(model_dir),
                name="val"
            )

            # Prefer the checkpoint path reported by the trainer: the run
            # directory may be renamed ("train2", ...) if "train" already
            # exists. Fall back to the conventional location otherwise.
            best_weights = getattr(getattr(model, "trainer", None), "best", None)
            if best_weights is None:
                best_weights = model_dir / "train" / "weights" / "best.pt"

            # Store results
            self.results[model_name] = {
                'metrics': val_results.box,
                'model_path': Path(best_weights)
            }

            print(f"{model_name} training and validation complete")

        # Find the best model
        return self.determine_best_model()

    @staticmethod
    def _summarize_metrics(box_metrics):
        """Reduce a box-metrics object to plain float scores.

        Reads the mean values (``map``, ``map50``, ``mp``, ``mr``) rather
        than the per-class ``p`` / ``r`` arrays, which cannot be formatted
        with ``:.4f``.
        """
        return {
            'mAP50-95': float(box_metrics.map),
            'mAP50': float(box_metrics.map50),
            'precision': float(box_metrics.mp),
            'recall': float(box_metrics.mr),
        }

    def determine_best_model(self):
        """Determine the best model based on evaluation metrics

        Ranks the entries of ``self.results`` by mAP50-95, copies the winning
        weights to ``output_dir / "best_model.pt"`` and writes the comparison
        plot.

        Returns:
            Dict with 'model_name', 'metrics' (scores plus 'model_path') and
            'model_path' (string path of the copied weights).

        Raises:
            ValueError: If no model has been evaluated yet.
        """
        print("\nEvaluating model performance...")

        if not self.results:
            raise ValueError("No evaluation results available; train the models first")

        # Compare metrics (primarily using mAP50-95)
        metrics_data = {}
        for model_name, data in self.results.items():
            summary = self._summarize_metrics(data['metrics'])
            summary['model_path'] = data['model_path']
            metrics_data[model_name] = summary
            print(f"{model_name}: mAP50-95={summary['mAP50-95']:.4f}, mAP50={summary['mAP50']:.4f}, precision={summary['precision']:.4f}, recall={summary['recall']:.4f}")

        # Find best model based on mAP50-95
        best_model_name, best_metrics = max(
            metrics_data.items(), key=lambda item: item[1]['mAP50-95']
        )

        print(f"\n✅ Best model: {best_model_name} with mAP50-95={best_metrics['mAP50-95']:.4f}")

        # Copy best model to root of output directory
        best_model_path = best_metrics['model_path']
        best_model_copy_path = self.output_dir / "best_model.pt"
        shutil.copy(best_model_path, best_model_copy_path)
        print(f"Best model saved to {best_model_copy_path}")

        # Create performance comparison plot
        self.plot_model_comparison(metrics_data)

        return {
            'model_name': best_model_name,
            'metrics': best_metrics,
            'model_path': str(best_model_copy_path)
        }

    def plot_model_comparison(self, metrics_data):
        """Create a plot comparing model performance

        Draws grouped bars (mAP50-95, precision, recall) per model and saves
        the figure to ``output_dir / "model_comparison.png"``.

        Args:
            metrics_data: Mapping of model name to a dict holding the keys
                'mAP50-95', 'precision' and 'recall'.
        """
        models = list(metrics_data.keys())
        map_values = [metrics_data[m]['mAP50-95'] for m in models]
        precision_values = [metrics_data[m]['precision'] for m in models]
        recall_values = [metrics_data[m]['recall'] for m in models]

        x = list(range(len(models)))
        width = 0.25  # Bar width; three bars are centred on each tick

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.bar([i - width for i in x], map_values, width, label='mAP50-95')
        ax.bar(x, precision_values, width, label='Precision')
        ax.bar([i + width for i in x], recall_values, width, label='Recall')

        ax.set_ylabel('Scores')
        ax.set_title('Model Performance Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(models)
        ax.legend()

        plt.tight_layout()
        plt.savefig(self.output_dir / "model_comparison.png")
        plt.close()

    def check_gpu_available(self):
        """Check if a GPU is available for training

        Returns:
            True when torch is importable and reports CUDA as available,
            False otherwise (including when torch is not installed).
        """
        try:
            import torch
            return torch.cuda.is_available()
        except ImportError:
            return False

def main():
    """Main function to run the training pipeline

    Prompts for the dataset directory, trains and evaluates the models into
    a fresh timestamped output directory and prints a summary of the best
    model. Returns early (after printing an error) when the entered path is
    not an existing directory.
    """
    print("=" * 50)
    print("SURFER DETECTION - MODEL TRAINING PIPELINE")
    print("=" * 50)

    # Get input directory from user. Pasted or dragged-in paths often carry
    # surrounding whitespace or quotes, and "~" is not expanded by input().
    raw_path = input("Enter the path to your dataset directory (containing images and labels): ")
    data_dir = os.path.expanduser(raw_path.strip().strip('"\''))

    # isdir (not exists): a regular file would pass an existence check and
    # then silently yield an empty dataset.
    if not os.path.isdir(data_dir):
        print(f"Error: Directory {data_dir} does not exist!")
        return

    # Initialize trainer with a timestamped output directory so repeated
    # runs never overwrite each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"surfer_detection_results_{timestamp}"

    trainer = SurferDetectionTrainer(
        data_dir=data_dir,
        output_dir=output_dir
    )

    # Run training pipeline
    print("\nStarting training pipeline...")
    best_model = trainer.train_and_evaluate_models()

    print("\n" + "=" * 50)
    print("TRAINING PIPELINE COMPLETE")
    print("=" * 50)
    print(f"Best model: {best_model['model_name']}")
    print(f"mAP50-95: {best_model['metrics']['mAP50-95']:.4f}")
    print(f"Model saved to: {best_model['model_path']}")
    print(f"All results saved to: {output_dir}")
    print("=" * 50)

# Run the interactive pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()