In [None]:
from pathlib import Path
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [None]:
# --- Locations -------------------------------------------------------------
# Root folder handed to the dataset downloader/loader, and the flat folder
# that receives one symlink per retained image.
dataset_path = Path("../datasets")
shared_images_dir = Path("../dataset_shared/images")

# --- Dataset sizing --------------------------------------------------------
max_images = 3000  # upper bound on rows kept per label
N_FOLDS = 5        # number of cross-validation folds

# --- Candidate checkpoints -------------------------------------------------
# One checkpoint name per size suffix (n, s, m, l, x) for each model family.
classification_models8 = [f"yolov8{size}-cls.pt" for size in "nsmlx"]
classification_models11 = [f"yolo11{size}-cls.pt" for size in "nsmlx"]

# Checkpoint actually used by the training cells.
model_name = classification_models11[0]

# Stratified splitter reused wherever the notebook iterates over folds; the
# fixed random_state makes the shuffled fold assignment repeatable.
skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=42,
)

# Training hyper-parameters. Key order is significant: the summary cell
# writes them to the metrics text file in iteration order.
params = dict(
    epochs=1,
    imgsz=224,
    batch=10,
    patience=50,
)

In [None]:
from utils.dataset_download import DatasetDownloader
from utils.load_datasets import load_datasets

# Run the project's downloader against `dataset_path`, then load everything
# found there into a single DataFrame.
dataset_downloader = DatasetDownloader(dataset_path)
dataset_downloader.download_all()

df = load_datasets(dataset_path)

# Quick look at what was loaded: first rows, row count per source dataset
# (the 'From' column), and the overall number of rows.
display(
    df.head(),
    df['From'].value_counts(),
    f"Total images loaded: {len(df)}",
)

In [None]:
# Cap the dataset at `max_images` rows per label and mirror every retained
# image into one flat directory of symlinks (`shared_images_dir`) that the
# per-fold datasets can point at. The reduced frame replaces `df` and is also
# written to parquet so a fresh kernel can reload it without redoing this step.
labels = df['Label'].unique()

# Number of rows retained so far for each label.
label_count = {label: 0 for label in labels}

# Rows that survive the per-label cap, in their original order.
kept_rows = []

# Create shared images directory once (outside the loop)
shared_images_dir.mkdir(parents=True, exist_ok=True)

print("Creating shared images directory...")
for image, filename, label, source in zip(df['Image'], df['Filename'], df['Label'], df['From']):
    if label_count[label] >= max_images:
        continue

    src = Path(image).absolute()
    # Prefix with the source dataset so equal filenames from different
    # datasets do not collide in the flat directory.
    dst = shared_images_dir / f"{source}_{filename}"

    # Path.exists() follows symlinks, so a dangling link left over from an
    # earlier run reports False and os.symlink() would then fail with
    # FileExistsError. Drop dangling links and test with lexists(), which
    # looks at the link itself rather than its target.
    if dst.is_symlink() and not dst.exists():
        dst.unlink()
    if not os.path.lexists(dst):
        os.symlink(src, dst)

    label_count[label] += 1
    kept_rows.append({"Image": image, "Filename": filename, "Label": label, "From": source})

# Counts every entry currently in the directory, including links created by
# earlier runs.
print(f"Shared images directory created with {sum(1 for _ in shared_images_dir.iterdir())} images")

# Explicit columns keep the schema intact even when no row was retained.
df = pd.DataFrame(kept_rows, columns=["Image", "Filename", "Label", "From"])
df.to_parquet(dataset_path / "datasets_reduced.parquet", index=False)

In [None]:
# Reuse the in-memory frame when an earlier cell already produced it; on a
# fresh kernel fall back to the reduced parquet file under `dataset_path`.
try:
    df
except NameError:
    df = pd.read_parquet(os.path.join(dataset_path, 'datasets_reduced.parquet'), engine='pyarrow')

display(df['Label'].value_counts())

In [None]:
import shutil

# Materialise one dataset directory per cross-validation fold:
#   ../dataset_fold{k}/labels.txt            tab-separated filename/label/split
#   ../dataset_fold{k}/{train,val}/<label>/  symlinks into `shared_images_dir`
labels = df['Label'].unique()

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['Label'])):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    # Split data for this fold
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    print(f"Train: {len(train_df)}, Val: {len(val_df)}")
    print(f"Train label distribution:\n{train_df['Label'].value_counts()}")
    print(f"Val label distribution:\n{val_df['Label'].value_counts()}")

    # Create fold-specific dataset structure
    dataset_root = Path(f"../dataset_fold{fold}")
    labels_file = dataset_root / "labels.txt"
    dataset_root.mkdir(parents=True, exist_ok=True)

    # One record per image, training rows first and validation rows after.
    # The filename uses the same "<From>_<Filename>" scheme as the links in
    # the shared images directory.
    all_images = [
        {'filename': f"{source}_{filename}", 'label': label, 'split': split}
        for split, split_df in (('train', train_df), ('val', val_df))
        for source, filename, label in zip(split_df['From'], split_df['Filename'], split_df['Label'])
    ]

    # Create labels file with format: filename<TAB>label<TAB>split
    with open(labels_file, 'w') as f:
        f.write("filename\tlabel\tsplit\n")
        for img_info in all_images:
            f.write(f"{img_info['filename']}\t{img_info['label']}\t{img_info['split']}\n")

    # For YOLO classification, create train/val split directories
    # with symlinks pointing to the shared images folder
    train_dir = dataset_root / "train"
    val_dir = dataset_root / "val"
    split_dirs = {'train': train_dir, 'val': val_dir}

    # Start from empty split directories so links from an earlier run cannot
    # leak into this fold. rmtree removes the links, not the files they
    # point to.
    for split_dir in split_dirs.values():
        if split_dir.exists():
            shutil.rmtree(split_dir)

    # Every label gets a folder in both splits, even if a split ends up with
    # no image for it.
    for label in labels:
        for split_dir in split_dirs.values():
            (split_dir / label).mkdir(parents=True, exist_ok=True)

    # Create symlinks in train/val pointing to shared images folder
    for img_info in all_images:
        src = (shared_images_dir / img_info['filename']).absolute()
        dst = split_dirs[img_info['split']] / img_info['label'] / img_info['filename']
        os.symlink(src, dst)

    print(f"Fold {fold + 1} dataset created!")
    print(f"  - Labels file: {labels_file}")

In [None]:
from ultralytics import YOLO

# Train a freshly initialised classifier on each fold's dataset directory
# (../dataset_fold{k}) and save that fold's training curves next to the
# run's artifacts.

# Store results from all folds
all_results = []

project = "is_recaptchav2_safe/yolo"
model_stem = Path(model_name).stem  # "yolo11n-cls.pt" -> "yolo11n-cls"

# Fold directories are addressed purely by index, so the split itself does
# not need to be recomputed here.
for fold in range(N_FOLDS):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    dataset_root = Path(f"../dataset_fold{fold}")
    name = f"{model_stem}/fold{fold}"

    # Initialize fresh model for each fold
    model = YOLO(model_name)
    # NOTE(review): hard-codes a CUDA device; presumably fails on a machine
    # without a GPU - confirm this is intended.
    model.to('cuda')

    # Train on this fold
    results = model.train(
        data=str(dataset_root),
        epochs=params['epochs'],
        imgsz=params['imgsz'],
        batch=params['batch'],
        patience=params['patience'],
        save=True,
        project=project,
        name=name,
        plots=True,
        val=True,
    )

    # Ask the trainer where it actually wrote its outputs rather than
    # rebuilding the path by hand: the library may pick a different directory
    # (for example a suffixed run name when the target already exists), in
    # which case the hand-built path would point at a stale or missing
    # results.csv. Fall back to the hand-built path if the attribute is
    # unavailable.
    trainer = getattr(model, "trainer", None)
    fold_save_dir = Path(getattr(trainer, "save_dir", None) or os.path.join(".", project, name))

    # Store results
    all_results.append({
        'fold': fold + 1,
        'results': results,
        'final_metrics': getattr(results, 'results_dict', None),
    })

    history = pd.read_csv(fold_save_dir / "results.csv")
    # Some library versions are believed to pad the CSV header names with
    # spaces; stripping is a no-op when they are already clean.
    history.columns = history.columns.str.strip()

    # Plot training curves for this fold
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Plot loss
    axes[0].plot(history['train/loss'], label='Train Loss', marker='o')
    axes[0].plot(history['val/loss'], label='Val Loss', marker='s')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title(f'Fold {fold + 1} - Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True)

    # Plot accuracy
    # NOTE(review): the axis label says '%'; confirm whether the logged
    # accuracy is a percentage or a 0-1 fraction.
    axes[1].plot(history['metrics/accuracy_top1'], label='Top 1 Acc', marker='o')
    axes[1].plot(history['metrics/accuracy_top5'], label='Top 5 Acc', marker='s')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy (%)')
    axes[1].set_title(f'Fold {fold + 1} - Validation Accuracy')
    axes[1].legend()
    axes[1].grid(True)

    # Plot learning rate
    axes[2].plot(history['lr/pg0'], label='Learning Rate', marker='o', color='green')
    axes[2].set_xlabel('Epoch')
    axes[2].set_ylabel('Learning Rate')
    axes[2].set_title(f'Fold {fold + 1} - Learning Rate Schedule')
    axes[2].set_yscale('log')
    axes[2].legend()
    axes[2].grid(True)

    plt.tight_layout()
    plots_path = fold_save_dir / 'training_plots.png'
    plt.savefig(plots_path, dpi=300, bbox_inches='tight')
    # Render this fold's figure now and release it, so figures do not pile up
    # in memory across folds.
    plt.show()
    plt.close(fig)

    print(f"Training plots saved to: {plots_path}")

# Summary of all folds: tabulate per-fold metrics, print mean ± std for each
# numeric metric, and persist both the table (CSV) and a short text report.
print(f"\n{'='*50}")
print("K-FOLD CROSS-VALIDATION SUMMARY")
print(f"{'='*50}")

# Extract and display metrics across folds. A metric the results object does
# not expose is recorded as None.
metrics_df = pd.DataFrame([
    {
        'Fold': r['fold'],
        'Accuracy': getattr(r['results'], 'top1', None),
        'Top5': getattr(r['results'], 'top5', None),
        'Fitness': getattr(r['results'], 'fitness', None),
    }
    for r in all_results
])

display(metrics_df)

# Mean ± std per numeric metric, computed once and reused for both the
# console output and the text report. Non-numeric columns (e.g. all None)
# are skipped.
summary_lines = [
    f"{col}: {metrics_df[col].mean():.4f} ± {metrics_df[col].std():.4f}"
    for col in metrics_df.columns
    if col != 'Fold' and pd.api.types.is_numeric_dtype(metrics_df[col])
]

print("\nAverage Performance Across Folds:")
for line in summary_lines:
    print(line)

# Show where each fold's run was saved (None if the results object does not
# report it).
for r in all_results:
    display(getattr(r['results'], 'save_dir', None))

# Nothing earlier in the notebook creates this directory, so make sure it
# exists before writing into it.
metrics_dir = Path("is_recaptchav2_safe_kfold")
metrics_dir.mkdir(parents=True, exist_ok=True)
model_stem = Path(model_name).stem  # "yolo11n-cls.pt" -> "yolo11n-cls"

metrics_df.to_csv(metrics_dir / f"metrics_{model_stem}.csv")

# Explicit UTF-8 so the '±' character is written the same way regardless of
# the platform's default encoding.
with open(metrics_dir / f"metrics_{model_stem}.txt", 'w', encoding='utf-8') as f:
    f.write(f"Folds: {N_FOLDS}\n")
    f.write("\n")

    for key, value in params.items():
        f.write(f"{key}: {value}\n")

    f.write("\n")

    f.writelines(f"{line}\n" for line in summary_lines)