In [1]:
from utils.dataset_download import DatasetDownloader
from utils.load_datasets import load_datasets

# Fetch every source dataset into ../datasets, then load all of them into a
# single DataFrame (one row per image).
dataset_downloader = DatasetDownloader("../datasets")
dataset_downloader.download_all()
df = load_datasets("../datasets")

# Show, in order: a preview of the frame, the number of images contributed by
# each source dataset, and the overall total.
display(
    df.head(),
    df['From'].value_counts(),
    f"Total images loaded: {len(df)}",
)

Dataset AdityaJain1030/recaptcha-dataset already exists. Skipping download.
Dataset nobodyPerfecZ/recaptchav2-29k already exists. Skipping download.
Dataset cry2003/google-recaptcha-v2-images already exists. Skipping download.
Dataset mikhailma/test-dataset already exists. Skipping download.


Unnamed: 0,Image,Filename,Label,From
0,..\datasets\google-recaptcha-v2-images\images\...,00f4c717-7f51-4292-8d43-5cbe5cc1bc2d.jpg,Bicycle,google-recaptcha-v2-images
1,..\datasets\google-recaptcha-v2-images\images\...,0186fa9a-20bb-4dcf-8815-3528b0ee218d.jpg,Bicycle,google-recaptcha-v2-images
2,..\datasets\google-recaptcha-v2-images\images\...,01aa607e-b06a-41a6-bf86-b791704489f1.jpg,Bicycle,google-recaptcha-v2-images
3,..\datasets\google-recaptcha-v2-images\images\...,027d92ee-bd77-4e96-8063-e5ddd295fe4d.jpg,Bicycle,google-recaptcha-v2-images
4,..\datasets\google-recaptcha-v2-images\images\...,02b8d246-6447-4aaf-8cb8-e51e9ece56d2.jpg,Bicycle,google-recaptcha-v2-images


From
google-recaptcha-v2-images    32265
recaptchav2-29k               29568
recaptcha-dataset             11774
test-dataset                    279
Name: count, dtype: int64

'Total images loaded: 73886'

In [None]:
from pathlib import Path
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Cap the dataset at `max_images` rows per label (first come, first kept, in
# the frame's existing row order), expose every kept image through a symlink
# in one shared directory, and persist the reduced frame as parquet.

# Configuration
max_images = 3000  # upper bound on the number of images kept per label

labels = df['Label'].unique()

# Number of rows accepted so far for each label
label_count = {label: 0 for label in labels}

# Column-wise record of the rows that survive the per-label cap
updt_df_dict = {
    "Image": [],
    "Filename": [],
    "Label": [],
    "From": []
}

# One shared directory holds a symlink to every kept image
shared_images_dir = Path("../yolo_dataset_shared/images")
shared_images_dir.mkdir(parents=True, exist_ok=True)

print("Creating shared images directory...")
# Iterate the four needed columns directly; this avoids building a Series
# per row as df.iterrows() does.
for image, filename, label, source in zip(
    df["Image"], df["Filename"], df["Label"], df["From"]
):
    if label_count[label] >= max_images:
        continue  # this label has already reached its cap

    src = Path(image).absolute()
    # Prefix with the source dataset name so identical filenames coming from
    # different datasets do not collide in the shared directory.
    unique_filename = f"{source}_{filename}"
    dst = shared_images_dir / unique_filename

    # lexists() instead of Path.exists(): exists() follows the link and
    # returns False for a dangling symlink left by an earlier run, in which
    # case os.symlink() would raise FileExistsError.
    if not os.path.lexists(dst):
        os.symlink(src, dst)
    label_count[label] += 1

    updt_df_dict["Image"].append(image)
    updt_df_dict["Filename"].append(filename)
    updt_df_dict["Label"].append(label)
    updt_df_dict["From"].append(source)

print(f"Shared images directory created with {len(list(shared_images_dir.iterdir()))} images")

# NOTE: this rebinds `df` to the reduced frame; later cells rely on that name.
df = pd.DataFrame(updt_df_dict)
df.to_parquet("../datasets/datasets_reduced.parquet", index=False)

Creating shared images directory...
Shared images directory created with 26773 images


TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [4]:
import pandas as pd
# Reuse the in-memory frame when the kernel still has it; otherwise fall back
# to the reduced dataset saved on disk. Either way, show the per-label counts.
try:
    df
except NameError:
    df = pd.read_parquet('../datasets/datasets_reduced.parquet', engine='pyarrow')
display(df['Label'].value_counts())

Label
Bicycle          3000
Bus              3000
Car              3000
Hydrant          3000
Crosswalk        3000
Traffic Light    3000
Other            3000
Palm             2580
Bridge           1831
Stair             644
Chimney           389
Motorcycle        297
Mountain           32
Name: count, dtype: int64

In [7]:
from pathlib import Path
import os
import shutil
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Build one dataset tree per stratified fold:
#   ../yolo_dataset_fold{k}/labels.txt              tab-separated manifest
#   ../yolo_dataset_fold{k}/{train,val}/<label>/... symlinks into the shared
#                                                   images directory
shared_images_dir = Path("../yolo_dataset_shared/images")

N_FOLDS = 5
# Setup k-fold
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
labels = df['Label'].unique()

# Store results from all folds
all_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['Label'])):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    # Split data for this fold
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    print(f"Train: {len(train_df)}, Val: {len(val_df)}")
    print(f"Train label distribution:\n{train_df['Label'].value_counts()}")
    print(f"Val label distribution:\n{val_df['Label'].value_counts()}")

    # Create fold-specific dataset structure
    dataset_root = Path(f"../yolo_dataset_fold{fold}")
    labels_file = dataset_root / "labels.txt"
    dataset_root.mkdir(parents=True, exist_ok=True)

    # One record per image: training rows first, then validation rows.
    # The filename is prefixed with the source dataset name, matching the
    # names used inside the shared images directory.
    all_images = [
        {
            'filename': f"{source}_{filename}",
            'label': label,
            'split': split,
        }
        for split, split_df in (('train', train_df), ('val', val_df))
        for source, filename, label in zip(
            split_df['From'], split_df['Filename'], split_df['Label']
        )
    ]

    # Manifest with format: filename label split (tab-separated, with header).
    # The encoding is explicit so the file does not depend on the platform's
    # default code page.
    with open(labels_file, 'w', encoding='utf-8') as f:
        f.write("filename\tlabel\tsplit\n")
        f.writelines(
            f"{img_info['filename']}\t{img_info['label']}\t{img_info['split']}\n"
            for img_info in all_images
        )

    # train/val directories, each with one subdirectory per label
    train_dir = dataset_root / "train"
    val_dir = dataset_root / "val"
    split_dirs = {'train': train_dir, 'val': val_dir}

    for split_dir in split_dirs.values():
        # Start from a clean tree so links from a previous run cannot make
        # os.symlink() fail or leave stale entries behind.
        if split_dir.exists():
            shutil.rmtree(split_dir)
        for label in labels:
            (split_dir / label).mkdir(parents=True, exist_ok=True)

    # Create symlinks in train/val pointing to the shared images folder
    for img_info in all_images:
        src = (shared_images_dir / img_info['filename']).absolute()
        dst = split_dirs[img_info['split']] / img_info['label'] / img_info['filename']
        os.symlink(src, dst)

    print(f"Fold {fold + 1} dataset created!")
    print(f"  - Labels file: {labels_file}")


FOLD 1/5
Train: 21418, Val: 5355
Train label distribution:
Label
Bicycle          2400
Bus              2400
Car              2400
Hydrant          2400
Crosswalk        2400
Traffic Light    2400
Other            2400
Palm             2064
Bridge           1464
Stair             515
Chimney           312
Motorcycle        237
Mountain           26
Name: count, dtype: int64
Val label distribution:
Label
Bicycle          600
Bus              600
Car              600
Hydrant          600
Crosswalk        600
Traffic Light    600
Other            600
Palm             516
Bridge           367
Stair            129
Chimney           77
Motorcycle        60
Mountain           6
Name: count, dtype: int64
Fold 1 dataset created!
  - Labels file: ..\yolo_dataset_fold0\labels.txt

FOLD 2/5
Train: 21418, Val: 5355
Train label distribution:
Label
Bicycle          2400
Bus              2400
Car              2400
Hydrant          2400
Crosswalk        2400
Traffic Light    2400
Other            2400

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from ultralytics import YOLO

# Train one classifier per fold on the fold directories that already exist on
# disk (../yolo_dataset_fold{k}), then summarise the metrics across folds.
N_FOLDS = 5

classification_models = ["yolo11n-cls.pt", "yolo11s-cls.pt", "yolo11m-cls.pt", "yolo11l-cls.pt", "yolo11x-cls.pt"]
# Checkpoint used for every fold; change the index to train a larger variant.
model_name = classification_models[0]

# Store results from all folds
all_results = []

# The fold directories are already materialised, so only the fold number is
# needed here; there is no need to recompute the stratified split.
for fold in range(N_FOLDS):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    # Dataset tree for this fold (contains the train/ and val/ subfolders)
    dataset_root = Path(f"../yolo_dataset_fold{fold}")

    # Initialize a fresh model for each fold so no weights carry over
    model = YOLO(model_name)
    # NOTE(review): hard-codes a CUDA device; this fails on CPU-only machines.
    model.to('cuda')
    # Train on this fold
    results = model.train(
        data=str(dataset_root),
        epochs=2,
        imgsz=224,
        batch=10,
        patience=50,
        save=True,
        project="is_recaptchav2_safe_kfold",
        name=f"fold{fold}_yolo11",
        plots=True,
        val=True,
    )

    # Store results
    all_results.append({
        'fold': fold + 1,
        'results': results,
        'final_metrics': results.results_dict if hasattr(results, 'results_dict') else None
    })

    print(f"\nFold {fold + 1} completed!")

# Summary of all folds
print(f"\n{'='*50}")
print("K-FOLD CROSS-VALIDATION SUMMARY")
print(f"{'='*50}")

# One row per fold: the fold number plus every metric captured for it.
# Presumably `results_dict` maps metric names to numbers - confirm against
# the installed Ultralytics version. Folds without metrics contribute only
# their fold number.
metrics_df = pd.DataFrame([
    {'Fold': r['fold'], **(r['final_metrics'] or {})}
    for r in all_results
])

display(metrics_df)

# Calculate average performance
print("\nAverage Performance Across Folds:")
for col in metrics_df.columns:
    if col != 'Fold' and pd.api.types.is_numeric_dtype(metrics_df[col]):
        mean_val = metrics_df[col].mean()
        std_val = metrics_df[col].std()
        print(f"{col}: {mean_val:.4f} ± {std_val:.4f}")

print("\nAll folds completed! Check 'is_recaptchav2_safe_kfold/' for individual fold results.")


FOLD 1/5
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=10, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=..\yolo_dataset_fold0, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=2, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=224, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n-cls.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=fold0_yolo1110, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=50, perspective=0.0, plots=True, pose=12.0, pretrained=True, profile=False, project=is_recaptch