In [None]:
import json
import os
import shutil
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split


# --- Dataset locations -------------------------------------------------------
# Raw strings keep the Windows backslashes literal.
SOURCE_IMAGES_DIR = r'd:\bdd100k\bdd100k_dataset\bdd100k_images\train'
SOURCE_LABELS_DIR = r'd:\bdd100k\bdd100k_dataset\bdd100k_labels\train'
DEST_DIR = r'd:\bdd100k\yolov11_dataset'


# --- Label category -> YOLO class id -----------------------------------------
# Categories that are not listed here are ignored during label conversion.
CLASS_MAPPING = dict(
    car=0,
    person=1,
    truck=2,
    motor=3,
    rider=4,
    bus=5,
    train=6,
)


# --- Image size in pixels, used to clip and normalise box coordinates --------
# NOTE(review): assumes every source image is 1280x720 -- confirm.
IMG_WIDTH, IMG_HEIGHT = 1280, 720

## 1. Load and Filter Data
We load the JSON labels to extract attributes (Weather, Time of Day, Scene) for sampling.

In [2]:
def load_metadata(label_dir):
    """
    Load per-image attributes from the JSON label files for sampling.

    Args:
        label_dir: Directory containing the ``.json`` label files.

    Returns:
        A DataFrame with one row per readable label file and the columns
        ``file_name`` (JSON filename), ``image_name``, ``weather``,
        ``timeofday`` and ``scene``. Attributes missing from a file are
        reported as ``'unknown'``. The columns exist even when no file
        could be read, so downstream column access does not fail.

    Files that cannot be opened or parsed are reported with ``print`` and
    skipped; they do not abort the scan.
    """
    columns = ['file_name', 'image_name', 'weather', 'timeofday', 'scene']
    metadata = []

    # os.listdir() returns entries in arbitrary order. Sorting fixes the row
    # order, so any seeded sample/split drawn from this frame is reproducible
    # across machines and runs.
    json_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.json'))

    print(f"Scanning {len(json_files)} files in {label_dir}...")

    for filename in tqdm(json_files):
        filepath = os.path.join(label_dir, filename)
        try:
            # Decode explicitly as UTF-8 rather than the platform locale encoding.
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {filename}: {e}")
            continue

        # BDD100K json structure check
        if not isinstance(data, dict):
            print(f"Error reading {filename}: expected a JSON object, got {type(data).__name__}")
            continue

        attrs = data.get('attributes')
        if not isinstance(attrs, dict):
            # Missing or null attributes: every field falls back to 'unknown'.
            attrs = {}

        # Handle image name: fall back to the JSON file's stem and ensure
        # the result has a .jpg extension.
        image_name = data.get('name')
        if not isinstance(image_name, str) or not image_name:
            image_name = os.path.splitext(filename)[0]
        if not image_name.endswith('.jpg'):
            image_name += '.jpg'

        metadata.append({
            'file_name': filename,  # JSON filename
            'image_name': image_name,
            'weather': attrs.get('weather', 'unknown'),
            'timeofday': attrs.get('timeofday', 'unknown'),
            'scene': attrs.get('scene', 'unknown')
        })

    return pd.DataFrame(metadata, columns=columns)

# Build the metadata table from the training labels, then report its size
# and preview the first five rows.
df_metadata = load_metadata(label_dir=SOURCE_LABELS_DIR)
print(f"Loaded metadata for {len(df_metadata)} images.")
print(df_metadata.head(5))

Scanning 70000 files in d:\bdd100k\bdd100k_dataset\bdd100k_labels\train...


100%|██████████| 70000/70000 [13:45<00:00, 84.83it/s] 


Loaded metadata for 70000 images.
                file_name             image_name weather  timeofday  \
0  0000f77c-6257be58.json  0000f77c-6257be58.jpg   clear    daytime   
1  0000f77c-62c2a288.json  0000f77c-62c2a288.jpg   clear  dawn/dusk   
2  0000f77c-cb820c98.json  0000f77c-cb820c98.jpg   clear  dawn/dusk   
3  0001542f-5ce3cf52.json  0001542f-5ce3cf52.jpg   clear      night   
4  0001542f-7c670be8.json  0001542f-7c670be8.jpg   clear      night   

         scene  
0  city street  
1      highway  
2  residential  
3  city street  
4      highway  


## 2. Data Sampling
1.  Filter for weather == 'clear'.
2.  Stratify by timeofday:
    *   Daytime: ~2,500 (50%)
    *   Night: ~2,000 (40%)
    *   Dawn/Dusk: ~500 (10%)
3.  Total: 5,000 images.

In [None]:
# 1. Filter for Clear Weather
df_clear = df_metadata[df_metadata['weather'] == 'clear'].copy()
print(f"Total 'clear' weather images: {len(df_clear)}")


# Targets (images per time-of-day stratum)
TARGET_DAY = 2500
TARGET_NIGHT = 2000
TARGET_DAWN_DUSK = 500
# Derived from the strata so the total cannot drift from the per-stratum targets.
TARGET_TOTAL = TARGET_DAY + TARGET_NIGHT + TARGET_DAWN_DUSK

# Separate by time
df_day = df_clear[df_clear['timeofday'] == 'daytime']
df_night = df_clear[df_clear['timeofday'] == 'night']
df_dawn_dusk = df_clear[df_clear['timeofday'] == 'dawn/dusk']

# Check availability
print(f"Available Daytime: {len(df_day)}")
print(f"Available Night: {len(df_night)}")
print(f"Available Dawn/Dusk: {len(df_dawn_dusk)}")

# Sample
# Use min() to avoid error if not enough data; shortfalls are reported below.
sample_day = df_day.sample(n=min(len(df_day), TARGET_DAY), random_state=42)
sample_night = df_night.sample(n=min(len(df_night), TARGET_NIGHT), random_state=42)
sample_dawn_dusk = df_dawn_dusk.sample(n=min(len(df_dawn_dusk), TARGET_DAWN_DUSK), random_state=42)

# Combine (day, night, dawn/dusk order is kept: the seeded split that follows
# depends on row order)
df_sampled = pd.concat([sample_day, sample_night, sample_dawn_dusk])
print(f"Selected {len(df_sampled)} images for the dataset.")
print(df_sampled['timeofday'].value_counts())

# min() silently shrinks a stratum when too few images are available, so make
# any shortfall visible instead of quietly producing a smaller dataset.
for stratum_name, stratum_sample, stratum_target in (
    ('daytime', sample_day, TARGET_DAY),
    ('night', sample_night, TARGET_NIGHT),
    ('dawn/dusk', sample_dawn_dusk, TARGET_DAWN_DUSK),
):
    if len(stratum_sample) < stratum_target:
        print(f"Warning: only {len(stratum_sample)} of {stratum_target} '{stratum_name}' images available.")
if len(df_sampled) != TARGET_TOTAL:
    print(f"Warning: selected {len(df_sampled)} images, expected {TARGET_TOTAL}.")

Total 'clear' weather images: 37411
Available Daytime: 12477
Available Night: 22928
Available Dawn/Dusk: 2004
Selected 5000 images for the dataset.
timeofday
daytime      2500
night        2000
dawn/dusk     500
Name: count, dtype: int64


## 3. Split Dataset
Split the 5,000 images into:
*   Train: 70%
*   Test: 20%
*   Validation: 10%

In [4]:
# Stage 1: hold out 30% of the sample; the remaining 70% is the training set.
# Both stages stratify on 'timeofday' and use the same seed.
train_df, temp_df = train_test_split(
    df_sampled,
    test_size=0.3,
    random_state=42,
    stratify=df_sampled['timeofday'],
)

# Stage 2: divide the held-out 30% into test (2/3 of it, i.e. 20% of the
# total) and validation (1/3 of it, i.e. 10% of the total).
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    random_state=42,
    stratify=temp_df['timeofday'],
)

# Report the resulting split sizes
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Val set size: {len(val_df)}")

Train set size: 3500
Test set size: 1000
Val set size: 500


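## 4. Create Directory Structure and Process Data
For each split, copy the selected images into `<split>/images` and write the converted YOLO labels into `<split>/labels`.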


In [5]:
def create_yolo_label(json_path, class_mapping, img_width, img_height):
    """
    Read a BDD100K JSON label file and convert it to YOLO-format lines.

    Only objects whose ``category`` is a key of ``class_mapping`` and that
    carry a ``box2d`` are converted. Box corners are clipped to the image
    and boxes with no area after clipping are dropped. A malformed object
    is reported and skipped on its own; it does not discard the labels of
    the other objects in the file.

    Args:
        json_path: Path to the label JSON file.
        class_mapping: Dict mapping category name -> integer class id.
        img_width: Image width in pixels, used for clipping and normalising.
        img_height: Image height in pixels, used for clipping and normalising.

    Returns:
        A list of strings ``"<class_id> <cx> <cy> <w> <h>"`` where the four
        box values are divided by the image size and written with six
        decimals. The list is empty when the file cannot be read, has no
        frames, or contains no usable object.
    """
    try:
        # Decode explicitly as UTF-8 rather than the platform locale encoding.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        frames = data.get('frames', [])
        if not frames:
            return []

        # Only the first frame is read (single-image label files).
        objects = frames[0].get('objects')
    except Exception as e:
        # File-level problem (unreadable, invalid JSON, unexpected layout).
        print(f"Error processing label {json_path}: {e}")
        return []

    if not isinstance(objects, list):
        return []

    yolo_lines = []
    for obj in objects:
        try:
            category = obj.get('category')
            if category not in class_mapping:
                continue
            class_id = class_mapping[category]

            box2d = obj.get('box2d')
            if not box2d:
                # Objects without a 2D box (e.g. other annotation types) are ignored.
                continue

            # Clip the corners so the box stays within the image bounds.
            x1 = max(0, min(float(box2d['x1']), img_width))
            x2 = max(0, min(float(box2d['x2']), img_width))
            y1 = max(0, min(float(box2d['y1']), img_height))
            y2 = max(0, min(float(box2d['y2']), img_height))

            bw = x2 - x1
            bh = y2 - y1
            if bw <= 0 or bh <= 0:
                # Degenerate (or inverted) box after clipping.
                continue

            # YOLO format: normalised centre x/y followed by width/height.
            nx = (x1 + x2) / 2.0 / img_width
            ny = (y1 + y2) / 2.0 / img_height
            nw = bw / img_width
            nh = bh / img_height
        except (AttributeError, KeyError, TypeError, ValueError, ZeroDivisionError) as e:
            # Skip just this object; labels already collected are kept.
            print(f"Skipping malformed object in {json_path}: {e}")
            continue

        yolo_lines.append(f"{class_id} {nx:.6f} {ny:.6f} {nw:.6f} {nh:.6f}")

    return yolo_lines

def process_dataset(df, split_name):
    """
    Copy the images of one split and write a YOLO label file for each.

    The output layout under ``DEST_DIR`` is::

        <split_name>/
        ├── images/
        └── labels/

    Args:
        df: DataFrame with the columns ``file_name`` (JSON label filename)
            and ``image_name`` (image filename), one row per image.
        split_name: Name of the split sub-directory, e.g. 'train', 'valid'
            or 'test'.

    Rows whose image cannot be found, whose label cannot be written, or
    whose image cannot be copied are reported with ``print`` and skipped.
    """
    print(f"Processing {split_name} set ({len(df)} images)...")

    split_dir = os.path.join(DEST_DIR, split_name)
    img_dest_dir = os.path.join(split_dir, 'images')
    lbl_dest_dir = os.path.join(split_dir, 'labels')

    os.makedirs(img_dest_dir, exist_ok=True)
    os.makedirs(lbl_dest_dir, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=len(df)):
        json_filename = row['file_name']
        image_filename = row['image_name']

        # Source paths
        json_path = os.path.join(SOURCE_LABELS_DIR, json_filename)
        img_path = os.path.join(SOURCE_IMAGES_DIR, image_filename)

        # Check if image exists
        if not os.path.exists(img_path):
            # Try to find the image if the extension is different or missing
            if not image_filename.endswith('.jpg'):
                image_filename = image_filename.replace('.json', '.jpg')
                img_path = os.path.join(SOURCE_IMAGES_DIR, image_filename)

            if not os.path.exists(img_path):
                print(f"Warning: Image not found {img_path}")
                continue

        # Generate YOLO label content
        yolo_lines = create_yolo_label(json_path, CLASS_MAPPING, IMG_WIDTH, IMG_HEIGHT)

        # Destination paths. The label name is the image stem plus '.txt';
        # splitext swaps only the final extension, whatever it is, instead of
        # rewriting every '.jpg' substring in the name.
        dest_img_path = os.path.join(img_dest_dir, image_filename)
        label_filename = os.path.splitext(image_filename)[0] + '.txt'
        dest_lbl_path = os.path.join(lbl_dest_dir, label_filename)

        # Write the label first, so a failed label write can no longer leave
        # an image in the split without its label file.
        try:
            with open(dest_lbl_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(yolo_lines))
        except Exception as e:
            print(f"Failed to write label {dest_lbl_path}: {e}")
            continue

        # Copy Image
        try:
            shutil.copy2(img_path, dest_img_path)
        except Exception as e:
            print(f"Failed to copy image {img_path}: {e}")
            # Best effort: do not keep a label whose image is missing.
            try:
                os.remove(dest_lbl_path)
            except OSError:
                pass
            continue

# Convert each split in turn: train, then validation, then test.
# Note: the validation split is written to a folder named 'valid', as requested by the user.
for split_frame, split_folder in (
    (train_df, 'train'),
    (val_df, 'valid'),
    (test_df, 'test'),
):
    process_dataset(split_frame, split_folder)

print("\nDataset preparation complete!")

Processing train set (3500 images)...


100%|██████████| 3500/3500 [00:49<00:00, 71.27it/s]


Processing valid set (500 images)...


100%|██████████| 500/500 [00:07<00:00, 70.53it/s]


Processing test set (1000 images)...


100%|██████████| 1000/1000 [00:13<00:00, 72.19it/s]


Dataset preparation complete!





## 5. Create data.yaml
Create the `data.yaml` file required by YOLOv11 training.

In [None]:
# Class names ordered by class id. They are derived from CLASS_MAPPING so that
# `nc`, `names` and the ids written into the label files cannot drift apart.
if sorted(CLASS_MAPPING.values()) != list(range(len(CLASS_MAPPING))):
    raise ValueError("CLASS_MAPPING ids must be exactly 0..N-1 to be listed as `names`")
class_names = [name for name, _ in sorted(CLASS_MAPPING.items(), key=lambda item: item[1])]

yaml_content = f"""
path: {DEST_DIR} # dataset root dir
train: train/images # train images (relative to 'path')
val: valid/images # val images (relative to 'path')
test: test/images # test images (optional)

nc: {len(class_names)} # number of classes
names: {class_names} # class names
"""

# Make sure the dataset root exists so this cell also works when run on its own.
os.makedirs(DEST_DIR, exist_ok=True)
yaml_path = os.path.join(DEST_DIR, 'data.yaml')
with open(yaml_path, 'w', encoding='utf-8') as f:
    f.write(yaml_content)

print(f"Created data.yaml at {yaml_path}")
print(yaml_content)

Created data.yaml at d:\bdd100k\yolov11_dataset\data.yaml

path: d:\bdd100k\yolov11_dataset # dataset root dir
train: train/images # train images (relative to 'path')
val: valid/images # val images (relative to 'path')
test: test/images # test images (optional)

nc: 7 # number of classes
names: ['car', 'person', 'truck', 'motor', 'rider', 'bus', 'train'] # class names

