In [1]:

# @title ## STEP 1 : Dataset Parameters
# Form cell: every assignment below is a Colab form field. Later cells read
# these module-level names directly, so the variable names must not change.

# @markdown Name this new dataset:
dataset = '' # @param {type:'string'}

# @markdown Select the things you want your model to detect:
# @markdown Animals:
person = False # @param {type:'boolean'}
cat = False # @param {type:'boolean'}
dog = False # @param {type:'boolean'}
horse = False # @param {type:'boolean'}
cow = False # @param {type:'boolean'}
elephant = False # @param {type:'boolean'}
bear = False # @param {type:'boolean'}
zebra = False # @param {type:'boolean'}
giraffe = False # @param {type:'boolean'}
# @markdown Vehicles:
car = False # @param {type:'boolean'}
truck = False # @param {type:'boolean'}
bus = False # @param {type:'boolean'}
boat = False # @param {type:'boolean'}
motorcycle = False # @param {type:'boolean'}
airplane = False # @param {type:'boolean'}
train = False # @param {type:'boolean'}
bicycle = False # @param {type:'boolean'}
# @markdown Things:
potted_plant = False # @param {type:'boolean'}
toilet = False # @param {type:'boolean'}
skateboard = False # @param {type:'boolean'}
knife = False # @param {type:'boolean'}
umbrella = False # @param {type:'boolean'}

# @markdown Choose the maximum number of videos you want to download:
# @markdown (some may have been taken off of YouTube, etc..)
# The default of 0 is deliberately invalid: the validation step rejects
# non-positive values, forcing an explicit choice.
max_videos_to_download = 0 # @param {type:'integer'}

# @markdown Enter percentage of data to use for train set:
training_percent = 70 # @param {type: 'integer'}
# @markdown What to do with the rest?
# The default must be one of the dropdown options; downstream code compares
# this value against the literal 'Half val set, half test set'.
half_half = 'Use it all for val set' # @param ['Use it all for val set', 'Half val set, half test set'] {type: 'string'}

# @markdown Shuffle videos before downloading?
# @markdown (recommended unless you want a very unbalanced dataset)
shuffle = True # @param {type:'boolean'}

In [2]:
# @title ## STEP 2 : Model Selection
# @markdown Choose which size of YOLOv5 model to use:
# The training step translates this label into a YOLOv5 checkpoint name
# ('small' -> yolov5s, 'medium' -> yolov5m, 'large' -> yolov5l, 'xl' -> yolov5x)
# and falls back to yolov5s for any unrecognised value.
network_size = 'small' # @param ['small', 'medium', 'large', 'xl'] {type:'string'}

In [None]:
# @title ## STEP 3 : Environment Setup
# @markdown Choose your compute environment:
use_colab = True # @param {type:'boolean'}

import os
import subprocess

# Announce the mode the user asked for; it may be corrected below.
if use_colab:
    print("setting up google colab environment...")
else:
    print("setting up local environment...")

# Detect the real runtime: google.colab is only importable inside Colab.
# `drive` and `files` stay bound at module level for use further down.
try:
    from google.colab import drive, files
    in_colab = True
except ImportError:
    in_colab = False

# Reconcile the requested mode with the detected one. `use_colab` is what every
# later cell branches on, so it must describe the environment we are really in
# (otherwise later cells would e.g. try Colab uploads on a local machine).
if in_colab != use_colab:
    if in_colab:
        print("warning: in colab but use_colab=False, switching to colab mode")
    else:
        print("warning: not in colab environment but use_colab=True, switching to local mode")
    use_colab = in_colab
elif not in_colab:
    print("local environment detected")

if use_colab:
    # Mount Google Drive so the dataset lives on Drive rather than on the
    # Colab VM's local disk. This happens on every path that ends in colab
    # mode, including the "switched to colab mode" case above.
    drive.mount('/content/drive')
    print("google drive mounted successfully")

    # Set paths for Colab
    data_base_path = '/content/drive/MyDrive/yolov5_data'
    os.makedirs(data_base_path, exist_ok=True)
else:
    data_base_path = '../data'

print(f"data will be stored at: {data_base_path}")
print(f"environment: {'colab' if use_colab else 'local'}")

In [None]:
# @title ## STEP 4 : Validate Configuration
# Checks the form values from the earlier steps and prints either the first
# problem found or a summary of the configuration. Nothing is raised; the
# cell only reports.

# Collect selected classes: map each class name to the state of its checkbox
class_vars = {
    'person': person, 'cat': cat, 'dog': dog, 'horse': horse, 'cow': cow,
    'elephant': elephant, 'bear': bear, 'zebra': zebra, 'giraffe': giraffe,
    'car': car, 'truck': truck, 'bus': bus, 'boat': boat, 'motorcycle': motorcycle,
    'airplane': airplane, 'train': train, 'bicycle': bicycle, 'potted_plant': potted_plant,
    'toilet': toilet, 'skateboard': skateboard, 'knife': knife, 'umbrella': umbrella
}

# Names of the ticked classes, in the order listed above. This order becomes
# the class index order used when the dataset config is generated.
selected_classes = [name for name, selected in class_vars.items() if selected]

# Validation
if not dataset.strip():
    print("error: please provide a dataset name")
elif not selected_classes:
    print("error: please select at least one object class")
elif max_videos_to_download <= 0:
    print("error: please set max_videos_to_download to a positive number")
elif not 0 < training_percent <= 100:
    # Outside this range the val/test ratios derived from the remainder
    # would be negative, or the train set would be empty.
    print("error: please set training_percent to a value between 1 and 100")
else:
    print("configuration validated successfully")
    print(f"\ndataset: {dataset}")
    print(f"classes: {', '.join(selected_classes)} ({len(selected_classes)} total)")
    print(f"max videos: {max_videos_to_download}")
    print(f"training %: {training_percent}%")
    print(f"model size: {network_size}")
    print(f"environment: {'colab' if use_colab else 'local'}")
    print("\nready to proceed")

In [None]:
# @title ## STEP 5 : Create Dataset
# @markdown This modifies and runs your process-data.py script

import os
import subprocess

# YouTube-BB class IDs, keyed by the class names offered in the form.
# These must be the dataset's own IDs, not COCO category IDs: the default
# being replaced in process-data.py is `classes = ['dog']` with
# `class_remapping = {19: 0}`, and dog is 19 in YouTube-BB's numbering.
# YouTube-BB also defines 1 (bird) and 22 (none), which the form does not offer.
# NOTE(review): IDs taken from the published YouTube-BB class list - confirm
# against the class_id column of the annotation CSVs actually in use.
YTBB_CLASS_MAPPING = {
    'person': 0, 'bicycle': 2, 'boat': 3, 'bus': 4, 'bear': 5,
    'cow': 6, 'cat': 7, 'giraffe': 8, 'potted_plant': 9, 'horse': 10,
    'motorcycle': 11, 'knife': 12, 'airplane': 13, 'skateboard': 14,
    'train': 15, 'truck': 16, 'zebra': 17, 'toilet': 18, 'dog': 19,
    'elephant': 20, 'umbrella': 21, 'car': 23
}

def create_class_remapping(selected_classes):
    """Build the source-ID -> training-index table for the chosen classes.

    Args:
        selected_classes: class names, each of which must be a key of
            YTBB_CLASS_MAPPING.

    Returns:
        dict mapping each class's ID from YTBB_CLASS_MAPPING to its position
        in ``selected_classes`` (0-based).

    Raises:
        KeyError: if a name is not present in YTBB_CLASS_MAPPING.
    """
    # Resolve every name up front so an unknown class fails before any
    # mapping is built.
    source_ids = list(map(YTBB_CLASS_MAPPING.__getitem__, selected_classes))

    remapping = {}
    for position, source_id in enumerate(source_ids):
        remapping[source_id] = position
    return remapping

def create_yaml_config(dataset_name, selected_classes):
    """Write the YOLOv5 dataset description file for ``dataset_name``.

    Reads the module-level ``use_colab`` flag (and ``data_base_path`` when it
    is set) to decide where the processed data lives.

    Args:
        dataset_name: used both in the data path and in the output file name.
        selected_classes: list of class names; its length becomes ``nc`` and
            its printed form becomes ``names``.

    Returns:
        The path of the file that was written,
        ``yolov5/dataset-<dataset_name>.yaml``.
    """
    # Colab data sits under data_base_path; otherwise a fixed relative
    # location is used.
    data_path = (
        f"{data_base_path}/processed/{dataset_name}/data/"
        if use_colab
        else f"../../data/processed/{dataset_name}/data/"
    )

    # One entry per output line; the file ends with a trailing newline.
    config_lines = [
        "# -- written by cwa --",
        "",
        "#relative paths (from train.py)",
        f"path: {data_path}",
        "train: images/train",
        "val: images/val",
        "test: images/test",
        "",
        "# # of classes",
        f"nc: {len(selected_classes)}",
        "",
        "# class names",
        f"names: {selected_classes}",
    ]

    yaml_path = f"yolov5/dataset-{dataset_name}.yaml"
    with open(yaml_path, 'w') as config_file:
        config_file.write("\n".join(config_lines) + "\n")
    return yaml_path

def modify_process_script():
    """Generate ``process_data_notebook.py`` from ``src/utility/process-data.py``.

    The upstream script is configured by literal assignments in its source.
    This function substitutes those literals with the values chosen in the
    notebook (read from the module-level form variables ``dataset``,
    ``selected_classes``, ``max_videos_to_download``, ``training_percent``,
    ``half_half``, ``shuffle``, ``use_colab`` and ``data_base_path``) and
    writes the result to the current working directory.

    A warning is printed for every expected literal that is not present in
    the upstream script, because in that case the generated script would
    silently keep the upstream default for that setting.

    Returns:
        None.
    """
    # Read original script
    with open('src/utility/process-data.py', 'r') as f:
        script = f.read()

    # Calculate ratios. The remainder is computed in whole percent first so the
    # generated values are e.g. 0.3 rather than 0.30000000000000004.
    remaining_percent = 100 - training_percent
    train_ratio = training_percent / 100.0
    if half_half == 'Half val set, half test set':
        val_ratio = remaining_percent / 200.0
        test_ratio = remaining_percent / 200.0
    else:
        val_ratio = remaining_percent / 100.0
        test_ratio = 0.0

    class_remapping = create_class_remapping(selected_classes)

    # Adjust data paths for environment
    if use_colab:
        raw_data_path = f"{data_base_path}/raw/"
        processed_data_path = f"{data_base_path}/processed/"
    else:
        raw_data_path = "../../data/raw/"
        processed_data_path = "../../data/processed/"

    # Replace parameters: upstream literal -> notebook value.
    # repr() keeps the dataset name a valid string literal even if it
    # contains a quote or a backslash.
    replacements = {
        "dataset = 'extra'": f"dataset = {dataset!r}",
        "classes = ['dog']": f"classes = {selected_classes}",
        "max_videos_to_download = 50": f"max_videos_to_download = {max_videos_to_download}",
        "class_remapping = {19: 0}": f"class_remapping = {class_remapping}",
        "train_ratio = 0.68": f"train_ratio = {train_ratio}",
        "val_ratio = 0.16": f"val_ratio = {val_ratio}",
        "test_ratio = 0.16": f"test_ratio = {test_ratio}",
        "shuffle = True": f"shuffle = {shuffle}",
        "'../../data/raw/'": f"'{raw_data_path}'",
        "'../../data/processed/'": f"'{processed_data_path}'"
    }

    not_found = []
    for old, new in replacements.items():
        if old not in script:
            not_found.append(old)
            continue
        script = script.replace(old, new)

    # Surface settings that could not be applied instead of failing silently
    for old in not_found:
        print(f"warning: setting not found in process-data.py, left unchanged: {old}")

    # Save modified script
    with open('process_data_notebook.py', 'w') as f:
        f.write(script)

# Emit the YOLOv5 dataset description for the chosen classes
yaml_path = create_yaml_config(dataset, selected_classes)
print("created yaml config:", yaml_path)

# Generate the parameterised copy of the processing script
modify_process_script()
print("created modified processing script: process_data_notebook.py")

print("\nready to run data processing")

In [None]:
# @title ## STEP 6 : Run Data Processing
# @markdown this will take several hours

import os
import subprocess
import sys

start_processing = False # @param {type:'boolean'}

if start_processing:
    if not os.path.exists('process_data_notebook.py'):
        # The script is generated by the dataset-creation step
        print("error: process_data_notebook.py not found, run STEP 5 first")
    else:
        print("starting data processing...")
        print("this will download videos, extract frames, and create annotations")
        print("progress will be shown below\n")

        # Make sure data directories exist
        raw_dir = f"{data_base_path}/raw"
        processed_dir = f"{data_base_path}/processed"
        os.makedirs(raw_dir, exist_ok=True)
        os.makedirs(processed_dir, exist_ok=True)

        try:
            # sys.executable is the interpreter running this kernel, so the
            # script sees the same installed packages as the notebook; a bare
            # 'python' may resolve to a different interpreter or be missing.
            # Output is not captured, so the child's progress streams through.
            result = subprocess.run([sys.executable, 'process_data_notebook.py'])
            if result.returncode == 0:
                print("\ndata processing completed successfully")
            else:
                print(f"\ndata processing failed with return code: {result.returncode}")
        except OSError as e:
            # Raised when the child process cannot be launched at all
            print(f"error: {e}")
else:
    print("set 'start_processing' to True to begin data processing")
    raw_path = f"{data_base_path}/raw" if use_colab else "../../data/raw/"
    print(f"make sure you have youtube-bb csv files in {raw_path}")

In [None]:
# @title ## STEP 7 : Train Model

import subprocess
import sys

# @markdown training parameters:
epochs = 100 # @param {type:'integer'}
batch_size = 16 # @param [8, 16, 32, 64]
start_training = False # @param {type:'boolean'}

if start_training:
    # Get model name; unknown sizes fall back to the small model
    model_map = {'small': 'yolov5s', 'medium': 'yolov5m', 'large': 'yolov5l', 'xl': 'yolov5x'}
    model_name = model_map.get(network_size, 'yolov5s')

    print(f"starting training with {model_name}...")

    # Training command. sys.executable is the interpreter running this kernel,
    # so train.py sees the same installed packages as the notebook; a bare
    # 'python' may resolve to a different interpreter or be missing.
    cmd = [
        sys.executable, 'train.py',
        '--data', f'dataset-{dataset}.yaml',
        '--weights', f'{model_name}.pt',
        '--epochs', str(epochs),
        '--batch-size', str(batch_size),
        '--name', f'{dataset}_training'
    ]

    try:
        # Run inside the yolov5 directory via cwd= rather than os.chdir, so the
        # notebook's own working directory is never changed.
        subprocess.run(cmd, check=True, cwd='yolov5')
        print("\ntraining completed")
    except subprocess.CalledProcessError as e:
        print(f"training failed: {e}")
else:
    print("set 'start_training' to True to begin training")
    print(f"will train {network_size} model for {epochs} epochs with batch size {batch_size}")

In [None]:
# @title ## STEP 8 : Visual Inspection
# @markdown use your existing visual-check.py utility

run_visual_check = False # @param {type:'boolean'}
subset = "train" # @param ["train", "val", "test"]

if run_visual_check:
    # Modify and run your visual-check.py
    with open('src/utility/visual-check.py', 'r') as f:
        script = f.read()

    # Settings to substitute: (literal in the utility, notebook value)
    substitutions = [
        ("dataset = 'ytbb_cat'", f"dataset = '{dataset}'"),
        ("subset = 'train'", f"subset = '{subset}'"),
    ]

    # Adjust paths for environment
    if use_colab:
        substitutions.append(
            ("'../../data/processed/'", f"'{data_base_path}/processed/'")
        )

    for old, new in substitutions:
        if old not in script:
            # Without this the utility would silently run with its own default
            print(f"warning: setting not found in visual-check.py, left unchanged: {old}")
        script = script.replace(old, new)

    # Execute.
    # NOTE(review): exec runs the utility's source with the kernel's full
    # privileges - only use this with a visual-check.py you trust.
    # It runs in its own namespace so the utility's top-level names cannot
    # overwrite notebook variables; compile() attaches the real file name so
    # tracebacks point at the utility.
    exec(compile(script, 'src/utility/visual-check.py', 'exec'), {'__name__': '__main__'})
else:
    print("set 'run_visual_check' to True to inspect your dataset")
    print("this will show a random image with bounding box overlays")

In [None]:
# @title ## STEP 9 : Test Your Model
# @markdown upload an image or video to test your trained model

import glob
import os
import subprocess
import sys

test_type = "image" # @param ["image", "video"]
run_detection = False # @param {type:'boolean'}

if run_detection:
    model_path = f"yolov5/runs/train/{dataset}_training/weights/best.pt"

    if os.path.exists(model_path):
        print(f"running detection on {test_type}...")

        # Upload file based on environment
        if use_colab:
            from google.colab import files
            print(f"please upload your {test_type} file:")
            uploaded = files.upload()

            if uploaded:
                # Only the first uploaded file is used
                filename = list(uploaded.keys())[0]
                print(f"uploaded: {filename}")
            else:
                print("no file uploaded")
                filename = None
        else:
            # For local, user picks a file in a native dialog. tkinter is
            # imported lazily because it is only needed on this branch.
            import tkinter as tk
            from tkinter import filedialog

            if test_type == "image":
                filetypes = [("Image files", "*.jpg *.jpeg *.png *.bmp")]
            else:
                filetypes = [("Video files", "*.mp4 *.avi *.mov *.mkv")]

            root = tk.Tk()
            root.withdraw()  # hide the empty root window, show only the dialog
            try:
                filename = filedialog.askopenfilename(filetypes=filetypes)
            finally:
                # Release the hidden root window so it does not linger
                root.destroy()

            if filename:
                print(f"selected: {filename}")
            else:
                print("no file selected")

        if filename:
            # Resolve the source against the notebook's working directory.
            # The file dialog returns an absolute path, so prefixing '../'
            # (as needed for a bare uploaded file name) would corrupt it;
            # an absolute path is correct in both cases.
            source_path = os.path.abspath(filename)

            # Detection command. sys.executable is the interpreter running
            # this kernel, so detect.py sees the notebook's packages.
            output_name = f'{dataset}_detection_{test_type}'
            cmd = [
                sys.executable, 'detect.py',
                '--weights', f'runs/train/{dataset}_training/weights/best.pt',
                '--source', source_path,
                '--name', output_name,
                '--save-txt',
                '--save-conf'
            ]

            try:
                # Run inside the yolov5 directory via cwd= rather than
                # os.chdir, so the notebook's working directory is unchanged.
                subprocess.run(cmd, check=True, cwd='yolov5')
                # Reported relative to the notebook's working directory
                output_dir = f"yolov5/runs/detect/{output_name}"
                print(f"\ndetection complete! check {output_dir}/")

                # For Colab, provide download link
                if use_colab:
                    result_files = glob.glob(f"{output_dir}/*")
                    if result_files:
                        print("\ndownloading result files...")
                        for file_path in result_files:
                            # Skip sub-directories such as the labels folder
                            if os.path.isfile(file_path):
                                files.download(file_path)

            except subprocess.CalledProcessError as e:
                print(f"detection failed: {e}")

    else:
        print(f"model not found: {model_path}")
        print("please complete training first")
else:
    print("set 'run_detection' to True to test your model")
    print(f"will run inference on uploaded {test_type}")
    print(f"model location: yolov5/runs/train/{dataset}_training/weights/best.pt")