In [6]:
import os
import json

import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset

import tqdm


In [7]:
# This is from the "eval.py" file.
def set_seed(seed: int):
    """Seed torch's random number generators with ``seed``.

    Calls ``torch.manual_seed`` followed by ``torch.cuda.manual_seed``.

    Args:
        seed: Integer seed forwarded to both seeding functions.
    """
    # NOTE: torch.cuda.manual_seed only sets the seed on the current cuda device
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    
def load_original_model_and_inputs(
    model_original_src: str, context: dict
) -> tuple[nn.Module, callable, callable]:
    """
    Execute reference PyTorch source code and pull out its entry points.

    The source text is first syntax-checked with ``compile`` and then run with
    ``exec`` using ``context`` as its namespace, so every name the source
    defines ends up in ``context``.

    Args:
        model_original_src: Python source text; expected to define ``Model``,
            ``get_init_inputs`` and ``get_inputs``.
        context: Dict used as the namespace for ``exec``; mutated in place.

    Returns:
        ``(Model, get_init_inputs, get_inputs)`` as looked up in ``context``
        (an entry is ``None`` when the source did not define that name), or
        ``None`` when the source has a syntax error or raises while executing.
    """
    # Step 1: reject source that does not even parse.
    try:
        compile(model_original_src, "<string>", "exec")
    except SyntaxError as syntax_err:
        print(f"Syntax Error in original code {syntax_err}")
        return None

    # Step 2: run the source so its definitions land in `context`.
    # NOTE(review): exec runs arbitrary code; only feed it trusted sources.
    try:
        exec(model_original_src, context)
    except Exception as exec_err:
        print(f"Error in executing original code {exec_err}")
        return None

    # Step 3: collect the three names the reference code is expected to define.
    wanted_names = ("Model", "get_init_inputs", "get_inputs")
    return tuple(context.get(name) for name in wanted_names)

def time_execution_with_cuda_event(
    kernel_fn: callable,
    *args,
    num_warmup: int = 3,
    num_trials: int = 10,
    verbose: bool = True,
    device: torch.device = None,
) -> list[float]:
    """
    Time a CUDA kernel function over multiple trials using torch.cuda.Event

    Args:
        kernel_fn: Function to time
        *args: Arguments to pass to kernel_fn
        num_warmup: Number of untimed warm-up calls made before the trials
        num_trials: Number of timing trials to run
        verbose: Whether to print per-trial timing info
        device: CUDA device to use, if None, use current device

    Returns:
        List of elapsed times in milliseconds, one entry per trial
    """
    if device is None:
        if verbose:
            print(f"Using current device: {torch.cuda.current_device()}")
        device = torch.cuda.current_device()

    # Warm ups (untimed); synchronize after each call so it has finished
    for _ in range(num_warmup):
        kernel_fn(*args)
        torch.cuda.synchronize(device=device)

    print(
        f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}"
    )
    elapsed_times = []

    # Actual trials
    for trial in range(num_trials):
        # create event marker default is not interprocess
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Record on the stream of the *target* device. A bare `.record()` uses
        # the current device's stream, which is the wrong one whenever
        # `device` is not the current device.
        stream = torch.cuda.current_stream(device)
        start_event.record(stream)
        kernel_fn(*args)
        end_event.record(stream)

        # Synchronize to ensure the events have completed
        torch.cuda.synchronize(device=device)

        # Calculate the elapsed time in milliseconds
        elapsed_time_ms = start_event.elapsed_time(end_event)
        if verbose:
            print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms")
        elapsed_times.append(elapsed_time_ms)

    return elapsed_times

def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict:
    """Get timing statistics from a list of elapsed times.

    Args:
        elapsed_times: Non-empty list of elapsed times in milliseconds
        device: CUDA device, record device info (may be an integer index,
            including 0)
    Returns:
        Dict containing mean, std, min, max and num_trials
        all timing are in ms, rounded to 3 significant digits.
        When device is given, also "hardware" (device name) and "device".
    Raises:
        ValueError: if elapsed_times is empty
    """
    if len(elapsed_times) == 0:
        # np.min/np.max would raise ValueError anyway (after a NaN warning
        # from np.mean); fail early with a clearer message instead.
        raise ValueError("elapsed_times must contain at least one measurement")

    def _sig3(value) -> float:
        # Round to 3 significant digits via string formatting.
        return float(f"{value:.3g}")

    stats = {
        "mean": _sig3(np.mean(elapsed_times)),
        "std": _sig3(np.std(elapsed_times)),
        "min": _sig3(np.min(elapsed_times)),
        "max": _sig3(np.max(elapsed_times)),
        "num_trials": len(elapsed_times),
    }

    # Compare against None explicitly: an integer device index of 0 is falsy
    # but is still a valid device.
    if device is not None:
        stats["hardware"] = torch.cuda.get_device_name(device=device)
        stats["device"] = str(device)  # for debugging

    return stats

In [8]:
# From the original file "generate_baseline_time.py"
def measure_program_time(
        ref_arch_name: str,
        ref_arch_src: str, 
        num_trials: int = 100,
        use_torch_compile: bool = False,
        torch_compile_backend: str="inductor", 
        torch_compile_options: str="default",
        device: torch.device="cuda:0",
        verbose: bool = False,
) -> dict:
    """
    Measure the time of a KernelBench reference architecture

    Args:
        ref_arch_name: Name of the reference architecture (used in log output)
        ref_arch_src: Python source defining Model, get_init_inputs, get_inputs
        num_trials: Number of timed trials
        use_torch_compile: If True, wrap the model with torch.compile
        torch_compile_backend: Passed to torch.compile as `backend`
        torch_compile_options: Passed to torch.compile as `mode`
        device: CUDA device to run on
        verbose: Whether to print per-trial timings and the final stats

    Returns:
        Dict of timing statistics from get_timing_stats, or None when the
        reference source could not be loaded or the measurement raised.
    """
    context = {}
    loaded = load_original_model_and_inputs(ref_arch_src, context)
    # The loader returns None on syntax/execution errors and None entries for
    # names the source did not define; skip the problem instead of crashing
    # on tuple unpacking.
    if loaded is None or any(entry is None for entry in loaded):
        print(f"[Eval] Could not load Model/get_init_inputs/get_inputs for {ref_arch_name}")
        return None
    Model, get_init_inputs, get_inputs = loaded

    try:
        with torch.no_grad():
            torch.cuda.synchronize(device=device)
            # Re-seed before each generator call so inputs and init inputs
            # are each produced from the same seed.
            set_seed(42)
            inputs = get_inputs()
            set_seed(42)
            init_inputs = get_init_inputs()
            # Move tensor arguments to the target device; leave others as-is
            inputs = [
                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
                for x in inputs
            ]
            init_inputs = [
                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
                for x in init_inputs
            ]

            # Initialize PyTorch model, use this for eager mode execution
            model = Model(*init_inputs)
            
            if use_torch_compile:
                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
                model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
            else:
                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
            
            model = model.cuda(device=device)
            torch.cuda.synchronize(device=device)
            elapsed_times = time_execution_with_cuda_event(
                model, *inputs, num_trials=num_trials, verbose=verbose, device=device
            )
            runtime_stats = get_timing_stats(elapsed_times, device=device)

            if verbose:
                print(f"{ref_arch_name} {runtime_stats}")
            
            return runtime_stats
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with
        # the next problem.
        print(f"[Eval] Error in Measuring Performance: {e}")
        return None

In [None]:
# --- Configuration ---
TIMING_DIR = "./timings"
ds = load_dataset("ai-nikolai/KernelBench")

# Output file name and number of timed trials per problem.
file_name: str = "baseline_time.json"
num_trials: int = 100

# torch.compile settings; backend/options only matter when use_torch_compile is True.
use_torch_compile: bool = False
torch_compile_backend: str = "inductor"
torch_compile_options: str = "default"

device = torch.device("cuda:0")
json_results = {}

# --- Time every problem of the selected level ---
level = 1
level_key = f"level_{level}"
level_1 = ds[level_key]
num_problems = len(level_1)

# Results are keyed as json_results[level_key][problem name] -> timing stats.
json_results[level_key] = {}

for sample in tqdm.tqdm(level_1):
    ref_arch_src = sample["code"]
    ref_arch_name = sample["name"]
    runtime_stats = measure_program_time(
        ref_arch_name=ref_arch_name,
        ref_arch_src=ref_arch_src,
        num_trials=num_trials,
        use_torch_compile=use_torch_compile,
        torch_compile_backend=torch_compile_backend,
        torch_compile_options=torch_compile_options,
        device=device,
        verbose=False,  # do not print
    )
    json_results[level_key][ref_arch_name] = runtime_stats

# --- Write all collected results to TIMING_DIR/file_name ---
save_path = os.path.join(TIMING_DIR, file_name)
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w") as f:
    json.dump(json_results, f)


  0%|          | 0/100 [00:00<?, ?it/s]

Using PyTorch Eager Execution on 100_HingeLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  0%|          | 0/100 [00:13<?, ?it/s]


KeyError: 'level1'

In [None]:
# ds["level_1"]
# ds.push_to_hub("KernelBench")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 24.96ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.14s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 483.33ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.15 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 333.04ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.16s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 518.90ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.20 shards/s]


CommitInfo(commit_url='https://huggingface.co/datasets/ai-nikolai/KernelBench/commit/d55e90db696ff59275c3754da6d68471b106ab13', commit_message='Upload dataset', commit_description='', oid='d55e90db696ff59275c3754da6d68471b106ab13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ai-nikolai/KernelBench', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ai-nikolai/KernelBench'), pr_revision=None, pr_num=None)