In [6]:
import os
import json

import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset

import tqdm


In [7]:
# This is from the "eval.py" file.
def set_seed(seed: int):
    """Seed PyTorch's RNGs so that subsequently generated tensors are reproducible.

    Calls ``torch.manual_seed`` followed by ``torch.cuda.manual_seed`` with
    the same ``seed``.
    """
    # NOTE: torch.cuda.manual_seed only sets the seed on the current cuda device
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    
def load_original_model_and_inputs(
    model_original_src: str, context: dict
) -> tuple[nn.Module, callable, callable]:
    """
    Execute reference PyTorch source code and extract its entry points.

    The source is first syntax-checked with ``compile`` and then run with
    ``exec`` using ``context`` as its namespace, so every name the source
    defines ends up in ``context``.

    Args:
        model_original_src: Python source text of the reference architecture.
        context: Dict used as the namespace for ``exec``; mutated in place.

    Returns:
        ``(Model, get_init_inputs, get_inputs)`` looked up in ``context``
        (an entry is None when the source did not define that name), or
        None when the source has a syntax error or raises while executing.
    """
    # Syntax-check first so malformed source is reported separately from runtime failures
    try:
        compile(model_original_src, "<string>", "exec")
    except SyntaxError as syntax_err:
        print(f"Syntax Error in original code {syntax_err}")
        return None

    # Run the source; its top-level definitions land in `context`
    try:
        exec(model_original_src, context)
    except Exception as exec_err:
        print(f"Error in executing original code {exec_err}")
        return None

    # These names are expected to be defined by the reference source
    model_cls, init_inputs_fn, inputs_fn = (
        context.get(name) for name in ("Model", "get_init_inputs", "get_inputs")
    )
    return (model_cls, init_inputs_fn, inputs_fn)

def time_execution_with_cuda_event(
    kernel_fn: callable,
    *args,
    num_warmup: int = 3,
    num_trials: int = 10,
    verbose: bool = True,
    device: torch.device = None,
) -> list[float]:
    """
    Time a CUDA kernel function over multiple trials using torch.cuda.Event

    Args:
        kernel_fn: Function to time
        *args: Arguments to pass to kernel_fn
        num_warmup: Number of untimed warm-up calls made before the trials
        num_trials: Number of timing trials to run
        verbose: Whether to print per-trial timing info
        device: CUDA device to use, if None, use current device

    Returns:
        List of elapsed times in milliseconds, one entry per trial
    """
    if device is None:
        if verbose:
            print(f"Using current device: {torch.cuda.current_device()}")
        device = torch.cuda.current_device()

    elapsed_times = []

    # Events are recorded on the current stream of the *current* device.
    # Make `device` current for the whole measurement so the events bracket
    # the work on the same device that is being synchronized.
    with torch.cuda.device(device):
        # Warm ups
        for _ in range(num_warmup):
            kernel_fn(*args)
            torch.cuda.synchronize(device=device)

        print(
            f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}"
        )

        # Actual trials
        for trial in range(num_trials):
            # create event marker default is not interprocess
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)

            start_event.record()
            kernel_fn(*args)
            end_event.record()

            # Synchronize to ensure the events have completed
            torch.cuda.synchronize(device=device)

            # Calculate the elapsed time in milliseconds
            elapsed_time_ms = start_event.elapsed_time(end_event)
            if verbose:
                print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms")
            elapsed_times.append(elapsed_time_ms)

    return elapsed_times

def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict:
    """Get timing statistics from a list of elapsed times.

    Args:
        elapsed_times: Non-empty list of elapsed times in milliseconds
        device: CUDA device, record device info. Any value other than None
            (including the integer index 0) is recorded.
    Returns:
        Dict containing mean, std, min, max and num_trials
        all timing are in ms, rounded to 3 significant digits.
        When device is given, also "hardware" (device name) and "device".
    Raises:
        ValueError: if elapsed_times is empty
    """
    if len(elapsed_times) == 0:
        # np.min/np.max would raise an opaque ValueError on an empty sequence
        raise ValueError("elapsed_times must contain at least one measurement")

    stats = {
        "mean": float(f"{np.mean(elapsed_times):.3g}"),
        "std": float(f"{np.std(elapsed_times):.3g}"),
        "min": float(f"{np.min(elapsed_times):.3g}"),
        "max": float(f"{np.max(elapsed_times):.3g}"),
        "num_trials": len(elapsed_times),
    }

    # Compare against None explicitly: the integer device index 0 is falsy
    # and would otherwise be treated as "no device given".
    if device is not None:
        stats["hardware"] = torch.cuda.get_device_name(device=device)
        stats["device"] = str(device)  # for debugging

    return stats

In [8]:
# From the original file "generate_baseline_time.py"
def measure_program_time(
        ref_arch_name: str,
        ref_arch_src: str, 
        num_trials: int = 100,
        use_torch_compile: bool = False,
        torch_compile_backend: str="inductor", 
        torch_compile_options: str="default",
        device: torch.device="cuda:0",
        verbose: bool = False,
) -> dict:
    """
    Measure the time of a KernelBench reference architecture

    Args:
        ref_arch_name: Name of the reference architecture (used in log messages)
        ref_arch_src: Python source of the reference architecture; it is
            loaded through load_original_model_and_inputs
        num_trials: Number of timed trials
        use_torch_compile: If True, wrap the model with torch.compile before timing
        torch_compile_backend: Passed to torch.compile as ``backend``
        torch_compile_options: Passed to torch.compile as ``mode``
        device: CUDA device on which inputs and model are placed and timed
        verbose: Whether to print per-trial timings and the final stats

    Returns:
        The timing statistics dict from get_timing_stats, or None when the
        reference source could not be loaded or the measurement raised.
    """
    context = {}
    loaded = load_original_model_and_inputs(ref_arch_src, context)
    if loaded is None:
        # The loader reports syntax/execution failures by returning None;
        # unpacking that would raise TypeError and abort the caller's loop.
        print(
            f"[Eval] Error in Measuring Performance: could not load reference architecture {ref_arch_name}"
        )
        return None
    Model, get_init_inputs, get_inputs = loaded

    try:
        with torch.no_grad():
            torch.cuda.synchronize(device=device)
            # Re-seed before each generator call so inputs and init inputs
            # are each produced from the same RNG state
            set_seed(42)
            inputs = get_inputs()
            set_seed(42)
            init_inputs = get_init_inputs()
            # Move tensor arguments to the target device; leave other values untouched
            inputs = [
                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
                for x in inputs
            ]
            init_inputs = [
                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
                for x in init_inputs
            ]

            # Initialize PyTorch model, use this for eager mode execution
            model = Model(*init_inputs)
            
            if use_torch_compile:
                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
                model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
            else:
                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
            
            model = model.cuda(device=device)
            torch.cuda.synchronize(device=device)
            elapsed_times = time_execution_with_cuda_event(
                model, *inputs, num_trials=num_trials, verbose=verbose, device=device
            )
            runtime_stats = get_timing_stats(elapsed_times, device=device)

            if verbose:
                print(f"{ref_arch_name} {runtime_stats}")
            
            return runtime_stats
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with other problems
        print(f"[Eval] Error in Measuring Performance: {e}")
        return None

In [12]:
#Additional Functions (eval.py and others?)

def gpu_cache_clean(device: torch.device = torch.device("cuda:0")):
    """Empty the CUDA cache and reset peak-memory statistics for ``device``.

    Runs with ``device`` selected as the current CUDA device and finishes
    with a synchronize on that device.
    """
    with torch.cuda.device(device):
        # Release cached allocator blocks
        torch.cuda.empty_cache()

        # Unclear whether this helps; kept from the original
        torch.cuda.reset_peak_memory_stats(device=device)

        # Wait for all CUDA operations to complete
        torch.cuda.synchronize(device=device)

def graceful_eval_cleanup(curr_context: dict, device: torch.device):
    """
    Clean up env, gpu cache, and compiled CUDA extensions after evaluation

    Args:
        curr_context: Namespace dict of the finished run; it is emptied in
            place. May be None, in which case only the GPU cache is cleaned.
        device: CUDA device whose cache is cleaned via gpu_cache_clean
    """
    # `del curr_context` would only drop this function's local name and leave
    # the caller's dict (and everything it references) alive. Emptying the
    # dict releases the run-specific definitions before the cache is cleaned.
    if curr_context is not None:
        curr_context.clear()
    gpu_cache_clean(device=device)


In [13]:
# Start from a clean GPU cache before benchmarking
gpu_cache_clean()

# Where the collected timings are written
TIMING_DIR = "./timings"
file_name: str = "baseline_time_v2.json"

# MAIN SCRIPT
ds = load_dataset("ai-nikolai/KernelBench")

# Run parameters
num_trials: int = 3

use_torch_compile: bool = False
torch_compile_backend: str = "inductor"
torch_compile_options: str = "default"

device = torch.device("cuda:0")
json_results = {}

# Select the dataset split for the level being benchmarked
level = 1
level_1 = ds[f"level_{level}"]
num_problems = len(level_1)

# Per-level results, keyed by reference architecture name
level_results = json_results[f"level_{level}"] = {}

for sample in tqdm.tqdm(level_1):
    ref_arch_name = sample["name"]
    ref_arch_src = sample["code"]
    runtime_stats = measure_program_time(
        ref_arch_name=ref_arch_name,
        ref_arch_src=ref_arch_src,
        num_trials=num_trials,
        use_torch_compile=use_torch_compile,
        torch_compile_backend=torch_compile_backend,
        torch_compile_options=torch_compile_options,
        device=device,
        verbose=False,  # do not print per-trial details
    )
    level_results[ref_arch_name] = runtime_stats

# Write all results as one JSON file, creating the output directory if needed
save_path = os.path.join(TIMING_DIR, file_name)
save_dir = os.path.dirname(save_path)
os.makedirs(save_dir, exist_ok=True)

with open(save_path, "w") as f:
    json.dump(json_results, f)


  0%|          | 0/100 [00:00<?, ?it/s]

Using PyTorch Eager Execution on 100_HingeLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  2%|▏         | 2/100 [00:08<05:40,  3.47s/it]

Using PyTorch Eager Execution on 10_3D_tensor_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  3%|▎         | 3/100 [00:10<04:19,  2.67s/it]

Using PyTorch Eager Execution on 11_4D_tensor_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  4%|▍         | 4/100 [00:10<02:40,  1.68s/it]

Using PyTorch Eager Execution on 12_Matmul_with_diagonal_matrices_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  5%|▌         | 5/100 [00:10<01:59,  1.26s/it]

Using PyTorch Eager Execution on 13_Matmul_for_symmetric_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  6%|▌         | 6/100 [00:11<01:27,  1.07it/s]

Using PyTorch Eager Execution on 14_Matmul_for_upper_triangular_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  7%|▋         | 7/100 [00:11<01:10,  1.32it/s]

Using PyTorch Eager Execution on 15_Matmul_for_lower_triangular_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  8%|▊         | 8/100 [00:11<00:59,  1.56it/s]

Using PyTorch Eager Execution on 16_Matmul_with_transposed_A
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


  9%|▉         | 9/100 [00:12<00:51,  1.76it/s]

Using PyTorch Eager Execution on 17_Matmul_with_transposed_B
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 10%|█         | 10/100 [00:12<00:46,  1.95it/s]

Using PyTorch Eager Execution on 18_Matmul_with_transposed_both
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 11%|█         | 11/100 [00:24<05:56,  4.01s/it]

Using PyTorch Eager Execution on 19_ReLU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 12%|█▏        | 12/100 [00:24<04:12,  2.87s/it]

Using PyTorch Eager Execution on 1_Square_matrix_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 13%|█▎        | 13/100 [00:35<07:38,  5.27s/it]

Using PyTorch Eager Execution on 20_LeakyReLU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 14%|█▍        | 14/100 [00:45<09:42,  6.78s/it]

Using PyTorch Eager Execution on 21_Sigmoid
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 15%|█▌        | 15/100 [00:55<10:49,  7.64s/it]

Using PyTorch Eager Execution on 22_Tanh
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 23_Softmax
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 16%|█▌        | 16/100 [01:05<11:50,  8.46s/it]

Using PyTorch Eager Execution on 24_LogSoftmax
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 17%|█▋        | 17/100 [01:16<12:24,  8.98s/it]

Using PyTorch Eager Execution on 25_Swish
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 19%|█▉        | 19/100 [01:38<13:34, 10.05s/it]

Using PyTorch Eager Execution on 26_GELU_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 20%|██        | 20/100 [01:49<13:58, 10.49s/it]

Using PyTorch Eager Execution on 27_SELU_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 21%|██        | 21/100 [02:00<13:49, 10.50s/it]

Using PyTorch Eager Execution on 28_HardSigmoid
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 22%|██▏       | 22/100 [02:10<13:45, 10.58s/it]

Using PyTorch Eager Execution on 29_Softplus
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 23%|██▎       | 23/100 [02:11<09:39,  7.53s/it]

Using PyTorch Eager Execution on 2_Standard_matrix_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 30_Softsign
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 25%|██▌       | 25/100 [02:33<11:29,  9.20s/it]

Using PyTorch Eager Execution on 31_ELU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 26%|██▌       | 26/100 [02:43<11:51,  9.62s/it]

Using PyTorch Eager Execution on 32_HardTanh
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 27%|██▋       | 27/100 [02:50<10:49,  8.90s/it]

Using PyTorch Eager Execution on 33_BatchNorm
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 34_InstanceNorm


 28%|██▊       | 28/100 [03:03<12:02, 10.04s/it]

[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 35_GroupNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 29%|██▉       | 29/100 [03:15<12:38, 10.68s/it]

Using PyTorch Eager Execution on 36_RMSNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 30%|███       | 30/100 [03:27<12:55, 11.08s/it]

Using PyTorch Eager Execution on 37_FrobeniusNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 31%|███       | 31/100 [03:40<13:09, 11.44s/it]

Using PyTorch Eager Execution on 38_L1Norm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 32%|███▏      | 32/100 [03:53<13:44, 12.12s/it]

Using PyTorch Eager Execution on 39_L2Norm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 34%|███▍      | 34/100 [04:09<10:17,  9.35s/it]

Using PyTorch Eager Execution on 3_Batched_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 35%|███▌      | 35/100 [04:09<07:16,  6.71s/it]

Using PyTorch Eager Execution on 40_LayerNorm
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 36%|███▌      | 36/100 [04:15<06:52,  6.45s/it]

Using PyTorch Eager Execution on 41_Max_Pooling_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 37%|███▋      | 37/100 [04:18<05:49,  5.55s/it]

Using PyTorch Eager Execution on 42_Max_Pooling_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 38%|███▊      | 38/100 [04:25<06:05,  5.89s/it]

Using PyTorch Eager Execution on 43_Max_Pooling_3D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 39%|███▉      | 39/100 [04:29<05:13,  5.14s/it]

Using PyTorch Eager Execution on 44_Average_Pooling_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 45_Average_Pooling_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 41%|████      | 41/100 [05:09<12:00, 12.21s/it]

Using PyTorch Eager Execution on 46_Average_Pooling_3D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 42%|████▏     | 42/100 [05:22<12:05, 12.50s/it]

Using PyTorch Eager Execution on 47_Sum_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 43%|████▎     | 43/100 [05:36<12:06, 12.74s/it]

Using PyTorch Eager Execution on 48_Mean_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 44%|████▍     | 44/100 [05:48<11:51, 12.71s/it]

Using PyTorch Eager Execution on 49_Max_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 45%|████▌     | 45/100 [06:03<12:11, 13.30s/it]

Using PyTorch Eager Execution on 4_Matrix_vector_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 46%|████▌     | 46/100 [06:03<08:28,  9.41s/it]

Using PyTorch Eager Execution on 50_conv_standard_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 47%|████▋     | 47/100 [06:18<09:52, 11.17s/it]

Using PyTorch Eager Execution on 51_Argmax_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 48%|████▊     | 48/100 [06:32<10:21, 11.96s/it]

Using PyTorch Eager Execution on 52_Argmin_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 49%|████▉     | 49/100 [06:46<10:31, 12.39s/it]

Using PyTorch Eager Execution on 53_Min_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 54_conv_standard_3D__square_input__square_kernel


 50%|█████     | 50/100 [06:46<07:15,  8.71s/it]

[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 51%|█████     | 51/100 [06:48<05:23,  6.61s/it]

Using PyTorch Eager Execution on 55_conv_standard_2D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 52%|█████▏    | 52/100 [06:48<03:50,  4.79s/it]

Using PyTorch Eager Execution on 56_conv_standard_2D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 53%|█████▎    | 53/100 [06:51<03:25,  4.38s/it]

Using PyTorch Eager Execution on 57_conv_transposed_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 54%|█████▍    | 54/100 [06:52<02:24,  3.14s/it]

Using PyTorch Eager Execution on 58_conv_transposed_3D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 59_conv_standard_3D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 56%|█████▌    | 56/100 [07:00<02:57,  4.04s/it]

Using PyTorch Eager Execution on 5_Matrix_scalar_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 60_conv_standard_3D__square_input__asymmetric_kernel


 57%|█████▋    | 57/100 [07:00<02:03,  2.88s/it]

[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 58%|█████▊    | 58/100 [07:01<01:33,  2.23s/it]

Using PyTorch Eager Execution on 61_conv_transposed_3D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 59%|█████▉    | 59/100 [07:02<01:09,  1.70s/it]

Using PyTorch Eager Execution on 62_conv_standard_2D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 63_conv_standard_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 61%|██████    | 61/100 [07:07<01:28,  2.28s/it]

Using PyTorch Eager Execution on 64_conv_transposed_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 62%|██████▏   | 62/100 [07:08<01:17,  2.04s/it]

Using PyTorch Eager Execution on 65_conv_transposed_2D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 66_conv_standard_3D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 64%|██████▍   | 64/100 [07:10<00:56,  1.56s/it]

Using PyTorch Eager Execution on 67_conv_standard_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 68_conv_transposed_3D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 66%|██████▌   | 66/100 [07:15<00:59,  1.74s/it]

Using PyTorch Eager Execution on 69_conv_transposed_2D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 67%|██████▋   | 67/100 [07:17<00:57,  1.75s/it]

Using PyTorch Eager Execution on 6_Matmul_with_large_K_dimension_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 70_conv_transposed_3D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 69%|██████▉   | 69/100 [07:20<00:52,  1.68s/it]

Using PyTorch Eager Execution on 71_conv_transposed_2D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 71%|███████   | 71/100 [07:20<00:29,  1.01s/it]

Using PyTorch Eager Execution on 73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 72%|███████▏  | 72/100 [07:22<00:28,  1.04s/it]

Using PyTorch Eager Execution on 74_conv_transposed_1D_dilated
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 73%|███████▎  | 73/100 [07:22<00:22,  1.21it/s]

Using PyTorch Eager Execution on 75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 74%|███████▍  | 74/100 [07:35<01:48,  4.19s/it]

Using PyTorch Eager Execution on 76_conv_standard_1D_dilated_strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 76%|███████▌  | 76/100 [07:36<01:02,  2.59s/it]

Using PyTorch Eager Execution on 78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 77%|███████▋  | 77/100 [07:37<00:48,  2.10s/it]

Using PyTorch Eager Execution on 79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 7_Matmul_with_small_K_dimension_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 79%|███████▉  | 79/100 [07:37<00:28,  1.36s/it]

Using PyTorch Eager Execution on 80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 81%|████████  | 81/100 [07:39<00:23,  1.25s/it]

Using PyTorch Eager Execution on 82_conv_depthwise_2D_square_input_square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 82%|████████▏ | 82/100 [07:40<00:21,  1.19s/it]

Using PyTorch Eager Execution on 83_conv_depthwise_2D_square_input_asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 83%|████████▎ | 83/100 [07:48<00:44,  2.59s/it]

Using PyTorch Eager Execution on 84_conv_depthwise_2D_asymmetric_input_square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 84%|████████▍ | 84/100 [07:49<00:35,  2.19s/it]

Using PyTorch Eager Execution on 85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 85%|████████▌ | 85/100 [07:50<00:31,  2.12s/it]

Using PyTorch Eager Execution on 86_conv_depthwise_separable_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 86%|████████▌ | 86/100 [07:57<00:46,  3.34s/it]

Using PyTorch Eager Execution on 87_conv_pointwise_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 87%|████████▋ | 87/100 [07:58<00:32,  2.53s/it]

Using PyTorch Eager Execution on 88_MinGPTNewGelu
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 88%|████████▊ | 88/100 [08:04<00:44,  3.74s/it]

Using PyTorch Eager Execution on 89_cumsum
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 89%|████████▉ | 89/100 [08:05<00:30,  2.75s/it]

Using PyTorch Eager Execution on 8_Matmul_with_irregular_shapes_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 90%|█████████ | 90/100 [08:12<00:40,  4.00s/it]

Using PyTorch Eager Execution on 90_cumprod
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 91_cumsum_reverse
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 91%|█████████ | 91/100 [08:18<00:43,  4.78s/it]

Using PyTorch Eager Execution on 92_cumsum_exclusive
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 92%|█████████▏| 92/100 [08:26<00:45,  5.64s/it]

Using PyTorch Eager Execution on 93_masked_cumsum
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 93%|█████████▎| 93/100 [08:42<01:00,  8.71s/it]

Using PyTorch Eager Execution on 94_MSELoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 95%|█████████▌| 95/100 [08:59<00:39,  7.84s/it]

Using PyTorch Eager Execution on 95_CrossEntropyLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 96%|█████████▌| 96/100 [09:16<00:42, 10.57s/it]

Using PyTorch Eager Execution on 96_HuberLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 97_ScaledDotProductAttention
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 98%|█████████▊| 98/100 [09:21<00:13,  6.68s/it]

Using PyTorch Eager Execution on 98_KLDivLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3


 99%|█████████▉| 99/100 [09:27<00:06,  6.71s/it]

Using PyTorch Eager Execution on 99_TripletMarginLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3
Using PyTorch Eager Execution on 9_Tall_skinny_matrix_multiplication_


100%|██████████| 100/100 [09:27<00:00,  5.68s/it]

[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 3



