In [6]:
import os
import json

import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset

import tqdm


In [7]:
# This is from the "eval.py" file.
def set_seed(seed: int):
    """Seed PyTorch's random number generators with ``seed``.

    Args:
        seed: Value handed to ``torch.manual_seed`` and then to
            ``torch.cuda.manual_seed``.
    """
    # NOTE: torch.cuda.manual_seed only seeds the current CUDA device.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    
def load_original_model_and_inputs(
    model_original_src: str, context: dict
) -> tuple[nn.Module, callable, callable]:
    """
    Execute reference PyTorch source and extract its benchmark entry points.

    The source is executed with ``context`` as its namespace, so every name the
    source defines ends up stored in ``context``.

    Args:
        model_original_src: Python source text of the reference architecture.
        context: Dict used as the namespace for the executed source; it is
            mutated in place.

    Returns:
        ``(Model, get_init_inputs, get_inputs)`` as looked up in ``context``.
        Any of the three that the source did not define is ``None``.
        Returns ``None`` (not a tuple) if the source has a syntax error or
        raises while being executed; the error is printed in both cases.
    """
    # Parse first so syntax errors are reported separately from runtime errors.
    try:
        code_obj = compile(model_original_src, "<string>", "exec")
    except SyntaxError as e:
        print(f"Syntax Error in original code {e}")
        return None

    # NOTE(review): this runs arbitrary code; only pass source you trust.
    try:
        exec(code_obj, context)  # definitions land in `context`
    except Exception as e:
        print(f"Error in executing original code {e}")
        return None

    # These names are expected to be defined by the reference source.
    wanted = ("Model", "get_init_inputs", "get_inputs")
    return tuple(context.get(name) for name in wanted)

def time_execution_with_cuda_event(
    kernel_fn: callable,
    *args,
    num_warmup: int = 3,
    num_trials: int = 10,
    verbose: bool = True,
    device: torch.device = None,
) -> list[float]:
    """
    Time a CUDA kernel function over multiple trials using torch.cuda.Event

    Args:
        kernel_fn: Function to time
        *args: Arguments to pass to kernel_fn
        num_warmup: Number of untimed warm-up calls made before the trials
        num_trials: Number of timing trials to run
        verbose: Whether to print per-trial timing info
        device: CUDA device to use, if None, use current device

    Returns:
        List of elapsed times in milliseconds, one entry per trial
    """
    if device is None:
        if verbose:
            print(f"Using current device: {torch.cuda.current_device()}")
        device = torch.cuda.current_device()

    elapsed_times = []

    # Event.record() without an explicit stream records on the current stream
    # of the *current* device. Make `device` current for the whole measurement
    # so the events bracket the work on the device being synchronized, even
    # when the caller passes a device other than the current one.
    with torch.cuda.device(device):
        # Warm ups
        for _ in range(num_warmup):
            kernel_fn(*args)
            torch.cuda.synchronize(device=device)

        print(
            f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}"
        )

        # Actual trials
        for trial in range(num_trials):
            # create event marker default is not interprocess
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)

            start_event.record()
            kernel_fn(*args)
            end_event.record()

            # Synchronize to ensure the events have completed
            torch.cuda.synchronize(device=device)

            # Calculate the elapsed time in milliseconds
            elapsed_time_ms = start_event.elapsed_time(end_event)
            if verbose:
                print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms")
            elapsed_times.append(elapsed_time_ms)

    return elapsed_times

def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict:
    """Get timing statistics from a list of elapsed times.

    Args:
        elapsed_times: List of elapsed times in milliseconds
        device: CUDA device, record device info. Any value other than None
            (including device index 0) adds the "hardware" and "device" keys.
    Returns:
        Dict containing mean, std, min, max and num_trials
        all timing are in ms, rounded to 3 significant digits
    """

    def _round_3sig(value) -> float:
        # Round-trip through a "%.3g" string to keep 3 significant digits.
        return float(f"{value:.3g}")

    stats = {
        "mean": _round_3sig(np.mean(elapsed_times)),
        "std": _round_3sig(np.std(elapsed_times)),
        "min": _round_3sig(np.min(elapsed_times)),
        "max": _round_3sig(np.max(elapsed_times)),
        "num_trials": len(elapsed_times),
    }

    # Compare against None explicitly: an integer device index of 0 is a valid
    # device but is falsy, so a plain truthiness test would drop its info.
    if device is not None:
        stats["hardware"] = torch.cuda.get_device_name(device=device)
        stats["device"] = str(device)  # for debugging

    return stats

In [8]:
# From the original file "generate_baseline_time.py"
def measure_program_time(
        ref_arch_name: str,
        ref_arch_src: str,
        num_trials: int = 100,
        use_torch_compile: bool = False,
        torch_compile_backend: str="inductor",
        torch_compile_options: str="default",
        device: torch.device="cuda:0",
        verbose: bool = False,
) -> dict:
    """
    Measure the time of a KernelBench reference architecture

    Args:
        ref_arch_name: Name of the problem, used only in printed messages
        ref_arch_src: Python source of the reference architecture; it must
            define ``Model``, ``get_init_inputs`` and ``get_inputs``
        num_trials: Number of timed trials
        use_torch_compile: If True, wrap the model with ``torch.compile``
        torch_compile_backend: ``backend`` argument for ``torch.compile``
        torch_compile_options: ``mode`` argument for ``torch.compile``
        device: CUDA device the model and inputs are moved to
        verbose: Whether to print per-trial timings and the final stats

    Returns:
        The statistics dict produced by ``get_timing_stats``, or ``None`` if
        the source could not be loaded or measuring raised an exception
        (the error is printed in that case).
    """
    context = {}
    # The loader returns None (not a tuple) on syntax or execution errors, so
    # check before unpacking instead of failing with a TypeError.
    loaded = load_original_model_and_inputs(ref_arch_src, context)
    if loaded is None:
        print(f"[Eval] Error in Measuring Performance: could not load reference source for {ref_arch_name}")
        return None

    Model, get_init_inputs, get_inputs = loaded
    missing = [
        name
        for name, obj in (
            ("Model", Model),
            ("get_init_inputs", get_init_inputs),
            ("get_inputs", get_inputs),
        )
        if obj is None
    ]
    if missing:
        print(f"[Eval] Error in Measuring Performance: {ref_arch_name} does not define {', '.join(missing)}")
        return None

    def _to_device(values):
        # Move tensors to the target device; leave non-tensor values untouched.
        return [
            x.cuda(device=device) if isinstance(x, torch.Tensor) else x
            for x in values
        ]

    try:
        with torch.no_grad():
            torch.cuda.synchronize(device=device)
            # Re-seed before each generator call so both start from seed 42.
            set_seed(42)
            inputs = get_inputs()
            set_seed(42)
            init_inputs = get_init_inputs()
            inputs = _to_device(inputs)
            init_inputs = _to_device(init_inputs)

            # Initialize PyTorch model, use this for eager mode execution
            model = Model(*init_inputs)

            if use_torch_compile:
                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
                model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
            else:
                print(f"Using PyTorch Eager Execution on {ref_arch_name}")

            model = model.cuda(device=device)
            torch.cuda.synchronize(device=device)
            elapsed_times = time_execution_with_cuda_event(
                model, *inputs, num_trials=num_trials, verbose=verbose, device=device
            )
            runtime_stats = get_timing_stats(elapsed_times, device=device)

            if verbose:
                print(f"{ref_arch_name} {runtime_stats}")

            return runtime_stats
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"[Eval] Error in Measuring Performance: {e}")
        return None

In [None]:
# Additional helper functions (from eval.py and possibly other files)
def graceful_eval_cleanup(curr_context: dict, device: torch.device):
    """
    Release evaluation state and reset GPU bookkeeping on ``device``.

    Args:
        curr_context: Namespace dict from the evaluation run.
        device: CUDA device whose cache and peak-memory stats are reset.
    """
    # Drop this function's reference to the run-specific definitions.
    # Note this only unbinds the local name; the caller's dict is not modified.
    del curr_context

    cuda = torch.cuda
    with cuda.device(device):
        # Clear the CUDA cache and reset GPU state
        cuda.empty_cache()

        # does this help?
        cuda.reset_peak_memory_stats(device=device)

        # Wait for all CUDA operations to complete
        cuda.synchronize(device=device)

In [10]:
# Output location and benchmark dataset.
TIMING_DIR = "./timings"
ds = load_dataset("ai-nikolai/KernelBench")

# Run configuration.
file_name: str = "baseline_time.json"
num_trials: int = 100

use_torch_compile: bool = False
torch_compile_backend: str = "inductor"
torch_compile_options: str = "default"

device = torch.device("cuda:0")
json_results = {}

# Resolve and create the output location *before* the long timing loop so an
# unwritable path fails immediately instead of after all measurements are done.
save_path = os.path.join(TIMING_DIR, file_name)
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Select the benchmark level to time.
level = 1
level_1 = ds[f"level_{level}"]
num_problems = len(level_1)

json_results[f"level_{level}"] = {}
failed_problems = []  # names whose measurement returned None

for sample in tqdm.tqdm(level_1, total=num_problems):
    ref_arch_src, ref_arch_name = sample["code"], sample["name"]
    runtime_stats = measure_program_time(
        ref_arch_name=ref_arch_name,
        ref_arch_src=ref_arch_src,
        use_torch_compile=use_torch_compile,
        torch_compile_backend=torch_compile_backend,
        torch_compile_options=torch_compile_options,
        device=device,
        verbose=False,  # do not print
        num_trials=num_trials,
    )
    # measure_program_time returns None on failure; the entry is still stored
    # (serialized as null) but tracked so the failure does not go unnoticed.
    if runtime_stats is None:
        failed_problems.append(ref_arch_name)
    json_results[f"level_{level}"][ref_arch_name] = runtime_stats

if failed_problems:
    print(f"[Eval] No timing recorded for {len(failed_problems)}/{num_problems} problems: {failed_problems}")

with open(save_path, "w") as f:
    json.dump(json_results, f)


  0%|          | 0/100 [00:00<?, ?it/s]

Using PyTorch Eager Execution on 100_HingeLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  1%|          | 1/100 [00:13<21:27, 13.01s/it]

Using PyTorch Eager Execution on 10_3D_tensor_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  2%|▏         | 2/100 [00:13<09:28,  5.80s/it]

Using PyTorch Eager Execution on 11_4D_tensor_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  3%|▎         | 3/100 [00:17<08:11,  5.07s/it]

Using PyTorch Eager Execution on 12_Matmul_with_diagonal_matrices_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  4%|▍         | 4/100 [00:18<05:25,  3.39s/it]

Using PyTorch Eager Execution on 13_Matmul_for_symmetric_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  5%|▌         | 5/100 [00:19<04:02,  2.55s/it]

Using PyTorch Eager Execution on 14_Matmul_for_upper_triangular_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  6%|▌         | 6/100 [00:20<03:06,  1.98s/it]

Using PyTorch Eager Execution on 15_Matmul_for_lower_triangular_matrices
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  7%|▋         | 7/100 [00:21<02:34,  1.66s/it]

Using PyTorch Eager Execution on 16_Matmul_with_transposed_A
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  8%|▊         | 8/100 [00:22<02:14,  1.46s/it]

Using PyTorch Eager Execution on 17_Matmul_with_transposed_B
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


  9%|▉         | 9/100 [00:23<02:04,  1.36s/it]

Using PyTorch Eager Execution on 18_Matmul_with_transposed_both
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 10%|█         | 10/100 [00:24<01:52,  1.25s/it]

Using PyTorch Eager Execution on 19_ReLU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 11%|█         | 11/100 [00:37<07:11,  4.85s/it]

Using PyTorch Eager Execution on 1_Square_matrix_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 12%|█▏        | 12/100 [00:38<05:18,  3.62s/it]

Using PyTorch Eager Execution on 20_LeakyReLU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 13%|█▎        | 13/100 [00:51<09:20,  6.44s/it]

Using PyTorch Eager Execution on 21_Sigmoid
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 14%|█▍        | 14/100 [01:04<12:03,  8.42s/it]

Using PyTorch Eager Execution on 22_Tanh
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 15%|█▌        | 15/100 [01:17<13:42,  9.67s/it]

Using PyTorch Eager Execution on 23_Softmax
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 16%|█▌        | 16/100 [01:31<15:28, 11.06s/it]

Using PyTorch Eager Execution on 24_LogSoftmax
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 17%|█▋        | 17/100 [01:46<16:44, 12.11s/it]

Using PyTorch Eager Execution on 25_Swish
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 18%|█▊        | 18/100 [02:01<18:07, 13.26s/it]

Using PyTorch Eager Execution on 26_GELU_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 19%|█▉        | 19/100 [02:14<17:35, 13.03s/it]

Using PyTorch Eager Execution on 27_SELU_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 20%|██        | 20/100 [02:27<17:13, 12.92s/it]

Using PyTorch Eager Execution on 28_HardSigmoid
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 21%|██        | 21/100 [02:39<16:48, 12.77s/it]

Using PyTorch Eager Execution on 29_Softplus
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 22%|██▏       | 22/100 [02:52<16:44, 12.87s/it]

Using PyTorch Eager Execution on 2_Standard_matrix_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 23%|██▎       | 23/100 [02:53<11:56,  9.30s/it]

Using PyTorch Eager Execution on 30_Softsign
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 24%|██▍       | 24/100 [03:12<15:19, 12.09s/it]

Using PyTorch Eager Execution on 31_ELU
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 25%|██▌       | 25/100 [03:24<15:09, 12.12s/it]

Using PyTorch Eager Execution on 32_HardTanh
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 26%|██▌       | 26/100 [03:36<15:05, 12.23s/it]

Using PyTorch Eager Execution on 33_BatchNorm
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 27%|██▋       | 27/100 [03:46<13:46, 11.32s/it]

Using PyTorch Eager Execution on 34_InstanceNorm
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 28%|██▊       | 28/100 [04:02<15:24, 12.84s/it]

Using PyTorch Eager Execution on 35_GroupNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 29%|██▉       | 29/100 [04:18<16:24, 13.87s/it]

Using PyTorch Eager Execution on 36_RMSNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 30%|███       | 30/100 [04:38<18:11, 15.59s/it]

Using PyTorch Eager Execution on 37_FrobeniusNorm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 31%|███       | 31/100 [04:54<18:08, 15.78s/it]

Using PyTorch Eager Execution on 38_L1Norm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 32%|███▏      | 32/100 [05:16<20:06, 17.74s/it]

Using PyTorch Eager Execution on 39_L2Norm_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 33%|███▎      | 33/100 [05:35<19:58, 17.89s/it]

Using PyTorch Eager Execution on 3_Batched_matrix_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 34%|███▍      | 34/100 [05:38<14:53, 13.54s/it]

Using PyTorch Eager Execution on 40_LayerNorm
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 35%|███▌      | 35/100 [05:39<10:42,  9.88s/it]

Using PyTorch Eager Execution on 41_Max_Pooling_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 36%|███▌      | 36/100 [05:47<09:48,  9.20s/it]

Using PyTorch Eager Execution on 42_Max_Pooling_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 37%|███▋      | 37/100 [05:52<08:24,  8.01s/it]

Using PyTorch Eager Execution on 43_Max_Pooling_3D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 38%|███▊      | 38/100 [06:00<08:05,  7.84s/it]

Using PyTorch Eager Execution on 44_Average_Pooling_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 39%|███▉      | 39/100 [06:05<07:12,  7.09s/it]

Using PyTorch Eager Execution on 45_Average_Pooling_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 40%|████      | 40/100 [06:36<14:11, 14.20s/it]

Using PyTorch Eager Execution on 46_Average_Pooling_3D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 41%|████      | 41/100 [06:51<14:08, 14.38s/it]

Using PyTorch Eager Execution on 47_Sum_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 42%|████▏     | 42/100 [07:06<14:03, 14.55s/it]

Using PyTorch Eager Execution on 48_Mean_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 43%|████▎     | 43/100 [07:21<14:10, 14.93s/it]

Using PyTorch Eager Execution on 49_Max_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 44%|████▍     | 44/100 [07:36<13:54, 14.89s/it]

Using PyTorch Eager Execution on 4_Matrix_vector_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 45%|████▌     | 45/100 [07:53<14:03, 15.34s/it]

Using PyTorch Eager Execution on 50_conv_standard_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 46%|████▌     | 46/100 [07:54<10:05, 11.22s/it]

Using PyTorch Eager Execution on 51_Argmax_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 47%|████▋     | 47/100 [08:10<11:04, 12.54s/it]

Using PyTorch Eager Execution on 52_Argmin_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 48%|████▊     | 48/100 [08:25<11:34, 13.36s/it]

Using PyTorch Eager Execution on 53_Min_reduction_over_a_dimension
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 49%|████▉     | 49/100 [08:40<11:50, 13.93s/it]

Using PyTorch Eager Execution on 54_conv_standard_3D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 50%|█████     | 50/100 [08:41<08:18,  9.96s/it]

Using PyTorch Eager Execution on 55_conv_standard_2D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 51%|█████     | 51/100 [08:45<06:34,  8.05s/it]

Using PyTorch Eager Execution on 56_conv_standard_2D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 52%|█████▏    | 52/100 [08:46<04:57,  6.20s/it]

Using PyTorch Eager Execution on 57_conv_transposed_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 53%|█████▎    | 53/100 [08:54<05:04,  6.48s/it]

Using PyTorch Eager Execution on 58_conv_transposed_3D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 54%|█████▍    | 54/100 [08:56<03:55,  5.12s/it]

Using PyTorch Eager Execution on 59_conv_standard_3D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 55%|█████▌    | 55/100 [09:04<04:30,  6.02s/it]

Using PyTorch Eager Execution on 5_Matrix_scalar_multiplication
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 56%|█████▌    | 56/100 [09:12<04:52,  6.65s/it]

Using PyTorch Eager Execution on 60_conv_standard_3D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 57%|█████▋    | 57/100 [09:13<03:33,  4.97s/it]

Using PyTorch Eager Execution on 61_conv_transposed_3D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 58%|█████▊    | 58/100 [09:15<02:52,  4.12s/it]

Using PyTorch Eager Execution on 62_conv_standard_2D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 59%|█████▉    | 59/100 [09:16<02:15,  3.30s/it]

Using PyTorch Eager Execution on 63_conv_standard_2D__square_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 60%|██████    | 60/100 [09:22<02:43,  4.10s/it]

Using PyTorch Eager Execution on 64_conv_transposed_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 61%|██████    | 61/100 [09:28<03:01,  4.66s/it]

Using PyTorch Eager Execution on 65_conv_transposed_2D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 62%|██████▏   | 62/100 [09:30<02:25,  3.83s/it]

Using PyTorch Eager Execution on 66_conv_standard_3D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 63%|██████▎   | 63/100 [09:31<01:44,  2.84s/it]

Using PyTorch Eager Execution on 67_conv_standard_1D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 64%|██████▍   | 64/100 [09:33<01:41,  2.81s/it]

Using PyTorch Eager Execution on 68_conv_transposed_3D__square_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 65%|██████▌   | 65/100 [10:13<08:07, 13.92s/it]

Using PyTorch Eager Execution on 69_conv_transposed_2D__asymmetric_input__asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 66%|██████▌   | 66/100 [10:16<05:55, 10.46s/it]

Using PyTorch Eager Execution on 6_Matmul_with_large_K_dimension_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 67%|██████▋   | 67/100 [10:18<04:25,  8.04s/it]

Using PyTorch Eager Execution on 70_conv_transposed_3D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 68%|██████▊   | 68/100 [10:25<04:06,  7.70s/it]

Using PyTorch Eager Execution on 71_conv_transposed_2D__asymmetric_input__square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 69%|██████▉   | 69/100 [10:27<03:02,  5.89s/it]

Using PyTorch Eager Execution on 72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 70%|███████   | 70/100 [10:27<02:08,  4.29s/it]

Using PyTorch Eager Execution on 73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 71%|███████   | 71/100 [10:29<01:41,  3.50s/it]

Using PyTorch Eager Execution on 74_conv_transposed_1D_dilated
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 72%|███████▏  | 72/100 [10:31<01:24,  3.03s/it]

Using PyTorch Eager Execution on 75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 73%|███████▎  | 73/100 [10:32<01:09,  2.56s/it]

Using PyTorch Eager Execution on 76_conv_standard_1D_dilated_strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 74%|███████▍  | 74/100 [10:50<03:08,  7.25s/it]

Using PyTorch Eager Execution on 77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 75%|███████▌  | 75/100 [10:51<02:11,  5.25s/it]

Using PyTorch Eager Execution on 78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 76%|███████▌  | 76/100 [10:54<01:49,  4.55s/it]

Using PyTorch Eager Execution on 79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 77%|███████▋  | 77/100 [10:55<01:22,  3.60s/it]

Using PyTorch Eager Execution on 7_Matmul_with_small_K_dimension_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 78%|███████▊  | 78/100 [10:56<01:03,  2.87s/it]

Using PyTorch Eager Execution on 80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 79%|███████▉  | 79/100 [10:58<00:51,  2.47s/it]

Using PyTorch Eager Execution on 81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 80%|████████  | 80/100 [10:59<00:37,  1.90s/it]

Using PyTorch Eager Execution on 82_conv_depthwise_2D_square_input_square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 81%|████████  | 81/100 [11:01<00:41,  2.16s/it]

Using PyTorch Eager Execution on 83_conv_depthwise_2D_square_input_asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 82%|████████▏ | 82/100 [11:03<00:34,  1.94s/it]

Using PyTorch Eager Execution on 84_conv_depthwise_2D_asymmetric_input_square_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 83%|████████▎ | 83/100 [11:13<01:14,  4.37s/it]

Using PyTorch Eager Execution on 85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 84%|████████▍ | 84/100 [11:14<00:56,  3.51s/it]

Using PyTorch Eager Execution on 86_conv_depthwise_separable_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 85%|████████▌ | 85/100 [11:18<00:52,  3.49s/it]

Using PyTorch Eager Execution on 87_conv_pointwise_2D
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 86%|████████▌ | 86/100 [11:28<01:17,  5.54s/it]

Using PyTorch Eager Execution on 88_MinGPTNewGelu
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 87%|████████▋ | 87/100 [11:30<00:56,  4.31s/it]

Using PyTorch Eager Execution on 89_cumsum
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 88%|████████▊ | 88/100 [11:39<01:10,  5.86s/it]

Using PyTorch Eager Execution on 8_Matmul_with_irregular_shapes_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 89%|████████▉ | 89/100 [11:41<00:50,  4.62s/it]

Using PyTorch Eager Execution on 90_cumprod
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 90%|█████████ | 90/100 [11:50<01:00,  6.06s/it]

Using PyTorch Eager Execution on 91_cumsum_reverse
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 91%|█████████ | 91/100 [12:02<01:10,  7.80s/it]

Using PyTorch Eager Execution on 92_cumsum_exclusive
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 92%|█████████▏| 92/100 [12:13<01:10,  8.80s/it]

Using PyTorch Eager Execution on 93_masked_cumsum
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 93%|█████████▎| 93/100 [12:33<01:25, 12.21s/it]

Using PyTorch Eager Execution on 94_MSELoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 94%|█████████▍| 94/100 [12:53<01:26, 14.49s/it]

Using PyTorch Eager Execution on 95_CrossEntropyLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 95%|█████████▌| 95/100 [12:54<00:52, 10.50s/it]

Using PyTorch Eager Execution on 96_HuberLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 96%|█████████▌| 96/100 [13:13<00:51, 12.81s/it]

Using PyTorch Eager Execution on 97_ScaledDotProductAttention
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 97%|█████████▋| 97/100 [13:16<00:30, 10.16s/it]

Using PyTorch Eager Execution on 98_KLDivLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 98%|█████████▊| 98/100 [13:23<00:18,  9.13s/it]

Using PyTorch Eager Execution on 99_TripletMarginLoss
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


 99%|█████████▉| 99/100 [13:31<00:08,  8.67s/it]

Using PyTorch Eager Execution on 9_Tall_skinny_matrix_multiplication_
[Profiling] Using device: cuda:0 NVIDIA A40, warm up 3, trials 100


100%|██████████| 100/100 [13:32<00:00,  8.12s/it]
