# Hello, PyTorchSim!

In [1]:
import torch
import os
import sys
# Resolve the PyTorchSim checkout (TORCHSIM_DIR overrides the default location)
# and append it to sys.path for the project imports that follow.
base_dir = os.getenv('TORCHSIM_DIR', '/workspace/PyTorchSim')
sys.path.append(base_dir)

## One Touch Simulation
### Normal Matmul Code

In [2]:
# Reference run: use CUDA when it is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fixed seed; the PyTorchSim cell reseeds with the same value before drawing
# tensors of the same shape.
torch.manual_seed(0)
input = torch.randn(128, 128).to(device)
weight = torch.randn(128, 128).to(device)

# Compile torch.matmul and keep the result as the reference output.
# NOTE: despite its name, cpu_out lives on `device`, which may be CUDA.
opt_fn = torch.compile(torch.matmul)
cpu_out = opt_fn(input, weight)

### PyTorchSim Matmul Code

In [3]:
from Scheduler.scheduler import PyTorchSimRunner
# Obtain the simulator's custom torch device; this rebinds the global `device`.
device = PyTorchSimRunner.setup_device().custom_device()

# Same seed and shapes as the reference cell; tensors are drawn with
# torch.randn and then moved to the custom device.
torch.manual_seed(0)
input = torch.randn(128, 128).to(device)
weight = torch.randn(128, 128).to(device)

# Compile and run the matmul on the custom device (the recorded output shows
# the Gem5 / Spike / TOGSim stages being launched by this call).
opt_fn = torch.compile(torch.matmul)
npu_out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...
Building extension module npu...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module npu...


ninja: no work to do.
Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py
[Gem5] Gem5 is running... 
[Spike] Running Spike simulator
[TOGSim] TOGSim is running..  
[TOGSim] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0"


In [4]:
def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):
    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):
        message = f"|{name} Test Passed|"
        print("-" * len(message))
        print(message)
        print("-" * len(message))
    else:
        message = f"|{name} Test Failed|"
        print("-" * len(message))
        print(message)
        print("-" * len(message))
        print("npu out: ", npu_out.cpu())
        print("cpu out: ", cpu_out)
        exit(1)

In [5]:
# Compare the simulated matmul output against the reference output.
test_result("MatMul", npu_out, cpu_out)

--------------------
|MatMul Test Passed|
--------------------


## Training

In [6]:
# from Scheduler.scheduler import PyTorchSimRunner
# npu_device = PyTorchSimRunner.setup_device().custom_device()

### Normal Backward Code

In [6]:
# Reference backward pass on CUDA when available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Same seed as the PyTorchSim backward cell, which draws in the same order.
torch.manual_seed(0)
cpu_input = torch.randn(128, 128).to(device)
cpu_weight = torch.randn(128, 128).to(device)
cpu_target = torch.randn(128, 128).to(device)
# Gradients are requested only for the two matmul operands.
cpu_input.requires_grad = True
cpu_weight.requires_grad = True

opt_fn = torch.compile(torch.matmul)
cpu_out = opt_fn(cpu_input, cpu_weight)

# The target is a float tensor with the same shape as the matmul output.
loss_fn = torch.nn.CrossEntropyLoss()
cpu_loss = loss_fn(cpu_out, cpu_target)
# Populates cpu_input.grad and cpu_weight.grad, which are checked later.
cpu_loss.backward()

### PyTorchSim Backward Code

In [7]:
from Scheduler.scheduler import PyTorchSimRunner
# Custom simulator device, kept in its own name so the reference `device`
# binding is not overwritten.
npu_device = PyTorchSimRunner.setup_device().custom_device()
# Same seed and draw order as the reference backward cell.
torch.manual_seed(0)
npu_input = torch.randn(128, 128).to(npu_device)
npu_weight = torch.randn(128, 128).to(npu_device)
npu_target = torch.randn(128, 128).to(npu_device)
npu_input.requires_grad = True
npu_weight.requires_grad = True

opt_fn = torch.compile(torch.matmul)
npu_out = opt_fn(npu_input, npu_weight)

loss_fn = torch.nn.CrossEntropyLoss()
npu_loss = loss_fn(npu_out, npu_target)
# NOTE(review): the recorded output of this cell ends in a RuntimeError raised
# from torch/csrc/autograd/engine.cpp (device index assertion) — the backward
# pass on the custom device needs investigation before this cell can pass.
npu_loss.backward()

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py
[Gem5] Gem5 is running... 
[Spike] Running Spike simulator
[TOGSim] TOGSim is running..  
[TOGSim] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19"


RuntimeError: 0 <= device.index() && device.index() < static_cast<c10::DeviceIndex>(device_ready_queues_.size()) INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp":1423, please report a bug to PyTorch. 

In [3]:
# Compare the gradients from the simulated backward pass with the reference.
# Requires the cell defining `test_result` and both backward cells to have run
# in this kernel; the recorded NameError comes from running this cell without
# that definition.
test_result("MatMul Input Grad", npu_input.grad, cpu_input.grad)
test_result("MatMul Weight Grad", npu_weight.grad, cpu_weight.grad)

NameError: name 'test_result' is not defined

## Mapping

The default mapping is chosen by a heuristic.

In [6]:
import torch
from Scheduler.scheduler import PyTorchSimRunner
# Rebinds the global `device` to the simulator's custom device; the following
# mapping / execution-mode cells reuse this binding.
device = PyTorchSimRunner.setup_device().custom_device()

# 1024x1024 operands (no seed is set in this section).
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

# Compile with dynamic shapes disabled and run once with the default mapping.
opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py
[Gem5] Gem5 is running..  
[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0"


In [7]:
# Print the "Total execution cycle" line from the TOGSim result file. The path
# is the one reported by the recorded run above; adjust it to your own run.
!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep "Total execution cycle"

[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158


### Manual Mapping
Users can set the tile size manually.

In [8]:
# Drop previously compiled graphs before changing the mapping settings.
torch._dynamo.reset()

# Enable manual tiling and set the M/N/K tile sizes. These environment
# variables stay set for later cells until they are overwritten.
os.environ['TORCHSIM_MANUAL_TILE_SIZE']="1"
os.environ['TORCHSIM_TILE_M']="512"
os.environ['TORCHSIM_TILE_N']="512"
os.environ['TORCHSIM_TILE_K']="512"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py
[Gem5] Gem5 is running.   
[Spike] Running Spike simulator
[TOGSim] TOGSim is running... 
[TOGSim] Simulation of "/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0"


In [9]:
# Print the cycle count of the manually tiled run. The path is the one
# reported by the recorded run above; adjust it to your own run.
!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep "Total execution cycle"

[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704


### Autotune

In [10]:
# Drop previously compiled graphs, switch manual tiling off again and turn on
# template auto-tuning (the recorded output lists the tile sizes it tried).
torch._dynamo.reset()
os.environ['TORCHSIM_MANUAL_TILE_SIZE']="0"
os.environ['AUTOTUNE_TEMPLATE']="1"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]
[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]
[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]
[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]
[TOGSim] Simulation of "/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0"
[TOGSim] Simulation of "/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0"
[TOGSim] Simulation of "/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0"
[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423
Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py
[Gem5] Gem5 is running..  
[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinducto

In [11]:
# Print the cycle count of the auto-tuned run. The path and the result index
# are specific to the recorded run; adjust them to your own run.
!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep "Total execution cycle"

[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422


## Execution Mode
### Functional & Timing mode (Default)

In [17]:
# Drop previously compiled graphs, turn auto-tuning off again and enable both
# the functional and the timing mode.
torch._dynamo.reset()
os.environ['AUTOTUNE_TEMPLATE']="0"
os.environ['TORCHSIM_FUNCTIONAL_MODE']="1"
os.environ['TORCHSIM_TIMING_MODE']="1"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py
[Gem5] Gem5 is running..  
[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4"


### Functional only mode

In [18]:
# Functional mode on, timing mode off (the recorded output shows only the
# Spike stage running).
os.environ['TORCHSIM_FUNCTIONAL_MODE']="1"
os.environ['TORCHSIM_TIMING_MODE']="0"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


[Spike] Running Spike simulator


### Timing only mode

In [23]:
# Functional mode off, timing mode on.
# NOTE(review): these settings persist for every later cell, and the recorded
# run of this cell ended in a FileNotFoundError for a meta.txt file — confirm
# the cell works on a fresh kernel run from top to bottom.
os.environ['TORCHSIM_FUNCTIONAL_MODE']="0"
os.environ['TORCHSIM_TIMING_MODE']="1"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

FileNotFoundError: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'

## TOGSim Configuration
### Single Core

In [22]:
# Select the single-core (c1) TOGSim configuration. The path is built from
# `base_dir` (TORCHSIM_DIR, defaulting to /workspace/PyTorchSim) so the cell
# also works when the checkout lives somewhere else.
os.environ['TORCHSIM_CONFIG'] = f"{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

FileNotFoundError: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'

In [25]:
# Print the cycle count of the single-core run. The path and the result index
# are specific to the recorded run; adjust them to your own run.
!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep "Total execution cycle"

[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126


### Multi-Core

In [27]:
# Select the two-core (c2) TOGSim configuration. The path is built from
# `base_dir` (TORCHSIM_DIR, defaulting to /workspace/PyTorchSim) so the cell
# also works when the checkout lives somewhere else.
os.environ['TORCHSIM_CONFIG'] = f"{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12"


In [28]:
# Print the cycle count of the multi-core run. The path and the result index
# are the ones reported by the recorded run above; adjust them to your own run.
!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep "Total execution cycle"

[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736


## TOGSim log level
### log level info

In [21]:
# Set the dump path; presumably this moves the simulator artefacts under
# /workspace/PyTorchSim (the trace-level run below reports results under
# /workspace/PyTorchSim/tmp/...) — TODO confirm.
# NOTE(review): the setting persists for all later cells, and the recorded
# run of this cell ended in a FileNotFoundError for a meta.txt file.
os.environ['TORCHSIM_DUMP_PATH']="/workspace/PyTorchSim"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

FileNotFoundError: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'

### log level trace

In [30]:
# Request trace-level simulator logging.
# NOTE(review): the other knobs in this notebook use the TORCHSIM_ prefix and
# the simulator is called TOGSim in the logs — confirm this variable name is
# still the one the simulator reads.
os.environ['BACKENDSIM_DEBUG_LEVEL']="trace"

# Relies on `device` from the default-mapping cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

opt_fn = torch.compile(dynamic=False)(torch.matmul)
npu_out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx" is stored to "/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1"


## Scheduler

In [None]:
import torch
from torchvision.models import resnet18
from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG

# One request queue served by the FIFO engine, using the backend
# configuration exported by the project's extension_config module.
scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)
# The custom device is reached through the scheduler's execution engine.
device = scheduler.execution_engine.module.custom_device()

# ResNet-18 in eval mode, moved to the custom device in channels_last format
# and compiled with dynamic shapes disabled.
model = resnet18().eval()
input = torch.randn(1, 3, 224, 224).to(device=device)
opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))

# Register the compiled model under a name and queue one request for it at
# request time 0.
SchedulerDNNModel.register_model("resnet18", opt_fn)
request = Request("resnet18", [input], [], request_queue_idx=0)
scheduler.add_request(request, request_time=0)

# Run scheduler
# Each scheduling step runs with gradient tracking disabled.
while not scheduler.is_finished():
    with torch.no_grad():
        scheduler.schedule()

print("ResNet18 Simulation Done")

## Load Generator

In [3]:
import os
import torch
from torchvision.models import resnet18

from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
# Root of the PyTorchSim checkout (TORCHSIM_DIR overrides the default).
CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')

# Arguments passed to poisson_request_generator below.
# NOTE(review): presumably the request rate and the time horizon — confirm
# their units against the generator's implementation.
lambda_requests = 10
max_time = 30

target_model1 = resnet18().eval()

# Init scheduler
# NOTE(review): this cell reads the config from PyTorchSimBackend/configs,
# while the TOGSim Configuration section uses TOGSim/configs — confirm which
# directory is current.
scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
# Register compiled model
opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
SchedulerDNNModel.register_model("resnet18", opt_model1)

# Generate time stamp
# One request is queued for every timestamp yielded by the generator.
for request_time in poisson_request_generator(lambda_requests, max_time):
    # Init input data
    # NOTE(review): unlike the Scheduler section, the input is not moved to the
    # custom device here — confirm the scheduler handles the transfer.
    model_input1 = torch.randn(1, 3, 224, 224)

    # Init request
    new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)

    # Add request to scheduler
    print("[Request] Resnet18 request time: ", request_time, flush=True)
    scheduler.add_request(new_request1, request_time=request_time)

# Run scheduler
# Inference only: disable gradient tracking, matching the Scheduler section.
while not scheduler.is_finished():
    with torch.no_grad():
        scheduler.schedule()

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open "/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json"
[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0
[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0
[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1
[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1
[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml
[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B
[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache
[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz
[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected
[0] BackendSim> [R

KeyboardInterrupt: 

## Compiler Optimization
### GeMM + ReLU fusion

In [9]:
# Relies on a `device` binding created by an earlier cell.
input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

def gemm_relu(a, b):
    """Return relu(a @ b): a matmul followed by an element-wise ReLU."""
    return torch.relu(torch.matmul(a, b))

# Compile the two-op function as a whole (the recorded output shows a single
# tile graph being simulated for it).
opt_fn = torch.compile(dynamic=False)(gemm_relu)
out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py
[Gem5] Gem5 is running.   
[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0"


In [10]:
!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep "Total execution cycle"

cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory


### Disable fusion

In [1]:
# Turn compiler optimizations off (the recorded output shows two separate
# tile graphs being simulated for this run).
# NOTE(review): the recorded execution count suggests this ran on a fresh
# kernel; `os`, `torch` and `device` must come from earlier cells.
os.environ['TORCHSIM_COMPILER_OPTIMIZATION']="none"

input = torch.randn(1024, 1024).to(device=device)
weight = torch.randn(1024, 1024).to(device=device)

def gemm_relu(a, b):
    """Return relu(a @ b): a matmul followed by an element-wise ReLU."""
    return torch.relu(torch.matmul(a, b))

opt_fn = torch.compile(dynamic=False)(gemm_relu)
out = opt_fn(input, weight)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...
Building extension module npu...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module npu...


ninja: no work to do.
Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py
[Gem5] Gem5 is running... 
[Gem5] Gem5 is running..  
[Spike] Running Spike simulator
[TOGSim] TOGSim is running.   
[TOGSim] Simulation of "/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0"
[Spike] Running Spike simulator
[TOGSim] TOGSim is running..  
[TOGSim] Simulation of "/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0"


In [2]:
!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep "Total execution cycle"
!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep "Total execution cycle"

[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164
[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510


### Single kernel mode (TODO: remove it?)

In [2]:
import torch
from torchvision.models import resnet18
from Scheduler.scheduler import PyTorchSimRunner
# Rebinds the global `device` to the simulator's custom device.
device = PyTorchSimRunner.setup_device().custom_device()

# NOTE(review): unlike the Scheduler sections, the model is not put in eval
# mode and the call is not wrapped in torch.no_grad() — confirm whether that
# is intended. The recorded run of this cell was interrupted.
input = torch.randn(1, 3, 224, 224).to(device=device)
model = resnet18().to(device=device)

opt_fn = torch.compile(dynamic=False)(model)
npu_out = opt_fn(input)

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module npu, skipping build step...
Loading extension module npu...


KeyboardInterrupt: 