## Training

In [None]:
import os
import sys
import torch
# Root of the PyTorchSim checkout; the TORCHSIM_DIR environment variable overrides the default.
base_dir = os.environ.get('TORCHSIM_DIR', '/workspace/PyTorchSim')
sys.path.append(base_dir)

# Device for the reference run. Despite its name this is "cuda" when a GPU is available.
if torch.cuda.is_available():
    cpu_device = "cuda"
else:
    cpu_device = "cpu"

# Must come after the sys.path change above (presumably the package is resolved from base_dir).
from Scheduler.scheduler import PyTorchSimRunner

# Device object handed back by the runner; the PyTorchSim cells move their tensors onto it.
npu_device = PyTorchSimRunner.setup_device().custom_device()

### Reference Backward Code (Eager PyTorch)

In [None]:
# Reseed so re-running this cell reproduces the same tensors.
torch.manual_seed(0)

# Draw order (input, weight, target) is significant: it fixes which random
# values each tensor receives. Only input and weight track gradients.
cpu_input = torch.randn(128, 128).to(cpu_device).requires_grad_(True)
cpu_weight = torch.randn(128, 128).to(cpu_device).requires_grad_(True)
cpu_target = torch.randn(128, 128).to(cpu_device)

# Reference path: plain (uncompiled) matmul followed by cross-entropy.
opt_fn = torch.matmul
loss_fn = torch.nn.CrossEntropyLoss()

cpu_out = opt_fn(cpu_input, cpu_weight)
cpu_loss = loss_fn(cpu_out, cpu_target)

# Fills cpu_input.grad and cpu_weight.grad.
cpu_loss.backward()

### PyTorchSim Backward Code

In [None]:
# Same seed as the reference cell so both runs start from identical random draws.
torch.manual_seed(0)

# Draw order (input, weight, target) is significant: it fixes which random
# values each tensor receives. Only input and weight track gradients.
npu_input = torch.randn(128, 128).to(npu_device).requires_grad_(True)
npu_weight = torch.randn(128, 128).to(npu_device).requires_grad_(True)
npu_target = torch.randn(128, 128).to(npu_device)

# Same computation as the reference path, but matmul goes through torch.compile.
opt_fn = torch.compile(torch.matmul)
loss_fn = torch.nn.CrossEntropyLoss()

npu_out = opt_fn(npu_input, npu_weight)
npu_loss = loss_fn(npu_out, npu_target)

# Fills npu_input.grad and npu_weight.grad.
npu_loss.backward()

In [None]:
def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):
    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):
        message = f"|{name} Test Passed|"
        print("-" * len(message))
        print(message)
        print("-" * len(message))
    else:
        message = f"|{name} Test Failed|"
        print("-" * len(message))
        print(message)
        print("-" * len(message))
        print("npu out: ", npu_out.cpu())
        print("cpu out: ", cpu_out)
        exit(1)

In [None]:
# Check each gradient from the compiled run against the eager reference.
for label, npu_grad, cpu_grad in (
    ("MatMul Input Grad", npu_input.grad, cpu_input.grad),
    ("MatMul Weight Grad", npu_weight.grad, cpu_weight.grad),
):
    test_result(label, npu_grad, cpu_grad)