# Predicting Zen 2 micro-architecture

The Zen 2 microarchitecture was introduced in 2019. The core features a 19-stage pipeline with a 4-way decoder.

Set up hyperparameters for the model

In [1]:
import os

# --- Graph / network dimensions ---
# TODO(Alex): future revisions of the dataset must embed the opcode count into
# the graph itself so that it no longer has to be hard-coded here.
num_opcodes = 21_000
embedding_size = 128
hidden_size = 64
num_heads = 2
# Zen 2 is known to feature 12 "ports".
output_size = 12

# --- Optimisation settings ---
batch_size = 4
learning_rate = 1e-3

# Ask PyTorch's CUDA allocator to use the cudaMallocAsync backend.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="backend:cudaMallocAsync")

Load the dataset
All tests were run on an AMD Ryzen 5 3600.

In [3]:
import model.utils
import torch
from torch_geometric.loader import DataLoader

# Build the dataset from two on-disk locations (presumably the basic blocks and
# the measurements taken on the Ryzen 5 3600 — verify against BasicBlockDataset)
# plus the opcode count configured earlier.
dataset = model.utils.BasicBlockDataset(
    "data/x86_64/basic_blocks",
    "data/x86_64/ryzen3600",
    num_opcodes,
)

# Mini-batches of `batch_size` graphs, drawn in shuffled order.
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

Set up the model

In [4]:
from model.GraphEncoder import GATEncoder
from model.Predictor import Predictor
import torch_geometric

# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# To force CPU execution instead:
# device = torch.device("cpu")

# NOTE(review): num_opcodes is passed as both the first and the last argument,
# while the num_heads hyperparameter is not used anywhere visible in this
# notebook — confirm against GATEncoder's signature that this is intended.
encoder = GATEncoder(num_opcodes, embedding_size, hidden_size, num_opcodes)
encoder = encoder.to(device)

# NOTE: from here on the name `model` refers to the Predictor instance and no
# longer to the imported `model` package.
model = Predictor(encoder, hidden_size, output_size)
model = model.to(device)
# Optional compilation step, currently disabled:
# model = torch_geometric.compile(model)

Run training loop

In [5]:
from model.model import train

# Schedule and checkpoint settings handed to train() below.
num_epochs = 2000
checkpoint_freq = 100
checkpoint_dir = "checkpoints/ryzen3600"

# Release cached, currently unused GPU memory before the run starts.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()

train(
    model,
    device,
    loader,
    num_epochs,
    batch_size,
    learning_rate,
    checkpoint_dir,
    checkpoint_freq,
)

  0%|          | 0/4216000 [00:00<?, ?it/s]

OutOfMemoryError: Allocation on device 0 would exceed allowed memory. (out of memory)
Currently allocated     : 5.84 GiB
Requested               : 743.41 MiB
Device limit            : 7.79 GiB
Free (according to CUDA): 15.62 MiB
PyTorch limit (set by user-supplied memory fraction)
                        : 17179869184.00 GiB

Save trained model into a file

In [4]:
# Persist the whole model object (not just its state_dict) to disk.
# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs("trained_models", exist_ok=True)
torch.save(model, "trained_models/ryzen3600.pt")

Model showcase

In [4]:
# By this point `model` names the Predictor instance (it was rebound in the
# model-setup cell), so `model.utils` would be looked up on the network rather
# than on the `model` package. Import the helpers by name instead; this does
# not rebind `model`.
from model.utils import estimate_cycles, print_port_pressure_table

# Pick a single sample; each dataset item unpacks into three values:
# the graph `bb`, a value `m` (presumably the measured target — confirm against
# BasicBlockDataset) and a `raw` mapping that carries the "source" text.
choice = dataset[140]
bb, m, raw = choice

print(bb.x)

# Move the node features and the connectivity to the model's device.
input_sequence = bb.x.to(device)
edge_index = bb.edge_index.to(device)

# Inference only: switch to evaluation mode (assumes Predictor is a
# torch.nn.Module) and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    out, _ = model(input_sequence, edge_index)

# Print the predicted per-port table next to the source listing, then the
# cycle estimate derived from the prediction, then `m` for comparison.
res = out.to("cpu").detach().numpy()
print_port_pressure_table(res, raw["source"])
print(estimate_cycles(out))
print(m)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Port  |    0   |    1   |    2   |    3   |    4   |    5   |    6   |    7   |    8   |    9   |   10   |   11   |
------------------------------------------------------------------------------------------------------
       | 0.00   | 0.00   | 2185678.00| 37210996.00| 1750582.88| 0.00   | 0.00   | 2526091.00| 2403171.00| 1974601.50| 1898305.50| 1627569.50| nopl	(%rax)
       | 0.00   | 0.00   | 771668.12| 13137992.00| 618063.88| 0.00   | 0.00   | 891871.06| 848464.25| 697149.81| 670211.81| 574634.38| testq	%r15, %r15
50348988.0
0.144
