In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate first: the data-source cell that follows requests competition
# data through kagglehub, which the note above says may be private.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Request the competition data through kagglehub and keep whatever it returns.
# NOTE(review): presumably this is a local directory path; the cells visible in
# this notebook read from a hard-coded /kaggle/input/... path instead and never
# reference this variable — confirm the two locations agree outside Kaggle.
predict_ai_model_runtime_path = kagglehub.competition_download('predict-ai-model-runtime')
# Install three utility scripts published under the `samihaija` account.
# NOTE(review): none of the returned values is used by the cells shown here.
samihaija_tpugraphsv1_layout_data_py_path = kagglehub.utility_script_install('samihaija/tpugraphsv1-layout-data-py')
samihaija_tpugraphsv1_implicit_py_path = kagglehub.utility_script_install('samihaija/tpugraphsv1-implicit-py')
samihaija_tpugraphsv1_tile_data_py_path = kagglehub.utility_script_install('samihaija/tpugraphsv1-tile-data-py')

print('Data source import complete.')


<a target="_blank" href="https://colab.research.google.com/github/balint-kiraly/predict-ai-model-runtime.git">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [None]:
import torch
import os
# Show the installed PyTorch build string (version plus CUDA tag). The
# torch-scatter wheel index used later in this notebook is keyed by this string.
print("PyTorch has version {}".format(torch.__version__))

PyTorch has version 2.6.0+cu124


In [None]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-2.6.0+cu124.html

Looking in links: https://pytorch-geometric.com/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt26cu124


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch import nn
from torch import Tensor
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch.utils.data import DataLoader, Dataset

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [None]:
def load_df(directory):
    """Load every ``.npz`` archive under ``directory/{train,valid,test}``.

    Args:
        directory: Folder that may contain ``train``, ``valid`` and/or ``test``
            sub-folders of ``.npz`` files.

    Returns:
        dict mapping each split name that exists on disk to a DataFrame with
        one row per archive. Columns are the arrays stored in the archive plus
        a ``'file'`` column holding the archive's file name. Splits whose
        folder is missing are left out of the dict.
    """
    splits = ("train", "valid", "test")
    dfs = {}

    for split in splits:
        split_dir = os.path.join(directory, split)
        if not os.path.isdir(split_dir):
            continue

        records = []
        # os.listdir order is arbitrary; sort so row order (and therefore
        # training order) is reproducible across machines and runs.
        for file in sorted(os.listdir(split_dir)):
            # Ignore stray non-archive entries instead of crashing in np.load.
            if not file.endswith(".npz"):
                continue
            # NpzFile keeps the underlying zip open; materialise the arrays and
            # close the handle right away so thousands of files do not leak.
            with np.load(os.path.join(split_dir, file)) as archive:
                record = dict(archive)
            record['file'] = file
            records.append(record)
        dfs[split] = pd.DataFrame.from_dict(records)
    return dfs

# Load the tile/xla collection; the result is a dict keyed by split name.
# NOTE(review): this is the hard-coded Kaggle mount point. If the directory is
# absent (e.g. when running outside Kaggle), load_df skips every split and
# returns an empty dict, so later `tile_xla["train"]` lookups would raise
# KeyError — confirm the path for the target environment.
tile_xla = load_df("/kaggle/input/predict-ai-model-runtime/npz_all/npz/tile/xla/")

In [None]:
class TileDataset(Dataset):
    """Dataset yielding one graph (with all of its tile configs) per item.

    Wraps a DataFrame whose rows carry the arrays ``config_feat``,
    ``node_feat``, ``node_opcode``, ``edge_index``, ``config_runtime`` and
    ``config_runtime_normalizers``.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(config_feat, node_feat, node_opcode, edge_index, target)``.

        ``target`` is ``config_runtime / config_runtime_normalizers`` min-max
        scaled to ``[0, 1]`` within the graph. When the scaling is undefined
        (all values identical, or the range is not finite) the target is all
        zeros rather than NaN, so degenerate graphs cannot poison training.
        """
        row = self.df.iloc[idx]

        # Features
        config_feat = torch.tensor(row['config_feat'].astype(np.float32))
        node_feat = torch.tensor(row['node_feat'].astype(np.float32))

        node_opcode = torch.tensor(row['node_opcode'].astype(np.int64))
        # Stored array has its two axes swapped relative to what the model's
        # graph layers receive. NOTE(review): presumably (E, 2) -> (2, E).
        edge_index = torch.tensor(np.swapaxes(row['edge_index'], 0, 1).astype(np.int64))

        # Target: normalized runtime
        target = (row['config_runtime'] / row['config_runtime_normalizers']).astype(np.float32)

        # Min-max scaling, guarded against a zero (or NaN) range.
        if target.size > 0:
            lowest = target.min()
            span = target.max() - lowest
            if span > 0:
                target = (target - lowest) / span
            else:
                target = np.zeros_like(target)
        target = torch.tensor(target)

        return config_feat, node_feat, node_opcode, edge_index, target

In [None]:
def pairwise_ranking_loss(preds, targets, margin=0.1):
    """Pairwise hinge ranking loss over all ordered pairs of configurations.

    For every pair ``(i, j)`` with ``targets[i] != targets[j]`` the predicted
    scores must be ordered like the targets by at least ``margin``:

        loss_ij = max(0, margin - sign(t_i - t_j) * (p_i - p_j))

    So a slower config (larger target) should get a larger score.

    Args:
        preds: Predicted scores, shape ``(N,)`` or ``(N, 1)``.
        targets: Ground-truth values, shape ``(N,)`` or ``(N, 1)``.
        margin: Required separation between the scores of a correctly
            ordered pair.

    Returns:
        Scalar tensor: mean hinge loss over all comparable pairs. If there is
        no comparable pair (fewer than two items, all targets tied, or targets
        are non-finite), a zero that is still attached to ``preds``' graph is
        returned, so ``backward()`` works and yields zero gradients rather
        than NaN.
    """
    # Flatten so (N,) and (N, 1) inputs broadcast identically.
    preds = preds.reshape(-1)
    targets = targets.reshape(-1)

    # [i, j] entries hold value_i - value_j.
    pred_diff = preds.unsqueeze(1) - preds.unsqueeze(0)
    target_diff = targets.unsqueeze(1) - targets.unsqueeze(0)

    # S_ij = +1 if i is slower than j, -1 if i is faster, 0 if tied.
    S = torch.sign(target_diff)

    # Only pairs with a well-defined, strict ordering contribute.
    mask = torch.isfinite(target_diff) & (S != 0)
    if not mask.any():
        # mean() over an empty selection would be NaN.
        return preds.sum() * 0.0

    loss = torch.nn.functional.relu(margin - S[mask] * pred_diff[mask])
    return loss.mean()

In [None]:
class Model(torch.nn.Module):
    """GraphSAGE encoder plus MLP head that scores every config of one graph.

    Node op-codes are embedded, concatenated with the raw node features and
    passed through a stack of ``SAGEConv`` layers. The node states are averaged
    into a single graph vector, which is paired with each configuration's
    feature row and fed to a small MLP that emits one score per configuration.

    Args:
        hidden_channels: Output widths of the stacked SAGEConv layers
            (at least one entry).
        graph_feats: Width of the final SAGEConv layer, i.e. of the graph vector.
        hidden_dim: Accepted for interface compatibility; this implementation
            does not read it.
    """

    def __init__(self, hidden_channels, graph_feats, hidden_dim):
        super().__init__()

        opcode_embed_dim = 32
        self.embedding = torch.nn.Embedding(120, opcode_embed_dim)

        # Layer widths: (140 node feats + 32 embedding) -> hidden_channels[0] -> ...
        widths = [opcode_embed_dim + 140] + list(hidden_channels)
        self.convs = torch.nn.ModuleList(
            [SAGEConv(w_in, w_out) for w_in, w_out in zip(widths[:-1], widths[1:])]
        )

        # Last graph layer maps the final hidden width to the graph embedding size.
        self.conv_final = SAGEConv(hidden_channels[-1], graph_feats)

        # Prediction head; its input is the graph embedding plus 24 config features.
        head_layers = [
            nn.Linear(graph_feats + 24, 128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 1),
        ]
        self.dense = torch.nn.Sequential(*head_layers)

    def forward(self, x_cfg: Tensor, x_feat: Tensor, x_op: Tensor, edge_index: Tensor) -> Tensor:
        """Return a flat tensor with one score per row of ``x_cfg``."""
        # Node input = raw features followed by the op-code embedding.
        node_state = torch.cat((x_feat, self.embedding(x_op)), dim=1)

        # Every graph layer, including the final one, is followed by a ReLU.
        for layer in [*self.convs, self.conv_final]:
            node_state = layer(node_state, edge_index).relu()

        # Mean over nodes gives one vector for the whole graph ...
        graph_vec = node_state.mean(dim=0)
        # ... which is copied once per configuration and placed after its features.
        graph_rows = graph_vec.repeat((len(x_cfg), 1))
        head_input = torch.cat((x_cfg, graph_rows), dim=1)

        scores = self.dense(head_input)
        return scores.flatten()

# Build the network on the selected device: three stacked SAGEConv layers of
# width 64/128/64 followed by a 128-wide final graph layer.
# Note: `hidden_dim` is required by Model.__init__ but is not read by it.
model = Model(hidden_channels=[64, 128, 64], graph_feats=128, hidden_dim=64).to(device)
print("Model corrected and re-created.")

Model corrected and re-created.


In [None]:
dataset = TileDataset(tile_xla["train"])
criterion_mae = torch.nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

epochs = 5
# scheduler.step() runs once per epoch, so the cosine half-period is tied to
# the number of epochs actually trained (a larger T_max would stop the decay
# part-way through the schedule).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

# Limit the memory usage -> otherwise crashes
# (the ranking loss builds N x N pair matrices)
MAX_RANKING_SAMPLES = 2000

model.train()
print("Starting training with memory optimization...")

for epoch in range(epochs):
    pbar = tqdm(range(len(dataset)))
    loss_sum = 0
    n = 0

    for i in pbar:
        cfg_ft, nd_ft, nd_op, ind, target = dataset[i]

        cfg_ft = cfg_ft.to(device)
        nd_ft = nd_ft.to(device)
        nd_op = nd_op.to(device)
        ind = ind.to(device)
        target = target.to(device)

        optimizer.zero_grad()

        # Forward pass: one score per configuration of this graph
        out = model(cfg_ft, nd_ft, nd_op, ind)

        # If there are too many configs, pick a random subset to compare
        if len(target) > MAX_RANKING_SAMPLES:
            # Create the index on the same device as the tensors it selects from.
            perm = torch.randperm(len(target), device=target.device)
            idx = perm[:MAX_RANKING_SAMPLES]
            loss_rank = pairwise_ranking_loss(out[idx], target[idx])
        else:
            loss_rank = pairwise_ranking_loss(out, target)

        loss_reg = criterion_mae(out, target)

        # Ranking is the main objective; the MAE term is a small regulariser.
        loss = loss_rank + 0.1 * loss_reg

        # A degenerate graph (e.g. all runtimes equal) can yield a NaN/inf loss;
        # back-propagating it would corrupt every weight, so skip that graph.
        if not torch.isfinite(loss):
            continue

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_sum += loss.item()
        n += 1
        pbar.set_description(f'Epoch {epoch+1}/{epochs} | Loss: {(loss_sum/n):.4f}')

    scheduler.step()

Starting training with memory optimization...


Epoch 1/5 | Loss: 0.8161: 100%|██████████| 5709/5709 [01:13<00:00, 77.23it/s] 
Epoch 2/5 | Loss: 0.0617: 100%|██████████| 5709/5709 [01:13<00:00, 78.18it/s]
Epoch 3/5 | Loss: 0.0633: 100%|██████████| 5709/5709 [01:13<00:00, 78.15it/s]
Epoch 4/5 | Loss: 0.0581: 100%|██████████| 5709/5709 [01:12<00:00, 78.39it/s]
Epoch 5/5 | Loss: 0.0803: 100%|██████████| 5709/5709 [01:12<00:00, 78.38it/s]


In [None]:
val_dataset = TileDataset(tile_xla["valid"])
tile_xla_predictions = []
model.eval()

print("Validating...")
pbar = tqdm(range(len(val_dataset)))
# Inference only: no gradients are needed anywhere in this loop.
with torch.no_grad():
    for i in pbar:
        cfg_ft, nd_ft, nd_op, ind, target = val_dataset[i]
        # Only the model inputs have to live on the device; the target is unused here.
        cfg_ft, nd_ft, nd_op, ind = (
            tensor.to(device) for tensor in (cfg_ft, nd_ft, nd_op, ind)
        )

        out = model(cfg_ft, nd_ft, nd_op, ind)

        # Lower score = faster predicted runtime. argsort is ascending, so the
        # first five indices are the five configs predicted to be fastest.
        ranking = np.argsort(out.cpu().numpy())
        tile_xla_predictions.append(ranking[:5])

def score_tile(predictions, df):
    """Average per-graph score of the submitted top configurations.

    For each row ``i`` of ``df`` the score is ``2 - predbest / best``, where
    ``best`` is the smallest value in that row's ``config_runtime`` and
    ``predbest`` is the smallest runtime among the indices ``predictions[i]``.
    A row scores 1.0 when the true fastest config is among the predictions.

    Args:
        predictions: Sequence with one array of config indices per row of ``df``.
        df: DataFrame with a ``config_runtime`` column.

    Returns:
        Mean of the per-row scores.
    """
    def _row_score(i):
        runtimes = df.iloc[i]['config_runtime']
        fastest_picked = min(runtimes[predictions[i]])
        fastest_overall = min(runtimes)
        return 2 - fastest_picked / fastest_overall

    total = sum(_row_score(i) for i in range(len(df)))
    return total / len(df)

# Score the top-5 validation picks against the measured runtimes. A graph
# contributes 1.0 when its true fastest config is among the five picks.
val_score = score_tile(tile_xla_predictions, tile_xla["valid"])
print(f"Validation Score: {val_score}")

Validating...


100%|██████████| 676/676 [00:01<00:00, 372.06it/s]


Validation Score: 0.948936010479559


In [None]:
test_dataset = TileDataset(tile_xla["test"])
test_predictions = []
model.eval()

print("Running inference on test set...")
pbar = tqdm(range(len(test_dataset)))
# Inference only: gradients are disabled for the whole loop.
with torch.no_grad():
    for i in pbar:
        cfg_ft, nd_ft, nd_op, ind, target = test_dataset[i]
        # Move just the model inputs to the device; the target is not used here.
        cfg_ft, nd_ft, nd_op, ind = (
            tensor.to(device) for tensor in (cfg_ft, nd_ft, nd_op, ind)
        )

        out = model(cfg_ft, nd_ft, nd_op, ind)

        # Ascending argsort: the first five indices have the lowest predicted scores.
        ranking = np.argsort(out.cpu().numpy())
        test_predictions.append(ranking[:5])

# Create Submission CSV
# Start from the sample submission and overwrite the TopConfigs of every
# tile:xla row for which a prediction exists; other rows are left untouched.
sub = pd.read_csv('/kaggle/input/predict-ai-model-runtime/sample_submission.csv')
unmatched_ids = []
for i, filename in enumerate(tile_xla["test"]['file'].values):
    # Strip the extension properly instead of assuming it is 4 characters long,
    # and avoid shadowing the builtin `id` at notebook scope.
    submission_id = 'tile:xla:' + os.path.splitext(filename)[0]
    row_mask = sub.ID == submission_id
    if not row_mask.any():
        # Without this check a prediction with no matching row is silently dropped.
        unmatched_ids.append(submission_id)
        continue
    sub.loc[row_mask, 'TopConfigs'] = ';'.join(test_predictions[i].astype(str))

if unmatched_ids:
    print(f"WARNING: {len(unmatched_ids)} prediction(s) had no matching ID in the sample submission, e.g. {unmatched_ids[:3]}")

sub.to_csv('submission.csv', index=False)
print("submission.csv created successfully.")
sub.head()

Running inference on test set...


  target = (target - min(target)) / (max(target) - min(target))
100%|██████████| 844/844 [00:02<00:00, 380.83it/s]


submission.csv created successfully.


Unnamed: 0,ID,TopConfigs
0,tile:xla:d6f5f54247bd1e58a10b9e7062c636ab,0;1;2;3;4
1,tile:xla:e3a655daa38e34ec240df959b650ac16,528;667;888;1094;396
2,tile:xla:f8c2c1a1098b2a361c26df668b286c87,84;40;204;12;189
3,tile:xla:4dd1716853ed46ee4e7d09ede1732de8,3939;1015;7320;903;8910
4,tile:xla:d0a69155b6340748c36724e4bfc34be3,576;159;236;650;151
