## 🔧 Environment Check

In [7]:
!pip install torchvision

Collecting torchvision
  Downloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.22.1


In [1]:
import torch
print(torch.cuda.is_available())

False


I have a mac :\)

## 📦 Dataset Load & Preprocessing	

In [2]:
from datasets import load_dataset
from torchvision import transforms
from transformers import AutoTokenizer
from PIL import Image

# 1️⃣  Load data
ds_train, ds_test = load_dataset(
    "CADCODER/GenCAD-Code",
    split=["train", "test"],
    num_proc=8          # 8 is usually plenty on an M1 Pro
)

# 2️⃣  Image transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def preprocess(example):
    """
    - Resize image & convert to tensor
    - Keep the CADQuery script under the key `cadquery`
    """
    return {
        "image": transform(example["image"].convert("RGB")),
        "cadquery": example["cadquery"],
    }

ds_train = ds_train.map(preprocess)
ds_test  = ds_test.map(preprocess)

# 3️⃣  Tokeniser (GPT-2)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token      # GPT-2 has no pad token by default

def tokenize(example):
    tokens = tokenizer(
        example["cadquery"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    example["input_ids"] = tokens["input_ids"][0]
    example["attention_mask"] = tokens["attention_mask"][0]
    return example

# Keep only what we need: image + tokenised IDs/mask
keep_cols = ["image", "input_ids", "attention_mask"]

ds_train = ds_train.map(tokenize, remove_columns=[c for c in ds_train.column_names if c not in keep_cols])
ds_test  = ds_test.map(tokenize,  remove_columns=[c for c in ds_test.column_names  if c not in keep_cols])

# 4️⃣  Tell 🤗 Datasets to yield PyTorch tensors
ds_train.set_format(type="torch")
ds_test.set_format(type="torch")

print("Columns now:", ds_train.column_names)
print("Sample shapes — img:", ds_train[0]["image"].shape,
      "| ids:", ds_train[0]["input_ids"].shape)

Setting num_proc from 8 to 2 for the train split as it only contains 2 shards.


Generating train split:   0%|          | 0/147289 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/7355 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/8204 [00:00<?, ? examples/s]

Map:   0%|          | 0/147289 [00:00<?, ? examples/s]

Map:   0%|          | 0/7355 [00:00<?, ? examples/s]

Map:   0%|          | 0/147289 [00:00<?, ? examples/s]

Map:   0%|          | 0/7355 [00:00<?, ? examples/s]

Columns now: ['image', 'input_ids', 'attention_mask']
Sample shapes — img: torch.Size([3, 224, 224]) | ids: torch.Size([256])


In [3]:
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.8.0.dev20250629-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.8.0.dev20250628-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.2 kB)
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.8.0.dev20250627-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.2 kB)
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.8.0.dev20250626-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.2 kB)
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.8.0.dev20250625-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.2 kB)
  Downloading https://download.pytorch.or

In [4]:
import torch
print(torch.backends.mps.is_available())   # → True


True


In [8]:
import torch
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using", DEVICE)


Using mps


In [5]:
from torch.utils.data import DataLoader

BATCH_SIZE = 8              # fits into 16 GB unified memory
NUM_WORKERS = 4             # safe on M1 Pro

train_loader = DataLoader(ds_train, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=NUM_WORKERS)
test_loader  = DataLoader(ds_test,  batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=NUM_WORKERS)

## Baseline model (ResNet18 encoder + LSTM decoder)



In [6]:
import torch.nn as nn
import torchvision.models as models

class Img2Code(nn.Module):
    def __init__(self, vocab, embed=256, hidden=512):
        super().__init__()

        # ➊ image encoder (pre-trained ResNet18, last layer replaced)
        self.cnn = models.resnet18(weights="IMAGENET1K_V1")
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, embed)

        # ➋ text decoder
        self.embed = nn.Embedding(vocab, embed)
        self.lstm  = nn.LSTM(embed, hidden, batch_first=True)
        self.fc    = nn.Linear(hidden, vocab)

    def forward(self, img, seq):
        """
        img : [B,3,224,224]
        seq : [B,T]   (teacher forcing tokens)
        """
        feat = self.cnn(img)                       # [B,embed]
        emb  = self.embed(seq)                     # [B,T,embed]
        emb[:,0,:] = feat                          # inject image at <bos>
        out, _ = self.lstm(emb)
        return self.fc(out)                        # [B,T,vocab]

In [9]:
model = Img2Code(len(tokenizer)).to(DEVICE)

In [10]:
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

EPOCHS = 1        # adjust ↑ when you have more time
LR     = 1e-4

optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

for epoch in range(EPOCHS):
    model.train()
    total = 0.0
    for batch in train_loader:
        imgs  = batch["image"].to(DEVICE)
        ids   = batch["input_ids"].to(DEVICE)

        # Teacher forcing: predict token t+1 from tokens ≤ t
        logits = model(imgs, ids[:,:-1])
        loss   = criterion(
            logits.reshape(-1, logits.size(-1)),
            ids[:,1:].reshape(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total += loss.item()

    print(f"Epoch {epoch+1} | loss = {total/len(train_loader):.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [None]:
@torch.inference_mode()
def generate(img, max_len=120):
    model.eval()
    img = img.unsqueeze(0).to(DEVICE)
    seq = torch.tensor([[tokenizer.bos_token_id]], device=DEVICE)

    for _ in range(max_len):
        logits = model(img, seq)
        next_id = logits[:,-1].argmax(-1, keepdim=True)
        seq = torch.cat([seq, next_id], dim=1)
        if next_id.item() == tokenizer.eos_token_id:
            break

    return tokenizer.decode(seq[0,1:].tolist(), skip_special_tokens=True)

In [None]:
from metrics.valid_syntax_rate import evaluate_syntax_rate_simple
from metrics.best_iou import get_iou_best

# ── generate for 10 test samples just to check the pipeline ──
codes_pred, codes_gt = {}, {}
for i, sample in enumerate(ds_test.shuffle(seed=42).select(range(10))):
    gen_code = generate(sample["image"].to(DEVICE))
    codes_pred[f"pred_{i}"] = gen_code
    codes_gt  [f"pred_{i}"] = sample["cadquery"]

vsr = evaluate_syntax_rate_simple(codes_pred)
print("Valid-syntax rate (10 samples):", vsr)

ious = [get_iou_best(codes_pred[k], codes_gt[k]) for k in codes_pred]
print("Avg IOU (10 samples):", sum(ious)/len(ious))