In [1]:
import torch
import torch.nn.functional as F

from mscgpt.data_handler import DataHandler
from mscgpt.tokenizer import Tokenizer
from mscgpt.gpt import MicroSCGPT, GeneExpressionRegressor

In [2]:
# --- Hardware ---------------------------------------------------------------
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Batching (passed to the DataHandler) -----------------------------------
CONTEXT_SIZE = 128
BATCH_SIZE = 32

# --- Model size -------------------------------------------------------------
N_EMBED = 256
N_LAYERS = 12
N_HEADS = 8

# --- Optimisation and evaluation schedule -----------------------------------
LEARNING_RATE = 1e-4
N_TRAIN_STEPS = 5000
INTERVAL_EVAL = 100  # an evaluation pass runs every INTERVAL_EVAL training steps
N_EVAL = 200         # number of batches per split used by each evaluation pass

In [3]:
# Build the tokenizer and the batch provider. The handler receives the batch
# size, the context size, the tokenizer's pad token and the target device.
tk = Tokenizer()
dh = DataHandler(BATCH_SIZE, CONTEXT_SIZE, tk.pad_token, device=DEVICE)

# Load the "tabmuris_A" pretraining dataset through the tokenizer and hand the
# result to the data handler; later cells draw batches with dh.get_batch(split).
tokenized_data = tk.load_pretraining_dataset("tabmuris_A")
dh.load_dataset(tokenized_data)

> Tokenizer: 2000 genes successfully loaded.
> Data Handler: Dataset successfully loaded.


In [4]:
def gene_expression_loss(true_counts, estimate):
    """Mean squared error between predicted and observed expression values.

    Both tensors are cast to ``torch.float`` before the comparison, so integer
    inputs are accepted.

    Args:
        true_counts: Tensor of reference values (the regression target).
        estimate: Tensor of model predictions.

    Returns:
        A 0-dim float tensor holding the mean squared error.
    """
    prediction = estimate.to(torch.float)
    target = true_counts.to(torch.float)
    return F.mse_loss(prediction, target)

@torch.no_grad()
def estimate_loss(model_gpt, model_gexpr, eval_iters):
    """Average the expression-regression loss over several batches per split.

    Both models are switched to evaluation mode for the duration of the call
    and are then returned to whichever mode they were in beforehand, even if a
    batch raises. Gradient tracking is disabled by the decorator.

    Args:
        model_gpt: Module called as ``model_gpt(x_gid, x_bin)`` to produce
            embeddings.
        model_gexpr: Module called on those embeddings to produce the estimate.
        eval_iters: Number of batches drawn from each split.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to a 0-dim tensor holding the
        mean loss over ``eval_iters`` batches of that split.
    """
    # Remember the incoming modes so that a model already in eval mode is not
    # silently flipped to training mode when this function returns.
    gpt_was_training = model_gpt.training
    gexpr_was_training = model_gexpr.training

    model_gpt.eval()
    model_gexpr.eval()
    out = {}
    try:
        for split in ('train', 'val'):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                # Batches come from the notebook-level data handler `dh`.
                x_gid, x_bin, x_cnt = dh.get_batch(split)
                x_emb = model_gpt(x_gid, x_bin)
                estimate = model_gexpr(x_emb)
                losses[k] = gene_expression_loss(x_cnt, estimate).item()
            out[split] = losses.mean()
    finally:
        # Restore the original modes even when a batch fails, so the models
        # kept alive in the kernel are never left stuck in eval mode.
        model_gpt.train(gpt_was_training)
        model_gexpr.train(gexpr_was_training)
    return out

In [5]:
# Build both networks and move each one to the target device.
gpt = MicroSCGPT(
    CONTEXT_SIZE, tk.bins_size, tk.vocab_size, N_HEADS, N_LAYERS, N_EMBED, N_EMBED
).to(DEVICE)
gexpr = GeneExpressionRegressor(tk.n_genes, N_EMBED, tk.n_genes).to(DEVICE)

# A single AdamW instance updates the parameters of both models jointly.
optimizer = torch.optim.AdamW(
    [*gpt.parameters(), *gexpr.parameters()],
    lr=LEARNING_RATE,
)

> MicroSCGPT: Model initialized with 5659648 parameters.


In [6]:
# Only query the GPU name when a GPU is actually in use: DEVICE falls back to
# "cpu", and torch.cuda.get_device_name() raises on a machine without CUDA/HIP.
if DEVICE.startswith("cuda"):
    print(f'> Starting training on {DEVICE}: {torch.cuda.get_device_name(0)}.')
else:
    print(f'> Starting training on {DEVICE}.')

# NOTE(review): the recorded output of this cell ends in an out-of-memory error
# raised during training. Lowering BATCH_SIZE is the usual first mitigation;
# TODO confirm against the model's actual activation footprint.
rec_loss = []  # training loss of every optimisation step, as Python floats
for step in range(N_TRAIN_STEPS):
    # Periodic evaluation: every INTERVAL_EVAL steps and on the final step.
    if step % INTERVAL_EVAL == 0 or step == N_TRAIN_STEPS - 1:
        # Kept under its own name so the evaluation dict is never confused
        # with the training loss tensor below.
        eval_loss = estimate_loss(gpt, gexpr, N_EVAL)
        print(f'Step {step}, loss={eval_loss["train"]:.4f} (train), {eval_loss["val"]:.4f} (val)')

    # Forward pass on one training batch.
    x_gid, x_bin, x_cnt = dh.get_batch("train")
    x_emb = gpt(x_gid, x_bin)
    estimate = gexpr(x_emb)
    loss = gene_expression_loss(x_cnt, estimate)
    rec_loss.append(loss.item())

    # Backward pass and parameter update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

> Starting training on cuda: Radeon RX 7900 XT.


  return torch._transformer_encoder_layer_fwd(
  return torch._transformer_encoder_layer_fwd(


Step 0, loss=1.2058 (train), 0.9587 (val)


OutOfMemoryError: HIP out of memory. Tried to allocate 3.82 GiB. GPU 0 has a total capacity of 19.98 GiB of which 3.68 GiB is free. Of the allocated memory 14.06 GiB is allocated by PyTorch, and 1.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_HIP_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)