# vAGI L-KAN train tren Modal.com (GPU A100 40GB)

Notebook nay chay truc tiep trong Modal Notebook runtime da cap GPU A100 40GB.

## Tham khao chinh
- Modal GPU docs (A100): https://modal.com/docs/guide/gpu
- Candle installation guide: https://huggingface.github.io/candle/guide/installation.html
- Candle CUDA feature flags: https://docs.rs/crate/candle-core/0.8.4/features
- TinyShakespeare corpus: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

## Output
- Checkpoint duoc luu local trong notebook runtime de ban download.
- Khong su dung thu vien `modal` Python SDK.


In [None]:
import os
import pathlib
import re
import shutil
import subprocess
import sys
import urllib.request

def run(cmd: str, check: bool = True, cwd: pathlib.Path | None = None, env: dict | None = None):
    print(f"$ {cmd}")
    completed = subprocess.run(
        cmd,
        shell=True,
        text=True,
        capture_output=True,
        cwd=str(cwd) if cwd else None,
        env=env,
    )
    if completed.stdout:
        print(completed.stdout)
    if completed.stderr:
        print(completed.stderr)
    if check and completed.returncode != 0:
        raise RuntimeError(f"Command failed ({completed.returncode}): {cmd}")
    return completed

# Show which Python interpreter the notebook kernel is running.
print("Python:", sys.version)


In [None]:
# --- Source repository: what to clone and where to put it ---
REPO_URL = "https://github.com/vietrix/vagi.git"
BRANCH = "main"
WORKDIR = pathlib.Path("/root/vagi")

# --- Training corpus: remote location and its path inside the checkout ---
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
DATA_PATH = WORKDIR / "data" / "input.txt"

# --- Training settings ---
TRAIN_STEPS = 5_000
BATCH_SIZE = 32
SEQ_LEN = 64
OUTPUT_CONST_PATH = "models/lkan-genesis.safetensors"

# --- Export destination for the finished checkpoint ---
EXPORT_DIR = pathlib.Path("/root/outputs")
EXPORT_NAME = "lkan-genesis-a10040gb.safetensors"

# Echo the effective configuration so it is recorded in the cell output.
print(f"Repo: {REPO_URL}")
print(f"Branch: {BRANCH}")
print(f"Workdir: {WORKDIR}")
print(f"Dataset: {DATA_PATH}")
print(f"Train steps: {TRAIN_STEPS} batch: {BATCH_SIZE} seq_len: {SEQ_LEN}")
print(f"Output const path: {OUTPUT_CONST_PATH}")
print(f"Export file: {EXPORT_DIR / EXPORT_NAME}")


In [None]:
# Stop right away if `nvidia-smi` fails: the rest of the notebook expects a GPU.
gpu = run("nvidia-smi", check=False)
gpu_missing = gpu.returncode != 0
if gpu_missing:
    raise RuntimeError(
        "Khong tim thay GPU trong runtime hien tai. Hay chay notebook nay tren Modal runtime co A100 40GB."
    )

# Informational probes only; a failure here does not stop the notebook.
for probe in ("which nvcc", "nvcc --version", "df -h"):
    run(probe, check=False)


In [None]:
# System packages needed to build the Rust workspace. DEBIAN_FRONTEND keeps
# apt from stopping on interactive configuration prompts, which would hang
# a non-interactive notebook cell.
run("DEBIAN_FRONTEND=noninteractive apt-get -y update")
run("DEBIAN_FRONTEND=noninteractive apt-get -y install build-essential pkg-config libssl-dev curl git")

rustup_bin = pathlib.Path("/root/.cargo/bin/rustup")
if not rustup_bin.exists():
    # Restrict curl to HTTPS with TLS >= 1.2 before piping the installer to sh.
    run("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal")
    # The pipeline's exit status is that of `sh`, so a failed download can go
    # unnoticed; verify the result explicitly.
    if not rustup_bin.exists():
        raise RuntimeError(f"rustup installation did not produce {rustup_bin}")

# Put cargo's bin directory first on PATH, without piling up duplicate
# entries when this cell is executed more than once.
cargo_bin = str(rustup_bin.parent)
path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p and p != cargo_bin]
os.environ["PATH"] = os.pathsep.join([cargo_bin, *path_entries])

run("rustup default stable")
run("rustc --version")
run("cargo --version")


In [None]:
# Start from a clean checkout: remove whatever is already at WORKDIR.
if WORKDIR.exists():
    print("Removing existing repo at", WORKDIR)
    shutil.rmtree(WORKDIR)

# Shallow, single-branch clone into WORKDIR.
clone_cmd = " ".join(["git clone --depth 1", "--branch", BRANCH, REPO_URL, str(WORKDIR)])
run(clone_cmd)

# Show the checked-out commit and the top-level layout in the cell output.
run("git rev-parse --short HEAD", cwd=WORKDIR)
run("ls -la", cwd=WORKDIR, check=False)


In [None]:
DATA_PATH.parent.mkdir(parents=True, exist_ok=True)

# Treat an empty file the same as a missing one so it is fetched again.
if not DATA_PATH.exists() or DATA_PATH.stat().st_size == 0:
    print("Downloading dataset from", DATA_URL)
    # Download next to the target and rename on success, so an interrupted
    # transfer never leaves a truncated file at DATA_PATH.
    partial_path = DATA_PATH.with_name(DATA_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(DATA_URL, partial_path)
        partial_path.replace(DATA_PATH)
    finally:
        # No-op after a successful rename; cleans up after a failed download.
        partial_path.unlink(missing_ok=True)

print("Dataset:", DATA_PATH)
print("Size bytes:", DATA_PATH.stat().st_size)
# Read only the characters needed for the preview rather than the whole file.
with DATA_PATH.open(encoding="utf-8", errors="ignore") as fh:
    preview = fh.read(400)
print("Preview:\n", preview)


In [None]:
def replace_const(text: str, name: str, value: str, as_str: bool = False) -> str:
    """Rewrite the first ``const <name>: <type> = <value>;`` declaration in *text*.

    The declaration is replaced by ``const <name>: usize = <value>;`` or, when
    *as_str* is true, by ``const <name>: &str = "<value>";``. Anything before
    the ``const`` keyword (for example ``pub``) is left untouched.

    Args:
        text: Rust source code to patch.
        name: Identifier of the constant to rewrite.
        value: New value. Inserted verbatim for numeric constants; for string
            constants, backslashes and double quotes are escaped so the result
            is a valid Rust string literal.
        as_str: Emit a ``&str`` constant instead of a ``usize`` one.

    Returns:
        The patched source text.

    Raises:
        RuntimeError: If no declaration of *name* is found.
    """
    # Escape the name and tolerate any spacing around ':' and '='.
    pattern = rf"\bconst\s+{re.escape(name)}\s*:\s*[^=]+=\s*[^;]+;"
    if as_str:
        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
        repl = f'const {name}: &str = "{escaped}";'
    else:
        repl = f"const {name}: usize = {value};"
    # A callable replacement is inserted literally; a plain string would be
    # processed as a regex template and choke on backslashes in the value.
    updated, n = re.subn(pattern, lambda _match: repl, text, count=1)
    if n == 0:
        raise RuntimeError(f"Cannot find const {name} in train_lkan.rs")
    return updated

# Enable candle-core's "cuda" feature in kernel/Cargo.toml.
kernel_cargo = WORKDIR / "kernel" / "Cargo.toml"
cargo_text = kernel_cargo.read_text(encoding="utf-8")
m = re.search(r"^candle-core\s*=\s*(.+)$", cargo_text, flags=re.MULTILINE)
if not m:
    raise RuntimeError("Cannot find candle-core dependency")
current = m.group(0)

# Parse the existing feature list instead of matching one exact spelling, so
# forms such as `features = ["cuda", "cudnn"]` count as already enabled.
features_m = re.search(r"features\s*=\s*\[([^\]]*)\]", current)
features = re.findall(r"\"([^\"]+)\"", features_m.group(1)) if features_m else []

if "cuda" not in features:
    # Accept either `version = "x"` inside an inline table or the bare
    # `candle-core = "x"` form. The bare form is anchored at the start of the
    # value so an unrelated quoted string (e.g. a git URL) is never mistaken
    # for the version.
    vm = re.search(r"\bversion\s*=\s*\"([^\"]+)\"", current) or re.match(r"\"([^\"]+)\"", m.group(1).strip())
    if not vm:
        raise RuntimeError(f"Cannot parse candle-core version from: {current}")
    version = vm.group(1)
    # Keep any features that were already requested and append "cuda".
    feature_list = ", ".join(f'"{feat}"' for feat in [*features, "cuda"])
    # NOTE(review): keys other than version/features on the original line are
    # not carried over - confirm none are needed.
    newline = f"candle-core = {{ version = \"{version}\", features = [{feature_list}] }}"
    kernel_cargo.write_text(cargo_text.replace(current, newline), encoding="utf-8")
    print("Patched", kernel_cargo)
else:
    print("candle-core already has cuda feature")

# Patch kernel/src/bin/train_lkan.rs in memory, then write it back once.
train_rs = WORKDIR / "kernel" / "src" / "bin" / "train_lkan.rs"
train_src = train_rs.read_text(encoding="utf-8")

# Overwrite the Rust constants with the values configured in this notebook.
train_src = replace_const(train_src, "OUTPUT_PATH", OUTPUT_CONST_PATH, as_str=True)
for const_name, const_value in (
    ("TRAIN_STEPS", TRAIN_STEPS),
    ("BATCH_SIZE", BATCH_SIZE),
    ("SEQ_LEN", SEQ_LEN),
):
    train_src = replace_const(train_src, const_name, str(const_value))

# Change every `<field>: 192,` initializer for these fields to 128.
for dim_field in ("hidden_dim", "in_dim", "out_dim"):
    train_src = train_src.replace(f"{dim_field}: 192,", f"{dim_field}: 128,")

# Replace the hard-coded CPU device with a CUDA-first selection that falls
# back to CPU; skipped when the source already calls Device::new_cuda(0).
if "Device::new_cuda(0)" not in train_src:
    cpu_line = "let device = Device::Cpu;"
    device_block = """let device = match Device::new_cuda(0) {
        Ok(dev) => {
            println!(\"using CUDA device 0\");
            dev
        }
        Err(err) => {
            println!(\"CUDA unavailable ({err}), fallback to CPU\");
            Device::Cpu
        }
    };"""
    if cpu_line not in train_src:
        raise RuntimeError("Cannot patch device selection block in train_lkan.rs")
    train_src = train_src.replace(cpu_line, device_block)

train_rs.write_text(train_src, encoding="utf-8")
print("Patched", train_rs)


In [None]:
# Build a dedicated environment for the Rust/CUDA toolchain.
env = os.environ.copy()

# Join only non-empty entries: an empty element (e.g. the trailing ':' left
# when the variable was previously unset) is interpreted as the current
# directory by both the shell and the dynamic loader.
env["PATH"] = os.pathsep.join(
    entry for entry in ("/root/.cargo/bin", "/usr/local/cuda/bin", env.get("PATH", "")) if entry
)
env["CUDA_HOME"] = "/usr/local/cuda"
env["LD_LIBRARY_PATH"] = os.pathsep.join(
    entry for entry in ("/usr/local/cuda/lib64", env.get("LD_LIBRARY_PATH", "")) if entry
)

run("rustc --version", env=env)
run("cargo --version", env=env)
# Compile in release mode and run the training binary from the repo root.
run("cargo run -p vagi-kernel --release --bin train_lkan", cwd=WORKDIR, env=env)


In [None]:
# Look first at the path that was written into the trainer's OUTPUT_PATH
# constant, so changing OUTPUT_CONST_PATH does not break the export. The
# fixed names are kept as fallbacks; dict.fromkeys drops duplicates while
# preserving order.
candidates = list(
    dict.fromkeys(
        [
            WORKDIR / OUTPUT_CONST_PATH,
            WORKDIR / "models" / "lkan-genesis.safetensors",
            WORKDIR / "models" / "lkan-gen2.safetensors",
        ]
    )
)
checkpoint = next((p for p in candidates if p.exists()), None)
if checkpoint is None:
    tried = ", ".join(str(p) for p in candidates)
    raise FileNotFoundError(f"Training finished but no checkpoint found (tried: {tried})")

# Copy the checkpoint (with metadata) to the export directory.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
export_path = EXPORT_DIR / EXPORT_NAME
shutil.copy2(checkpoint, export_path)

print("Checkpoint source:", checkpoint)
print("Checkpoint exported:", export_path)
print("Size MB:", round(export_path.stat().st_size / (1024 * 1024), 2))
# Print a checksum so the downloaded file can be verified; best effort only.
run(f"sha256sum {export_path}", check=False)
print("Done. Download file tu", export_path)
