# MIRAS Simple Runner

A minimal four-cell notebook for running the MIRAS language model:
1. **Config** - Set hyperparameters
2. **Init** - Load data and build model
3. **Train** - Train the model
4. **Generate** - Generate text

In [None]:
# === CONFIG ===
# All tunable settings for the notebook live in this cell; the cells below only read them.

# Dataset
# URL of the plain-text corpus that the init cell downloads and tokenises per character.
DATASET_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Model architecture
N_EMBD = 128          # Embedding dimension (passed to the model as d_model)
N_LAYERS = 2          # Number of MIRAS blocks
BLOCK_SIZE = 64       # Context length: window size for batches and size of the position table
MEMORY_TYPE = 'deep'  # 'linear' or 'deep'
ATTENTIONAL_BIAS = 'l2'  # 'l2', 'lp', or 'huber'
RETENTION = 'l2'      # 'l2', 'kl', or 'elastic'

# Training
BATCH_SIZE = 32       # Sequences sampled per batch
MAX_ITERS = 5000      # Number of optimizer steps
EVAL_INTERVAL = 500   # Train/val loss is estimated every this many steps
LEARNING_RATE = 1e-3  # AdamW learning rate

# Generation
MAX_NEW_TOKENS = 200  # Characters to sample in the generate cell
TEMPERATURE = 1.0     # Logits are divided by this before sampling

In [None]:
# === INIT ===

import torch
import torch.nn as nn
import torch.nn.functional as F
import requests
from typing import Optional, Tuple, Callable

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# --- Data ---
# Download the corpus. The timeout stops the cell from hanging forever on a dead
# connection, and raise_for_status() prevents an HTTP error page from being
# silently tokenised and used as training text.
_response = requests.get(DATASET_URL, timeout=30)
_response.raise_for_status()
text = _response.text

# Character-level vocabulary: each distinct character gets an integer id,
# assigned in sorted order so the mapping is reproducible across runs.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of token ids for string `s` (KeyError on an unseen character)."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of token ids `l`."""
    return ''.join([itos[i] for i in l])


# Encode the whole corpus once, then hold out the final 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample one random batch of (input, target) windows.

    Args:
        split: 'train' reads from train_data; any other value reads from val_data.

    Returns:
        A pair (x, y) of tensors stacked from BATCH_SIZE windows of length
        BLOCK_SIZE and moved to `device`; y is x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Exclusive upper bound on the start offset: leaves room for BLOCK_SIZE
    # inputs plus the one extra token needed by the shifted targets.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + BLOCK_SIZE])
        targets.append(source[start + 1:start + BLOCK_SIZE + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Quick sanity check: report the vocabulary size and the number of training tokens.
print(f"Vocab size: {vocab_size}, Train tokens: {len(train_data):,}")

# --- Model Components ---
def l2_loss(pred, target):
    """Half the squared error between `pred` and `target`, summed over the last dim."""
    residual = pred - target
    return 0.5 * residual.pow(2).sum(dim=-1)

def lp_loss(pred, target, p=3):
    """Sum over the last dim of |pred - target| raised to the power `p`."""
    magnitude = (pred - target).abs()
    return magnitude.pow(p).sum(dim=-1)

def huber_loss(pred, target, delta):
    """Huber loss summed over the last dim.

    Elements with |pred - target| <= delta contribute the quadratic term
    0.5 * diff**2; the rest contribute the linear term
    delta * (|diff| - 0.5 * delta). `delta` is broadcast against the difference.
    """
    diff = pred - target
    magnitude = diff.abs()
    quadratic = 0.5 * diff ** 2
    linear = delta * (magnitude - 0.5 * delta)
    per_element = torch.where(magnitude <= delta, quadratic, linear)
    return per_element.sum(dim=-1)

def l2_retention_update(W, grad, alpha, eta):
    """Decay the weights by `alpha`, then take a gradient step of size `eta`."""
    retained = alpha * W
    correction = eta * grad
    return retained - correction

def kl_retention_update(log_W, grad, alpha, eta, c=1.0):
    """Update the weights in log space and map them back through a softmax.

    Returns:
        (new log-weights, c * softmax(new log-weights) over the last dim).
    """
    updated_log_W = alpha * log_W - eta * grad
    normalised = F.softmax(updated_log_W, dim=-1)
    return updated_log_W, c * normalised

def elastic_net_update(W, grad, lambda_decay, zeta_lr, gamma_l1):
    """Decayed gradient step followed by soft-thresholding at `gamma_l1`.

    Entries of the intermediate result whose magnitude is at most `gamma_l1`
    become zero; the others are shrunk towards zero by `gamma_l1`.
    """
    stepped = lambda_decay * W - zeta_lr * grad
    shrunk_magnitude = F.relu(stepped.abs() - gamma_l1)
    return stepped.sign() * shrunk_magnitude

class KeyValueProjection(nn.Module):
    """Three independent bias-free linear maps producing keys, values and queries."""

    def __init__(self, d_in, d_out):
        super().__init__()
        # Created in K, V, Q order so parameter registration (and RNG use) is stable.
        for attr in ("W_K", "W_V", "W_Q"):
            setattr(self, attr, nn.Linear(d_in, d_out, bias=False))

    def forward(self, x):
        """Return the tuple (keys, values, queries) computed from `x`."""
        keys = self.W_K(x)
        values = self.W_V(x)
        queries = self.W_Q(x)
        return keys, values, queries

class MIRASLayer(nn.Module):
    """Sequence layer backed by a per-sequence memory that is rewritten at every timestep.

    For each position t, forward():
      1. scores how well the current memory maps key k_t to value v_t using the
         configured attentional-bias loss,
      2. takes the gradient of that loss with respect to the memory weights,
      3. applies the configured retention rule to obtain the next memory, and
      4. reads the updated memory with query q_t to produce the output at t.

    Args:
        d: Feature width of the input and of the keys, values and queries.
        memory_type: 'linear' selects a single (d, d) matrix memory; any other
            value selects the two-matrix memory used by memory_forward_deep.
        attentional_bias: 'l2', 'lp' or 'huber'. get_loss falls through to the
            Huber branch for every value other than 'l2'/'lp', but delta_proj
            only exists when the value is exactly 'huber'.
        retention: 'l2', 'kl' or 'elastic'. apply_retention falls through to
            the elastic-net branch for every value other than 'l2'/'kl', but
            gamma only exists when the value is exactly 'elastic'.
        expansion: Hidden-width multiplier of the two-matrix memory.
        p: Exponent handed to lp_loss.
        q: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, d, memory_type='deep', attentional_bias='l2', retention='l2', expansion=4, p=3, q=4):
        super().__init__()
        self.d, self.memory_type, self.attentional_bias, self.retention = d, memory_type, attentional_bias, retention
        self.p, self.q = p, q
        self.kv_proj = KeyValueProjection(d, d)

        if memory_type == 'linear':
            # Initial matrix memory: a fixed all-zero buffer (not a trainable parameter).
            self.register_buffer('M_init', torch.zeros(d, d))
        else:
            # Trainable initial weights of the two-matrix memory, plus the
            # LayerNorm applied inside memory_forward_deep.
            self.W1_init = nn.Parameter(torch.randn(d, d * expansion) * 0.02)
            self.W2_init = nn.Parameter(torch.randn(d * expansion, d) * 0.02)
            self.ln = nn.LayerNorm(d)

        if attentional_bias == 'huber':
            # Produces one raw threshold per position; softplus is applied in get_loss.
            self.delta_proj = nn.Linear(d, 1)

        # Raw scalars; apply_retention maps alpha through sigmoid and eta
        # through softplus before use.
        self.alpha = nn.Parameter(torch.ones(1) * 0.9)
        self.eta = nn.Parameter(torch.ones(1) * 0.1)
        if retention == 'kl':
            # Scale applied to the softmax output in kl_retention_update (used as-is).
            self.c = nn.Parameter(torch.ones(1))
        if retention == 'elastic':
            # Soft-threshold level handed to elastic_net_update (used as-is).
            self.gamma = nn.Parameter(torch.ones(1) * 0.01)

    def memory_forward_deep(self, x, W1, W2):
        """Apply the two-matrix memory to `x` with a residual connection.

        Computes x + LayerNorm(gelu(x @ W2^T) @ W1^T). forward() calls this with
        x of shape (B, 1, d), W1 of shape (B, d, d * expansion) and W2 of shape
        (B, d * expansion, d).
        """
        h = F.gelu(x @ W2.transpose(-2, -1))
        return x + self.ln(h @ W1.transpose(-2, -1))

    def get_loss(self, pred, target, x_t=None):
        """Return the scalar inner loss (summed over all elements) for one timestep.

        `x_t` is only read in the Huber branch, where it feeds delta_proj to
        produce a positive threshold via softplus.
        """
        if self.attentional_bias == 'l2':
            return l2_loss(pred, target).sum()
        elif self.attentional_bias == 'lp':
            return lp_loss(pred, target, self.p).sum()
        else:
            return huber_loss(pred, target, F.softplus(self.delta_proj(x_t))).sum()

    def apply_retention(self, W, grad, log_W=None):
        """Produce the next memory weights from the current ones and the inner gradient.

        Returns:
            (new_weights, new_log_weights). The second item is None except for
            the 'kl' rule, which tracks the weights in log space.
        """
        # Constrain the raw parameters: alpha into (0, 1), eta to be positive.
        alpha, eta = torch.sigmoid(self.alpha), F.softplus(self.eta)
        if self.retention == 'l2':
            return l2_retention_update(W, grad, alpha, eta), None
        elif self.retention == 'kl':
            # First call without a cached log: derive it from W, clamped to avoid log(0).
            log_W = log_W if log_W is not None else torch.log(W.clamp(min=1e-10))
            log_W_new, W_new = kl_retention_update(log_W, grad, alpha, eta, self.c)
            return W_new, log_W_new
        else:
            return elastic_net_update(W, grad, alpha, eta, self.gamma), None

    def forward(self, x):
        """Run the memory over the sequence and return one output per position.

        `x` is projected to keys/values/queries whose shape is unpacked as
        (B, T, D); the result stacks T per-step outputs along dim 1.
        """
        k, v, q = self.kv_proj(x)
        B, T, D = k.shape
        outputs = []

        # The inner update needs autograd even when the caller disabled it
        # (e.g. when invoked under torch.no_grad()).
        with torch.enable_grad():
            if self.memory_type == 'linear':
                # One independent copy of the initial memory per batch element.
                M = self.M_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                for t in range(T):
                    k_t, v_t, q_t = k[:, t], v[:, t], q[:, t]
                    # Fresh leaf so the inner gradient is taken w.r.t. the current memory only.
                    M_leaf = M.detach().requires_grad_(True)
                    pred = torch.einsum('bde,be->bd', M_leaf, k_t)
                    loss = self.get_loss(pred, v_t, x[:, t] if self.attentional_bias == 'huber' else None)
                    # Called without create_graph, so `grad` carries no autograd history.
                    grad = torch.autograd.grad(loss, M_leaf)[0]
                    M, _ = self.apply_retention(M, grad)
                    # Read the updated memory with the query.
                    outputs.append(torch.einsum('bde,be->bd', M, q_t))
            else:
                # One independent copy of the initial weights per batch element.
                W1 = self.W1_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                W2 = self.W2_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                log_W1, log_W2 = None, None
                if self.retention == 'kl':
                    # The KL rule works on softmax-normalised weights and their logs.
                    W1, W2 = F.softmax(W1, dim=-1), F.softmax(W2, dim=-1)
                    log_W1, log_W2 = torch.log(W1.clamp(min=1e-10)), torch.log(W2.clamp(min=1e-10))

                for t in range(T):
                    k_t, v_t, q_t = k[:, t], v[:, t], q[:, t]
                    # Fresh leaves so the inner gradient is taken w.r.t. the current weights only.
                    W1_leaf, W2_leaf = W1.detach().requires_grad_(True), W2.detach().requires_grad_(True)
                    pred = self.memory_forward_deep(k_t.unsqueeze(1), W1_leaf, W2_leaf).squeeze(1)
                    loss = self.get_loss(pred, v_t, x[:, t] if self.attentional_bias == 'huber' else None)
                    # Called without create_graph, so the gradients carry no autograd history.
                    grad1, grad2 = torch.autograd.grad(loss, [W1_leaf, W2_leaf])
                    W1, log_W1 = self.apply_retention(W1, grad1, log_W1)
                    W2, log_W2 = self.apply_retention(W2, grad2, log_W2)
                    # NOTE(review): the read uses detached weights, so in this branch the
                    # output is connected to the autograd graph only through q_t and
                    # self.ln (unlike the linear branch, which reads the non-detached M).
                    # Confirm this is intended.
                    outputs.append(self.memory_forward_deep(q_t.unsqueeze(1), W1.detach(), W2.detach()).squeeze(1))

        return torch.stack(outputs, dim=1)

class MIRASBlock(nn.Module):
    """Pre-norm residual block: a MIRAS memory sub-layer followed by a feed-forward sub-layer."""

    def __init__(self, d_model, memory_type, attentional_bias, retention, ffn_mult=4):
        super().__init__()
        hidden_width = d_model * ffn_mult
        # Sub-modules are created in the same order as they are applied.
        self.ln1 = nn.LayerNorm(d_model)
        self.memory = MIRASLayer(d_model, memory_type, attentional_bias, retention)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, d_model),
        )

    def forward(self, x):
        """Apply both sub-layers, each as `input + sublayer(LayerNorm(input))`."""
        memory_out = self.memory(self.ln1(x))
        after_memory = x + memory_out
        ffn_out = self.ffn(self.ln2(after_memory))
        return after_memory + ffn_out

class MIRASLanguageModel(nn.Module):
    """Token-level language model: embeddings, a stack of MIRAS blocks and a tied output head.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the embeddings and of every block.
        n_layers: Number of MIRASBlock layers.
        memory_type, attentional_bias, retention: Forwarded unchanged to every block.
        block_size: Longest sequence the model accepts (size of the position table).
    """

    def __init__(self, vocab_size, d_model, n_layers, memory_type='deep', attentional_bias='l2', retention='l2', block_size=128):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(block_size, d_model)
        self.layers = nn.ModuleList([MIRASBlock(d_model, memory_type, attentional_bias, retention) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying: the input embedding and the output projection share one matrix.
        self.token_embedding.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(m, nn.Linear):
            torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                torch.nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids with shape (B, T), T <= block_size.
            targets: Optional integer tensor with the same shape as `idx`.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is a
            scalar tensor, or None when `targets` is None.

        Raises:
            ValueError: If T exceeds block_size.
        """
        _, T = idx.shape
        # Fail early with a readable message; otherwise the position lookup
        # fails with an opaque index error (a device-side assert on CUDA).
        if T > self.block_size:
            raise ValueError(
                f"Sequence length {T} exceeds block_size {self.block_size}"
            )
        positions = torch.arange(T, device=idx.device)
        x = self.token_embedding(idx) + self.position_embedding(positions)
        for layer in self.layers:
            x = layer(x)
        logits = self.lm_head(self.ln_f(x))
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """Autoregressively append `max_new_tokens` tokens to `idx`.

        Args:
            idx: Integer tensor of shape (B, T) holding the prompt (T >= 1).
            max_new_tokens: Number of tokens to append.
            temperature: Divides the logits before sampling. 0 selects the
                most likely token at every step (greedy decoding).

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If `temperature` is negative.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        for _ in range(max_new_tokens):
            # Only the last block_size tokens fit in the position table.
            logits, _ = self(idx[:, -self.block_size:])
            last_logits = logits[:, -1, :]
            if temperature == 0:
                # Dividing by zero would yield inf/NaN probabilities; take the argmax instead.
                next_token = last_logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(last_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return idx

# --- Build Model ---
# Instantiate the network from the config cell's hyperparameters, then move it
# to the selected device.
model = MIRASLanguageModel(
    vocab_size=vocab_size,
    d_model=N_EMBD,
    n_layers=N_LAYERS,
    block_size=BLOCK_SIZE,
    memory_type=MEMORY_TYPE,
    attentional_bias=ATTENTIONAL_BIAS,
    retention=RETENTION,
)
model = model.to(device)

# Total element count over everything returned by model.parameters().
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# === TRAIN ===

@torch.no_grad()
def estimate_loss(eval_iters=50):
    """Average the model loss over `eval_iters` random batches of each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        Dict with keys 'train' and 'val', each holding the mean loss as a
        0-dim tensor.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            batch_losses.append(model(X, Y)[1].item())
        out[split] = torch.tensor(batch_losses).mean()
    model.train()
    return out

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# The counter is called `step`, not `iter`: a module-level `iter` would shadow
# the builtin for every cell that runs afterwards in the same session.
for step in range(MAX_ITERS):
    # Periodically report the averaged train/val loss (including at step 0).
    if step % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # Cap the global gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

# One last measurement after the final update.
losses = estimate_loss()
print(f"Final: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

In [None]:
# === GENERATE ===

# Start from a single prompt token with id 0, sample a continuation, then
# decode the first (only) sequence of the batch back to text.
context = torch.tensor([[0]], dtype=torch.long, device=device)
generated = model.generate(context, MAX_NEW_TOKENS, temperature=TEMPERATURE)
print(decode(generated[0].tolist()))

# Upload to HuggingFace

Upload the trained MIRAS model to the Hugging Face Hub.
Make sure you have:
1. A HuggingFace account
2. An access token with write permissions (get one from https://huggingface.co/settings/tokens)
3. `huggingface_hub` installed (`pip install huggingface_hub`)

In [None]:
# === UPLOAD CONFIG ===

import os

HF_REPO_ID = "av-codes/miras-shakespeare"  # Change to your repo

# Do not paste an access token into the notebook: it would be saved with the
# file and is easy to leak. The token is read from the HF_TOKEN environment
# variable instead. When it is unset or empty the value is None, which lets
# huggingface_hub fall back to the credentials stored by `huggingface-cli login`
# (a literal placeholder string would be sent as a real token and rejected).
HF_TOKEN = os.environ.get("HF_TOKEN") or None

In [None]:
# === UPLOAD TO HUGGINGFACE ===

import json
import tempfile
import os
from huggingface_hub import HfApi, create_repo
import inspect

# Hub client used for the folder upload further down.
api = HfApi(token=HF_TOKEN)

# Make sure the target repository exists; exist_ok=True asks create_repo not to
# fail when it is already there. This step is best-effort: any error is printed
# and execution continues to the upload below.
try:
    create_repo(HF_REPO_ID, token=HF_TOKEN, exist_ok=True)
    print(f"Repository '{HF_REPO_ID}' ready")
except Exception as e:
    print(f"Note: {e}")

with tempfile.TemporaryDirectory() as tmpdir:
    # Hyperparameters needed to rebuild the architecture; written to both files below.
    hparams = {
        'vocab_size': vocab_size,
        'd_model': N_EMBD,
        'n_layers': N_LAYERS,
        'block_size': BLOCK_SIZE,
        'memory_type': MEMORY_TYPE,
        'attentional_bias': ATTENTIONAL_BIAS,
        'retention': RETENTION,
    }

    # model.pt: the weights followed by the hyperparameters.
    model_path = os.path.join(tmpdir, "model.pt")
    checkpoint = {'model_state_dict': model.state_dict(), **hparams}
    torch.save(checkpoint, model_path)

    # config.json: the same hyperparameters plus the character vocabulary,
    # which is what the loader needs to rebuild encode/decode.
    config = {'model_type': 'miras', **hparams, 'chars': chars}
    config_path = os.path.join(tmpdir, "config.json")
    with open(config_path, 'w') as f:
        f.write(json.dumps(config, indent=2))

    modeling_code = '''"""MIRAS Language Model - Custom Architecture"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

def l2_loss(pred, target):
    return 0.5 * ((pred - target) ** 2).sum(dim=-1)

def lp_loss(pred, target, p=3):
    return (torch.abs(pred - target) ** p).sum(dim=-1)

def huber_loss(pred, target, delta):
    diff = pred - target
    abs_diff = torch.abs(diff)
    return torch.where(abs_diff <= delta, 0.5 * diff ** 2, delta * (abs_diff - 0.5 * delta)).sum(dim=-1)

def l2_retention_update(W, grad, alpha, eta):
    return alpha * W - eta * grad

def kl_retention_update(log_W, grad, alpha, eta, c=1.0):
    log_W_new = alpha * log_W - eta * grad
    return log_W_new, c * F.softmax(log_W_new, dim=-1)

def elastic_net_update(W, grad, lambda_decay, zeta_lr, gamma_l1):
    z = lambda_decay * W - zeta_lr * grad
    return torch.sign(z) * F.relu(torch.abs(z) - gamma_l1)


class KeyValueProjection(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_K = nn.Linear(d_in, d_out, bias=False)
        self.W_V = nn.Linear(d_in, d_out, bias=False)
        self.W_Q = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        return self.W_K(x), self.W_V(x), self.W_Q(x)


class MIRASLayer(nn.Module):
    def __init__(self, d, memory_type='deep', attentional_bias='l2', retention='l2', expansion=4, p=3, q=4):
        super().__init__()
        self.d, self.memory_type, self.attentional_bias, self.retention = d, memory_type, attentional_bias, retention
        self.p, self.q = p, q
        self.kv_proj = KeyValueProjection(d, d)

        if memory_type == 'linear':
            self.register_buffer('M_init', torch.zeros(d, d))
        else:
            self.W1_init = nn.Parameter(torch.randn(d, d * expansion) * 0.02)
            self.W2_init = nn.Parameter(torch.randn(d * expansion, d) * 0.02)
            self.ln = nn.LayerNorm(d)

        if attentional_bias == 'huber':
            self.delta_proj = nn.Linear(d, 1)

        self.alpha = nn.Parameter(torch.ones(1) * 0.9)
        self.eta = nn.Parameter(torch.ones(1) * 0.1)
        if retention == 'kl':
            self.c = nn.Parameter(torch.ones(1))
        if retention == 'elastic':
            self.gamma = nn.Parameter(torch.ones(1) * 0.01)

    def memory_forward_deep(self, x, W1, W2):
        h = F.gelu(x @ W2.transpose(-2, -1))
        return x + self.ln(h @ W1.transpose(-2, -1))

    def get_loss(self, pred, target, x_t=None):
        if self.attentional_bias == 'l2':
            return l2_loss(pred, target).sum()
        elif self.attentional_bias == 'lp':
            return lp_loss(pred, target, self.p).sum()
        else:
            return huber_loss(pred, target, F.softplus(self.delta_proj(x_t))).sum()

    def apply_retention(self, W, grad, log_W=None):
        alpha, eta = torch.sigmoid(self.alpha), F.softplus(self.eta)
        if self.retention == 'l2':
            return l2_retention_update(W, grad, alpha, eta), None
        elif self.retention == 'kl':
            log_W = log_W if log_W is not None else torch.log(W.clamp(min=1e-10))
            log_W_new, W_new = kl_retention_update(log_W, grad, alpha, eta, self.c)
            return W_new, log_W_new
        else:
            return elastic_net_update(W, grad, alpha, eta, self.gamma), None

    def forward(self, x):
        k, v, q = self.kv_proj(x)
        B, T, D = k.shape
        outputs = []

        with torch.enable_grad():
            if self.memory_type == 'linear':
                M = self.M_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                for t in range(T):
                    k_t, v_t, q_t = k[:, t], v[:, t], q[:, t]
                    M_leaf = M.detach().requires_grad_(True)
                    pred = torch.einsum('bde,be->bd', M_leaf, k_t)
                    loss = self.get_loss(pred, v_t, x[:, t] if self.attentional_bias == 'huber' else None)
                    grad = torch.autograd.grad(loss, M_leaf)[0]
                    M, _ = self.apply_retention(M, grad)
                    outputs.append(torch.einsum('bde,be->bd', M, q_t))
            else:
                W1 = self.W1_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                W2 = self.W2_init.unsqueeze(0).expand(B, -1, -1).contiguous()
                log_W1, log_W2 = None, None
                if self.retention == 'kl':
                    W1, W2 = F.softmax(W1, dim=-1), F.softmax(W2, dim=-1)
                    log_W1, log_W2 = torch.log(W1.clamp(min=1e-10)), torch.log(W2.clamp(min=1e-10))

                for t in range(T):
                    k_t, v_t, q_t = k[:, t], v[:, t], q[:, t]
                    W1_leaf, W2_leaf = W1.detach().requires_grad_(True), W2.detach().requires_grad_(True)
                    pred = self.memory_forward_deep(k_t.unsqueeze(1), W1_leaf, W2_leaf).squeeze(1)
                    loss = self.get_loss(pred, v_t, x[:, t] if self.attentional_bias == 'huber' else None)
                    grad1, grad2 = torch.autograd.grad(loss, [W1_leaf, W2_leaf])
                    W1, log_W1 = self.apply_retention(W1, grad1, log_W1)
                    W2, log_W2 = self.apply_retention(W2, grad2, log_W2)
                    outputs.append(self.memory_forward_deep(q_t.unsqueeze(1), W1.detach(), W2.detach()).squeeze(1))

        return torch.stack(outputs, dim=1)


class MIRASBlock(nn.Module):
    def __init__(self, d_model, memory_type, attentional_bias, retention, ffn_mult=4):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.memory = MIRASLayer(d_model, memory_type, attentional_bias, retention)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_model * ffn_mult), nn.GELU(), nn.Linear(d_model * ffn_mult, d_model))

    def forward(self, x):
        x = x + self.memory(self.ln1(x))
        return x + self.ffn(self.ln2(x))


class MIRASLanguageModel(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, memory_type='deep', attentional_bias='l2', retention='l2', block_size=128):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(block_size, d_model)
        self.layers = nn.ModuleList([MIRASBlock(d_model, memory_type, attentional_bias, retention) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        self.token_embedding.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                torch.nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        x = self.token_embedding(idx) + self.position_embedding(torch.arange(T, device=idx.device))
        for layer in self.layers:
            x = layer(x)
        logits = self.lm_head(self.ln_f(x))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) if targets is not None else None
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0):
        for _ in range(max_new_tokens):
            logits, _ = self(idx[:, -self.block_size:])
            probs = F.softmax(logits[:, -1, :] / temperature, dim=-1)
            idx = torch.cat((idx, torch.multinomial(probs, num_samples=1)), dim=1)
        return idx


def load_miras_model(repo_id_or_path, device='cpu'):
    """Load a MIRAS model from HuggingFace Hub or local path."""
    import json
    from pathlib import Path

    if Path(repo_id_or_path).exists():
        base_path = Path(repo_id_or_path)
        config_path = base_path / "config.json"
        model_path = base_path / "model.pt"
    else:
        from huggingface_hub import hf_hub_download
        config_path = hf_hub_download(repo_id=repo_id_or_path, filename="config.json")
        model_path = hf_hub_download(repo_id=repo_id_or_path, filename="model.pt")

    with open(config_path) as f:
        config = json.load(f)

    model = MIRASLanguageModel(
        vocab_size=config['vocab_size'],
        d_model=config['d_model'],
        n_layers=config['n_layers'],
        memory_type=config['memory_type'],
        attentional_bias=config['attentional_bias'],
        retention=config['retention'],
        block_size=config['block_size'],
    )

    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    stoi = {ch: i for i, ch in enumerate(config['chars'])}
    itos = {i: ch for i, ch in enumerate(config['chars'])}
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])

    return model, encode, decode, config
'''

    modeling_path = os.path.join(tmpdir, "modeling_miras.py")
    with open(modeling_path, 'w') as f:
        f.write(modeling_code)

    # Model card rendered on the repository page. This is an f-string: the
    # hyperparameters, the repo id and the iteration count are filled in from
    # the notebook's current configuration when this cell runs.
    readme_content = f"""# MIRAS Language Model

A character-level language model trained on Shakespeare using the MIRAS (Memory-Integrated Recurrent Attention System) architecture.

## Model Details
- **Embedding dimension**: {N_EMBD}
- **Layers**: {N_LAYERS}
- **Block size**: {BLOCK_SIZE}
- **Memory type**: {MEMORY_TYPE}
- **Attentional bias**: {ATTENTIONAL_BIAS}
- **Retention**: {RETENTION}
- **Vocabulary size**: {vocab_size}

## Installation

```bash
pip install torch huggingface_hub
```

## Usage

### Quick Start

```python
from huggingface_hub import hf_hub_download
import torch

# Download files
for f in ["modeling_miras.py", "model.pt", "config.json"]:
    hf_hub_download(repo_id="{HF_REPO_ID}", filename=f, local_dir="./miras")

# Import and load
import sys
sys.path.insert(0, "./miras")
from modeling_miras import load_miras_model

model, encode, decode, config = load_miras_model("./miras")
model.eval()

# Generate text
context = torch.zeros((1, 1), dtype=torch.long)
output = model.generate(context, max_new_tokens=200, temperature=0.8)
print(decode(output[0].tolist()))
```

### Using the Helper Function

```python
from modeling_miras import load_miras_model

# Load directly from Hub
model, encode, decode, config = load_miras_model("{HF_REPO_ID}")

# Generate
import torch
context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(context, max_new_tokens=100)
print(decode(generated[0].tolist()))
```

## Files

- `model.pt` - Model weights and architecture config
- `config.json` - Full configuration including vocabulary
- `modeling_miras.py` - Complete model architecture code

## Training
Trained for {MAX_ITERS} iterations on the TinyShakespeare dataset.

## Architecture

MIRAS uses a novel memory-based attention mechanism with configurable:
- **Memory type**: `linear` (matrix memory) or `deep` (MLP memory)
- **Attentional bias**: `l2`, `lp`, or `huber` loss functions
- **Retention**: `l2`, `kl`, or `elastic` weight update rules
"""
    # Write the model card into the temporary folder that is uploaded below.
    readme_path = os.path.join(tmpdir, "README.md")
    with open(readme_path, 'w') as f:
        f.write(readme_content)

    # Push every file written into the temporary folder to the repository.
    # This must run inside the `with` block: the folder is removed on exit.
    api.upload_folder(
        folder_path=tmpdir,
        repo_id=HF_REPO_ID,
        token=HF_TOKEN,
    )

# Confirmation message. The check mark had been stored as mojibake (its UTF-8
# bytes decoded as Windows-1252); it is restored to the intended character.
print(f"✓ Model uploaded to https://huggingface.co/{HF_REPO_ID}")
print("  Includes: model.pt, config.json, modeling_miras.py, README.md")