In [1]:
# Allow imports from the project directory
import sys
sys.path.insert(0, '../')

In [4]:
from data.glue_dataloader import gen_dataloaders
batch_size = 32
max_length = 128

train_dataloader, val_dataloader, vocab_size = gen_dataloaders(batch_size, max_length)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 1821/1821 [00:00<00:00, 14443.86 examples/s]


In [7]:
import tqdm
from model.tinygrad_impl.language_model import TinygradLanguageModel, TinygradLanguageModelConfig
from model.tinygrad_impl.transformer import TransformerConfig, TransformerDecoderLayerConfig
from model.tinygrad_impl.mlp import MLPConfig
from tinygrad import Tensor
import tinygrad


def step(model, X, Y, optim):
    optim.zero_grad()
    # loss = criterion(output.view(-1, tokenizer.vocab_size), labels.flatten()
    # TODO: debug tinygrad model
    loss = model(X).cross_entropy(Y).backward()
    optim.step()
    return loss

def do_train():
    model = TinygradLanguageModel(TinygradLanguageModelConfig(
        vocab_size=vocab_size,
        context_length=max_length,
        embedding_dim=8,
        transformer_config=TransformerConfig(
        num_decoder_layers=3,
        decoder_layer_config=TransformerDecoderLayerConfig(
            d_model=8,
            n_head=2,
            dim_feedforward=32,
            dropout_p=0.1,
            )
        ),
        mlp_config=MLPConfig(
            d_model=8,
            d_hidden=16,
            num_layers=3,
            dropout_p=0.1,
        )
    ))
    optimizer = tinygrad.nn.optim.Adam(tinygrad.nn.state.get_parameters(model))

    for epoch in range(2):
        Tensor.training = True  # makes dropout work
        train_loss = 0.0
        num_train_examples = 0
        num_batches = 0
        for batch in tqdm.tqdm(train_dataloader):
            inputs = batch[:, :-1]
            labels = batch[:, 1:]

            loss = step(model, inputs, labels, optimizer)
            
            train_loss += loss.item()
            num_train_examples += labels.numel()
            loss.backward()
            optimizer.step()
            num_batches += 1
            if num_batches > 5:
                break

        train_loss /= num_train_examples
        print(f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}')

        Tensor.training = False
        total_correct = 0
        num_val_examples = 0
        num_batches = 0
        for batch in val_dataloader:
            inputs = batch[:, :-1]
            labels = batch[:, 1:]
            output = model(inputs)
            _, predicted = output.max(dim=2)
            total_correct += (predicted == labels).sum().item()
            num_val_examples += labels.numel()
            num_batches += 1
            if num_batches > 5:
                break
            accuracy = total_correct / num_val_examples
            print(f'Epoch {epoch+1}, Validation Accuracy: {accuracy:.4f}')

do_train()


  0%|          | 0/2105 [00:00<?, ?it/s]

torch.Size([32, 127]) torch.Size([32, 127])





Exception: stop here