
We are building a lightweight code completion model (similar to a tiny GitHub Copilot). We have tokenized source code data where integers represent specific syntax tokens (e.g., `def`, `return`, `int`, etc.).

Your task is to implement the training pipeline for a 'Next Token Prediction' task. We have provided a synthetic dataset class. You need to implement a simple model and the training loop.

**Specific Requirements:**

1.  **Data Handling:** You must treat this as a Causal Language Modeling (CLM) task. The model predicts the next token.
    
2.  **The Model and Training Loop:** Implement the model, the forward pass, loss calculation, backward pass, and parameter update.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class SyntheticCodeDataset(Dataset):
    """Fixed-seed synthetic corpus of random token-id sequences.

    Holds a ``(num_samples, seq_len)`` integer tensor in ``self.data`` whose
    entries lie in ``[0, vocab_size)``. Indexing returns one row, i.e. one
    tokenized "source code" sequence.
    """

    def __init__(self, num_samples=1000, seq_len=10, vocab_size=100):
        # Seeding the global RNG makes the generated corpus reproducible.
        # Note that it also resets the RNG state seen by any code that runs
        # after the dataset is constructed.
        torch.manual_seed(42)
        shape = (num_samples, seq_len)
        self.data = torch.randint(low=0, high=vocab_size, size=shape)

    def __len__(self):
        # The first dimension of the tensor is the sample count.
        return self.data.size(0)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample


# Size of the token vocabulary; used by both the dataset and the model.
vocab_size = 100
dataset = SyntheticCodeDataset(vocab_size=vocab_size)

# Serve the dataset as shuffled mini-batches of 32 sequences.
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

class TinyCodeModel(nn.Module):
    """
    A simple feed-forward network for token prediction.
    Architecture: Embedding -> Linear -> ReLU -> Linear

    Every layer acts on the last dimension only, so each position is
    processed independently: the logits at a position depend solely on the
    token at that position.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            output logit vector.
        embed_dim: Dimensionality of the token embeddings.
        hidden_dim: Width of the hidden layer.
    """
    def __init__(self, vocab_size, embed_dim=32, hidden_dim=64):
        super().__init__()

        # The layer classes live in torch.nn; referencing them unqualified
        # raises NameError because they are never imported by name.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear_1 = nn.Linear(in_features=embed_dim, out_features=hidden_dim)
        self.linear_2 = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids (b, num_tokens) to logits (b, num_tokens, vocab_size)."""
        x = self.emb(x)       # (b, num_tokens, embed_dim)
        x = self.linear_1(x)  # (b, num_tokens, hidden_dim)
        x = F.relu(x)         # (b, num_tokens, hidden_dim)
        x = self.linear_2(x)  # (b, num_tokens, vocab_size)

        return x

def train_code_completion_model(model, dataloader, epochs=3, learning_rate=0.001):
    """
    Train ``model`` on Next Token Prediction (causal language modeling).

    For each batch of token sequences the model is fed every token except the
    last and is scored on predicting every token except the first, so the
    output at position ``t`` is compared against the token at ``t + 1``.
    The average batch loss is printed once per epoch.

    Args:
        model: Module mapping integer token ids of shape (batch, num_tokens)
            to logits of shape (batch, num_tokens, vocab_size).
        dataloader: Iterable yielding integer tensors of shape
            (batch, seq_len) with seq_len >= 2.
        epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate passed to the Adam optimizer.

    Raises:
        ValueError: If a batch has fewer than two tokens per sequence (no
            input/target pair can be formed) or if ``dataloader`` yields no
            batches in an epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Make sure train-only behaviour (dropout, batch-norm updates) is active
    # even if the caller previously switched the model to eval mode.
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        for sequences in dataloader:
            # sequences shape: [batch_size, seq_len]
            if sequences.size(1) < 2:
                raise ValueError(
                    "each sequence needs at least 2 tokens to form an "
                    "input/target pair"
                )

            # Shift by one: e.g. [10, 20, 30] -> inputs [10, 20], targets [20, 30].
            inputs = sequences[:, :-1]   # [batch_size, seq_len - 1]
            targets = sequences[:, 1:]   # [batch_size, seq_len - 1]

            optimizer.zero_grad()

            logits = model(inputs)       # [batch_size, seq_len - 1, vocab_size]

            # CrossEntropyLoss expects (N, C) logits with (N,) class indices,
            # so merge the batch and time dimensions of both tensors.
            loss = loss_fn(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
            )

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


# Build the model over the shared vocabulary, then train it with the default
# number of epochs and learning rate.
my_model = TinyCodeModel(vocab_size=vocab_size)
train_code_completion_model(model=my_model, dataloader=data_loader)


