In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from transformers import AutoTokenizer


In [12]:
class RotaryEmbedding(nn.Module):
    """Builds a per-position table of cosine and sine values.

    One inverse frequency ``1 / 10000 ** (i / dim)`` is created for every
    second index ``i`` in ``range(0, dim)``.

    Args:
        dim: Value used to derive the inverse frequencies (see above).
    """

    def __init__(self, dim):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        # Registered as a buffer so that ``model.to(device)`` / ``.cuda()``
        # move it together with the module. ``persistent=False`` keeps it out
        # of ``state_dict``, so existing checkpoints load unchanged.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, x):
        """Return the cos/sin table for the sequence length of ``x``.

        Args:
            x: Tensor whose second dimension (``x.shape[1]``) is taken as the
                sequence length. Only its shape and device are read.

        Returns:
            A float32 tensor of shape ``(seq_len, 2 * len(inv_freq))`` on
            ``x.device``: cosines fill the first half of the last dimension,
            sines the second half.
        """
        seq_len = x.shape[1]
        # Always compute in float32 on the input's device, even if the module
        # itself was cast to another dtype or lives on a different device.
        inv_freq = self.inv_freq.to(device=x.device, dtype=torch.float32)
        positions = torch.arange(seq_len, device=x.device, dtype=torch.float32)
        sinusoid = torch.einsum('n , d -> n d', positions, inv_freq)
        return torch.cat([sinusoid.cos(), sinusoid.sin()], dim=-1)


In [13]:
class ScaledDotProductAttention(nn.Module):
    """Softmax attention with scores scaled by ``1 / sqrt(feature_dim)``."""

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key`` and return the weighted ``value``.

        Args:
            query: Tensor whose last dimension is the feature size used for
                scaling.
            key: Tensor whose last two dimensions are transposed and
                multiplied with ``query``.
            value: Tensor multiplied by the attention weights.
            mask: Optional tensor; entries equal to 0 mark positions that get
                no attention weight. Two singleton dimensions are inserted at
                positions 1 and 2 before broadcasting.
                NOTE(review): that expansion presumably expects a
                ``(batch, key_len)`` padding mask combined with 4-D
                ``(batch, heads, len, dim)`` inputs - confirm with callers.

        Returns:
            ``softmax(query @ key^T / sqrt(d)) @ value``.
        """
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        if mask is not None:
            blocked = mask.unsqueeze(1).unsqueeze(2) == 0
            # Use the dtype's own minimum instead of a literal -1e9: the
            # literal is not representable in float16 and makes masked_fill
            # raise an overflow error under half precision.
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, value)


In [14]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail early with a clear message; otherwise the mismatch only shows
        # up later as an opaque shape error from ``view`` inside ``forward``.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.
            mask: Optional tensor; entries equal to 0 are excluded from
                attention. Accepted layouts:
                * 4-D: used as given and broadcast against the
                  ``(batch, heads, seq_len, seq_len)`` scores;
                * 3-D: a head dimension is inserted at position 1;
                * any other rank (e.g. a 2-D ``(batch, seq_len)`` padding
                  mask): singleton dimensions are inserted at positions 1
                  and 2, as before.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, embed_dim = x.size()
        # The fused projection is viewed per head, then split into thirds
        # along the last dimension: each head owns one contiguous
        # 3 * head_dim slice of the projection output.
        qkv = self.qkv_proj(x).view(batch_size, seq_len, self.num_heads, 3 * self.head_dim)
        query, key, value = torch.chunk(qkv, 3, dim=-1)

        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        query = query.permute(0, 2, 1, 3)
        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)

        # Scaled dot-product attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            if mask.dim() == 4:
                expanded = mask
            elif mask.dim() == 3:
                expanded = mask.unsqueeze(1)
            else:
                expanded = mask.unsqueeze(1).unsqueeze(2)
            # The dtype minimum is used instead of a literal -1e9, which is
            # not representable in float16 and makes masked_fill raise.
            scores = scores.masked_fill(expanded == 0, torch.finfo(scores.dtype).min)
        attn = F.softmax(scores, dim=-1)

        # Weighted sum of values, then merge the heads back together
        attn_output = torch.matmul(attn, value)
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, embed_dim)
        return self.out_proj(attn_output)


In [5]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP with GELU activation and dropout.

    Args:
        embed_dim: Size of the input and output features.
        expansion_factor: The hidden layer has
            ``expansion_factor * embed_dim`` units.
        dropout: Dropout probability applied after the activation.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, embed_dim, expansion_factor=4, dropout=0.1):
        super().__init__()
        hidden_dim = expansion_factor * embed_dim
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc2(dropout(gelu(fc1(x))))``; the last dimension stays ``embed_dim``."""
        x = self.fc1(x)
        x = F.gelu(x)
        x = self.dropout(x)
        return self.fc2(x)


In [6]:
class TransformerBlock(nn.Module):
    """Residual block: normalised self-attention, then a normalised feed-forward.

    Each sub-layer receives a layer-normalised copy of its input and its
    result is added back onto the un-normalised input.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Attention sub-layer and its input normalisation
        self.ln1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)
        # Feed-forward sub-layer and its input normalisation
        self.ln2 = nn.LayerNorm(normalized_shape=embed_dim)
        self.ffn = FeedForward(embed_dim=embed_dim)

    def forward(self, x, mask=None):
        """Run both residual sub-layers; ``mask`` is forwarded to the attention only."""
        attended = self.mha(self.ln1(x), mask)
        after_attention = x + attended

        transformed = self.ffn(self.ln2(after_attention))
        return after_attention + transformed


In [7]:
class LLaMA(nn.Module):
    """Token embedding, a stack of ``TransformerBlock`` layers, and a vocabulary head.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embed_dim: Width of the hidden representation.
        num_heads: Passed to every ``TransformerBlock``.
        num_layers: How many ``TransformerBlock`` layers are stacked.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.ln = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position logits.

        ``mask`` is handed unchanged to every layer.
        """
        hidden = self.embedding(x)
        for block in self.layers:
            hidden = block(hidden, mask)
        normalised = self.ln(hidden)
        return self.fc(normalised)


In [15]:
# Reproducibility: fix the RNG used for weight initialisation and dropout
torch.manual_seed(42)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Hyperparameters
# The embedding table and output head must cover every id the tokenizer can
# emit, so the vocabulary size is read from the tokenizer instead of being
# hard-coded (a smaller hard-coded value makes nn.Embedding fail with an
# index-out-of-range error for high token ids).
vocab_size = len(tokenizer)
embed_dim = 512
num_heads = 8
num_layers = 6

# Model
model = LLaMA(vocab_size, embed_dim, num_heads, num_layers)

# Example input
input_text = ["Deep learning is amazing", "Transformers are powerful"]
encoded = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = encoded.input_ids
# 1 for real tokens, 0 for padding; forwarded to the model so that padded
# positions are not attended to.
attention_mask = encoded.attention_mask

# Training setup
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Forward pass
optimizer.zero_grad()  # clear gradients left over from a previous run of this cell
output = model(inputs, attention_mask)

# Next-token objective: the logits at position t are scored against the token
# at position t + 1 (scoring them against position t only teaches the model
# to copy its input).
# NOTE(review): the mask passed to the model here is only a padding mask, so
# every position can still attend to later tokens. A causal mask is needed
# before this loss is a meaningful language-modelling objective.
shifted_logits = output[:, :-1, :]
shifted_targets = inputs[:, 1:]
loss = criterion(shifted_logits.reshape(-1, vocab_size), shifted_targets.reshape(-1))

loss.backward()
optimizer.step()
