In [120]:
!pip install tiktoken



In [121]:
import tiktoken
import torch


class TokenizationLayer:
    def __init__(self, model_name="cl100k_base", device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        Tokenization Layer using tiktoken.

        Args:
            model_name (str): Name of the tokenizer model. Default is 'cl100k_base'.
            device (str): Device to run on ('cuda' or 'cpu').
        """
        self.device = torch.device(device)

        # Load tokenizer
        self.tokenizer = tiktoken.get_encoding(model_name)
        self.vocab_size = self.tokenizer.n_vocab

        # Define custom tokens (CLS and SEP)
        self.cls_token = "<|cls|>"
        self.sep_token = "<|sep|>"

        # Manually assign token IDs for custom tokens
        try:
            self.cls_token_id = self.tokenizer.encode(self.cls_token)[0]
        except KeyError:
            print(f"Warning: {self.cls_token} not found in tokenizer. Using eot_token instead.")
            self.cls_token_id = self.tokenizer.eot_token

        try:
            self.sep_token_id = self.tokenizer.encode(self.sep_token)[0]
        except KeyError:
            print(f"Warning: {self.sep_token} not found in tokenizer. Using eot_token instead.")
            self.sep_token_id = self.tokenizer.eot_token

        # Use pad_token if available, else use eot_token
        self.pad_token_id = self.tokenizer.eot_token  # Default to EOT token
        if hasattr(self.tokenizer, "pad_token"):
            self.pad_token_id = self.tokenizer.pad_token

    def tokenize(self, texts, max_length=512, add_special_tokens=True):
        """
        Tokenizes input texts into token IDs with optional padding, truncation, and special tokens.

        Args:
            texts (str or List[str]): Input text or list of texts to be tokenized.
            max_length (int): Maximum sequence length. Default is 512.
            add_special_tokens (bool): Whether to add special tokens (CLS, SEP). Default is True.

        Returns:
            torch.Tensor: Token IDs with shape [batch_size, max_length] on the specified device.
        """
        if isinstance(texts, str):  # Handle single text input
            texts = [texts]

        token_ids = []
        for text in texts:
            # Tokenize text
            tokens = self.tokenizer.encode(text)

            tokens = [min(token, self.vocab_size - 1) for token in tokens]

            # Add special tokens (CLS and SEP)
            if add_special_tokens:
                tokens = [self.cls_token_id] + tokens + [self.sep_token_id]

            # Truncate if necessary
            tokens = tokens[:max_length]

            # Pad if necessary
            if len(tokens) < max_length:
                tokens += [self.pad_token_id] * (max_length - len(tokens))

            token_ids.append(tokens)

        return torch.tensor(token_ids, dtype=torch.long, device=self.device)  # Send to GPU

    def detokenize(self, tokens):
        """
        Converts token IDs back to text.

        Args:
            tokens (List[int] or torch.Tensor): Tokenized input.

        Returns:
            str or List[str]: Decoded text or list of texts.
        """
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().tolist()  # Convert to list and move to CPU if tensor

        if isinstance(tokens[0], list):  # Handle batch input
            return [self.tokenizer.decode(t) for t in tokens]
        else:  # Handle single input
            return self.tokenizer.decode(tokens)


# ✅ CUDA Check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using Device:", device)

# ✅ cuDNN Acceleration Check
if torch.backends.cudnn.is_available():
    print("cuDNN Enabled:", torch.backends.cudnn.enabled)

# ✅ Testing Tokenization Layer
tokenizer_layer = TokenizationLayer(device=device)

# Single Text Example
text = "Bhai, hum Layer 1 ka Tokenization implement kar rahe hain!"
tokens = tokenizer_layer.tokenize(text, max_length=10, add_special_tokens=True)  # Shape: [1, max_length]
decoded_text = tokenizer_layer.detokenize(tokens)

# Batch Text Example
texts = [
    "Bhai, hum Layer 1 ka Tokenization implement kar rahe hain!",
    "Ye code ekdam perfect hai!"
]
batch_tokens = tokenizer_layer.tokenize(texts, max_length=10, add_special_tokens=True)  # Shape: [batch_size, max_length]
decoded_texts = tokenizer_layer.detokenize(batch_tokens)

# ✅ Outputs
print("\nSingle Text Example:")
print("Input Text:", text)
print("Tokenized Output (Shape: {}):".format(tokens.shape), tokens)
print("Decoded Text:", decoded_text)

print("\nBatch Text Example:")
print("Input Texts:", texts)
print("Tokenized Output (Shape: {}):".format(batch_tokens.shape), batch_tokens)
print("Decoded Texts:", decoded_texts)


Using Device: cpu
cuDNN Enabled: True

Single Text Example:
Input Text: Bhai, hum Layer 1 ka Tokenization implement kar rahe hain!
Tokenized Output (Shape: torch.Size([1, 10])): tensor([[   27,    33, 26279,    11,  2854, 23570,   220,    16, 16909,  9857]])
Decoded Text: ['<Bhai, hum Layer 1 ka Token']

Batch Text Example:
Input Texts: ['Bhai, hum Layer 1 ka Tokenization implement kar rahe hain!', 'Ye code ekdam perfect hai!']
Tokenized Output (Shape: torch.Size([2, 10])): tensor([[    27,     33,  26279,     11,   2854,  23570,    220,     16,  16909,
           9857],
        [    27,  87575,   2082,  27955,  15770,   4832,  47151,      0,     27,
         100257]])
Decoded Texts: ['<Bhai, hum Layer 1 ka Token', '<Ye code ekdam perfect hai!<<|endoftext|>']


In [122]:
import torch
import torch.nn as nn
from torch.amp import autocast
# from TokenizationLayer import TokenizationLayer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, padding_idx=0, dropout=0.1):
        """
        Token Embedding Layer using PyTorch nn.Embedding.

        Args:
            vocab_size (int): Number of unique tokens in vocabulary.
            embed_dim (int): Dimension of each token embedding.
            padding_idx (int, optional): Index of padding token. Default: 0.
            dropout (float, optional): Dropout probability. Default: 0.1.
        """
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=padding_idx  # Helps handle padding tokens efficiently
        )
        self.dropout = nn.Dropout(dropout)  # Dropout for regularization
        nn.init.xavier_uniform_(self.embedding.weight)  # Better initialization

    def forward(self, input_tokens):
        """
        Forward pass to convert token IDs to embeddings.

        Args:
            input_tokens (torch.Tensor): Tensor of shape (batch_size, seq_len).

        Returns:
            torch.Tensor: Token embeddings of shape (batch_size, seq_len, embed_dim).
        """
        # Ensure input is on the correct device
        input_tokens = input_tokens.to(self.embedding.weight.device)

        # Apply embedding and dropout

        embeddings = self.embedding(input_tokens)
        embeddings = self.dropout(embeddings)  # Apply dropout

        return embeddings


# ✅ Hyperparameters
vocab_size = 10000  # Size of vocabulary
embed_dim = 512  # Embedding dimension per token
batch_size = 8  # Number of sequences processed in parallel
seq_len = 128  # Max sequence length



tokenizer_layer = TokenizationLayer()
# ✅ Initialize Token Embedding Layer
# ✅ Initialize Token Embedding Layer with correct vocab size
token_embedding = TokenEmbedding(
    vocab_size=tokenizer_layer.vocab_size,  # Use actual vocab size
    embed_dim=512
).to(device)

# ✅ Example Input (Random Token IDs)
input_tokens = torch.randint(0, vocab_size, (batch_size, seq_len), device=device, dtype=torch.long)

# ✅ Apply Token Embedding
output_embeddings = token_embedding(input_tokens)

# ✅ Debugging Info
print("\n✅ Input Tokens Shape:", input_tokens.shape)  # Expected: (8, 128)
print("✅ Token Embedding Output Shape:", output_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Token Embedding Output dtype:", output_embeddings.dtype)  # Should be float16 if AMP enabled





✅ Input Tokens Shape: torch.Size([8, 128])
✅ Token Embedding Output Shape: torch.Size([8, 128, 512])
✅ Token Embedding Output dtype: torch.float32


In [123]:
import torch
import torch.nn as nn


device = 'cuda' if torch.cuda.is_available() else 'cpu'




class RotaryPositionalEncoding(nn.Module):
    def __init__(self, embed_dim):
        """
        Rotary Positional Encoding (RoPE) for transformers.

        Args:
            embed_dim (int): Dimension of token embeddings.
        """
        super(RotaryPositionalEncoding, self).__init__()
        self.embed_dim = embed_dim

        # Compute inverse frequency terms for RoPE
        inv_freq = 1.0 / (10000 ** (torch.arange(0, embed_dim, 2, dtype=torch.float32) / embed_dim))
        self.register_buffer("inv_freq", inv_freq)  # Store as buffer

    def rotate_half(self, x):
        """
        Rotates the last dimension by 90 degrees.

        Args:
            x (torch.Tensor): Input tensor of shape (..., embed_dim).

        Returns:
            torch.Tensor: Rotated tensor of same shape.
        """
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def forward(self, x):
        """
        Forward pass for RoPE.

        Args:
            x (torch.Tensor): Token embeddings of shape (batch_size, seq_len, embed_dim).

        Returns:
            torch.Tensor: Rotated embeddings with positional information.
        """
        batch_size, seq_len, embed_dim = x.shape

        # Generate position indices
        positions = torch.arange(seq_len, dtype=torch.float32, device=x.device).unsqueeze(1)

        # Compute rotation frequencies
        freqs = torch.matmul(positions, self.inv_freq.unsqueeze(0))  # Shape: [seq_len, embed_dim//2]
        emb = torch.cat((freqs, freqs), dim=-1)  # Shape: [seq_len, embed_dim]

        # Compute cos and sin embeddings
        cos_emb, sin_emb = emb.cos().unsqueeze(0), emb.sin().unsqueeze(0)  # Shape: [1, seq_len, embed_dim]

        # Apply RoPE transformation
        x_rotated = (x * cos_emb) + (self.rotate_half(x) * sin_emb)

        return x_rotated


# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512

# ✅ Initialize RoPE
rotary_pe = RotaryPositionalEncoding(embed_dim).to(device)

# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply RoPE
output_embeddings = rotary_pe(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ RoPE Output Shape:", output_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ RoPE Output dtype:", output_embeddings.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ RoPE Output Shape: torch.Size([8, 128, 512])
✅ RoPE Output dtype: torch.float32


In [124]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'



# ✅ Optimized Layer Normalization (Pre-LN)
# Define LayerNorm (if not already defined)
class LayerNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(embed_dim, dtype=torch.float32))  # Learnable scale
        self.beta = nn.Parameter(torch.zeros(embed_dim, dtype=torch.float32))  # Learnable shift
        self.eps = eps  # Small value for numerical stability

    def forward(self, x):
        # Apply Layer Normalization
        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta, self.eps)

# Define ResidualConnection
class ResidualConnection(nn.Module):
    def __init__(self, embed_dim, dropout=0.1):
        super(ResidualConnection, self).__init__()
        self.norm = LayerNorm(embed_dim)  # Pre-LayerNorm
        self.dropout = nn.Dropout(dropout)  # Regularization

    def forward(self, x, sublayer):
        # Apply Pre-LayerNorm, sublayer, and residual connection
        return x + self.dropout(sublayer(self.norm(x)))

# Define MultiHeadSelfAttention (if not already defined)
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads  # Ensure num_heads is set as an attribute
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "Embedding dimension must be divisible by number of heads"

        # Fused QKV Projection (Single Linear Layer for Efficiency)
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim, dtype=torch.float32)
        self.out_proj = nn.Linear(embed_dim, embed_dim, dtype=torch.float32)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Compute Q, K, V in a single pass
        qkv = self.qkv_proj(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        Q, K, V = qkv.unbind(dim=2)  # Split into separate tensors

        # Reshape for multi-head attention
        Q = Q.transpose(1, 2)  # Shape: (batch, num_heads, seq_len, head_dim)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        # Flash Attention (Optimized Scaled Dot-Product Attention)
        if mask is not None:
            mask = mask.to(dtype=torch.float16, device=x.device)  # Ensure mask is on the correct device and dtype
        output = F.scaled_dot_product_attention(Q, K, V, attn_mask=mask)

        # Reshape back to original shape
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

        # Apply output projection
        return self.out_proj(output)



# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
num_heads = 8

# ✅ Initialize Layers
self_attention = MultiHeadSelfAttention(embed_dim, num_heads).to(device)
residual_connection = ResidualConnection(embed_dim).to(device)

# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Causal Mask (For Decoder)
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.float32, device=device)).unsqueeze(0).unsqueeze(0)

# ✅ Apply Masked Multi-Head Self-Attention with Residual Connection
output_embeddings = residual_connection(input_embeddings, self_attention)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Mask Shape:", mask.shape)  # Expected: (1, 1, 128, 128)
print("✅ Self-Attention Output Shape:", output_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Output dtype:", output_embeddings.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Mask Shape: torch.Size([1, 1, 128, 128])
✅ Self-Attention Output Shape: torch.Size([8, 128, 512])
✅ Output dtype: torch.float32


In [125]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#from MultiHeadSelfAttention import MultiHeadSelfAttention



device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ✅ Optimized Layer Normalization (Pre-LN)
class LayerNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(embed_dim, dtype=torch.float32))  # Learnable scale
        self.beta = nn.Parameter(torch.zeros(embed_dim, dtype=torch.float32))  # Learnable shift
        self.eps = eps  # Small value for numerical stability

    def forward(self, x):
        # Apply Layer Normalization
        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta, self.eps)

# ✅ Optimized Feedforward Network (FFN)
class FeedforwardNetwork(nn.Module):
    def __init__(self, embed_dim, hidden_dim, dropout=0.1):
        """
        Feedforward Network with GELU Activation and Dropout.
        """
        super(FeedforwardNetwork, self).__init__()
        self.linear1 = nn.Linear(embed_dim, hidden_dim, dtype=torch.float32)  # Expansion
        self.linear2 = nn.Linear(hidden_dim, embed_dim, dtype=torch.float32)  # Compression
        self.dropout = nn.Dropout(dropout)  # Regularization
        self.activation = nn.GELU()  # Activation function

    def forward(self, x):
        x = self.linear1(x)  # Expand dimensions
        x = self.activation(x)  # Apply GELU
        x = self.dropout(x)  # Apply dropout
        x = self.linear2(x)  # Compress dimensions
        return x

# ✅ Optimized Residual Connection with Pre-Norm
class ResidualConnection(nn.Module):
    def __init__(self, embed_dim, dropout=0.1):
        """
        Residual Connection with Pre-Norm (Better Stability).
        """
        super(ResidualConnection, self).__init__()
        self.norm = LayerNorm(embed_dim)  # Pre-LayerNorm
        self.dropout = nn.Dropout(dropout)  # Regularization

    def forward(self, x, sublayer):
        # Apply Pre-LayerNorm, sublayer, and residual connection
        return x + self.dropout(sublayer(self.norm(x)))



# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
hidden_dim = 2048  # FFN hidden dimension
num_heads = 8

# ✅ Initialize Layers
self_attention = MultiHeadSelfAttention(embed_dim, num_heads).to(device)
ffn = FeedforwardNetwork(embed_dim, hidden_dim).to(device)
residual_connection1 = ResidualConnection(embed_dim).to(device)
residual_connection2 = ResidualConnection(embed_dim).to(device)

# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Causal Mask (For Decoder)
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.float32, device=device)).unsqueeze(0).unsqueeze(0)

# ✅ Apply Masked Multi-Head Self-Attention with Residual Connection
attention_output = residual_connection1(input_embeddings, self_attention)

# ✅ Apply Feedforward Network with Residual Connection
ffn_output = residual_connection2(attention_output, ffn)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Attention Output Shape:", attention_output.shape)  # Expected: (8, 128, 512)
print("✅ FFN Output Shape:", ffn_output.shape)  # Expected: (8, 128, 512)
print("✅ Output dtype:", ffn_output.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Attention Output Shape: torch.Size([8, 128, 512])
✅ FFN Output Shape: torch.Size([8, 128, 512])
✅ Output dtype: torch.float32


In [126]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#from FeedforwardNetwork import LayerNorm,MultiHeadSelfAttention,ResidualConnection,FeedforwardNetwork

device = 'cuda' if torch.cuda.is_available() else 'cpu'





# ✅ Layer 1: Sublayers
class Layer1(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.1):
        super(Layer1, self).__init__()
        # Sublayers
        self.pre_layer_norm = LayerNorm(embed_dim)  # Pre-LayerNorm
        self.self_attention = MultiHeadSelfAttention(embed_dim, num_heads)  # Self-Attention
        self.residual1 = ResidualConnection(embed_dim, dropout)  # Residual Connection 1
        self.layer_norm1 = LayerNorm(embed_dim)  # LayerNorm after Self-Attention
        self.ffn = FeedforwardNetwork(embed_dim, hidden_dim, dropout)  # Feedforward Network
        self.residual2 = ResidualConnection(embed_dim, dropout)  # Residual Connection 2
        self.layer_norm2 = LayerNorm(embed_dim)  # LayerNorm after FFN

    def forward(self, x):
        # Sublayer 1: Self-Attention with Residual Connection
        x = self.residual1(x, self.self_attention)  # Self-Attention + Residual
        x = self.layer_norm1(x)  # LayerNorm after Self-Attention

        # Sublayer 2: Feedforward Network with Residual Connection
        x = self.residual2(x, self.ffn)  # FFN + Residual
        x = self.layer_norm2(x)  # LayerNorm after FFN

        return x


# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
hidden_dim = 2048  # FFN hidden dimension
num_heads = 8

# ✅ Initialize Layer 1
layer1 = Layer1(embed_dim, hidden_dim, num_heads).to(device)

# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply Layer 1
output = layer1(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Layer 1 Output Shape:", output.shape)  # Expected: (8, 128, 512)
print("✅ Output dtype:", output.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Layer 1 Output Shape: torch.Size([8, 128, 512])
✅ Output dtype: torch.float32


In [127]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class Layer2WithMemory(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_heads, memory_size, dropout=0.1):
        super(Layer2WithMemory, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads  # Ensure num_heads is set as an attribute
        self.memory_size = memory_size

        # Sublayers
        self.pre_layer_norm = LayerNorm(embed_dim)
        self.self_attention = MultiHeadSelfAttention(embed_dim, num_heads)  # Pass num_heads here
        self.residual1 = ResidualConnection(embed_dim, dropout)
        self.layer_norm1 = LayerNorm(embed_dim)

        # Memory Module
        self.memory_bank = nn.Parameter(torch.zeros(memory_size, embed_dim, dtype=torch.float32))  # Initialize with zeros
        self.memory_norm = LayerNorm(embed_dim)
        self.memory_attention = MultiHeadSelfAttention(embed_dim, num_heads)  # Pass num_heads here

        # Learnable Memory Update Mechanism
        self.memory_gate = nn.Linear(embed_dim, 1)  # Learnable gate
        self.sigmoid = nn.Sigmoid()

        # Feedforward Network
        self.ffn = FeedforwardNetwork(embed_dim, hidden_dim, dropout)
        self.residual2 = ResidualConnection(embed_dim, dropout)
        self.layer_norm2 = LayerNorm(embed_dim)

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.shape
        num_heads = self.memory_attention.num_heads
        head_dim = embed_dim // num_heads

        # Sublayer 1: Self-Attention with Residual Connection
        x = self.residual1(x, self.self_attention)
        x = self.layer_norm1(x)

        # Step 1: Memory Read
        memory_bank = self.memory_bank.unsqueeze(0).expand(batch_size, -1, -1)  # (batch, memory_size, embed_dim)
        memory_bank = memory_bank.type(x.dtype)
        memory_bank = memory_bank.view(batch_size, self.memory_size, num_heads, head_dim)  # Reshape
        memory_bank = memory_bank.permute(0, 2, 1, 3)  # (batch, num_heads, memory_size, head_dim)

        Q = x.view(batch_size, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)  # (batch, num_heads, seq_len, head_dim)

        # Fixed Memory Attention
        memory_output = F.scaled_dot_product_attention(Q, memory_bank, memory_bank)
        memory_output = memory_output.permute(0, 2, 1, 3).reshape(batch_size, seq_len, embed_dim)  # Reshape back

        # Learnable Gating for Memory Update
        gate_weight = self.sigmoid(self.memory_gate(x))
        memory_output = gate_weight * memory_output

        x = x + memory_output  # Integrate memory output

        # Step 2: Memory Write (Dynamic Update)
        self.update_memory(x)

        # Sublayer 2: Feedforward Network with Residual Connection
        x = self.residual2(x, self.ffn)
        x = self.layer_norm2(x)

        return x  # ✅ Fixed Output

    def update_memory(self, x):
        with torch.no_grad():  # Ensure memory update doesn't affect gradients
            # Shift memory bank (oldest memory slot is replaced)
            updated_memory_bank = torch.roll(self.memory_bank, shifts=-1, dims=0)

            # Aggregate information from current input
            update_value = x.mean(dim=1)  # Mean pooling over sequence length, shape: [batch_size, embed_dim]

            # Learnable update factor
            gate = self.sigmoid(self.memory_gate(update_value))  # Shape: [batch_size, 1]
            gate = gate.expand_as(update_value)  # Shape: [batch_size, embed_dim]

            # Expand self.memory_bank[0] to match the batch dimension
            memory_slot = updated_memory_bank[0].unsqueeze(0).expand_as(update_value)  # Shape: [batch_size, embed_dim]

            # Smooth update: Blend new information with existing memory
            updated_memory_slot = (gate * update_value) + (1 - gate) * memory_slot

            # Update the memory bank
            updated_memory_bank[0] = updated_memory_slot.mean(dim=0)  # Average over batch dimension

            # Assign the updated memory bank
            self.memory_bank.data = updated_memory_bank.data  # Non-inplace update


# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
hidden_dim = 2048  # FFN hidden dimension
num_heads = 8
memory_size = 100  # Memory bank size

# ✅ Initialize Layer 2 with Memory
layer2_with_memory = Layer2WithMemory(embed_dim, hidden_dim, num_heads, memory_size).to(device)

# ✅ Example Input (Layer 1 ka output)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply Layer 2 with Memory
output = layer2_with_memory(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Layer 2 Output Shape:", output.shape)  # Expected: (8, 128, 512)
print("✅ Output dtype:", output.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Layer 2 Output Shape: torch.Size([8, 128, 512])
✅ Output dtype: torch.float32


In [128]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ✅ Multi-Head Self-Attention Layer
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads  # Store num_heads as an attribute
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "Embedding dimension must be divisible by number of heads"

        # Fused QKV Projection (Single Linear Layer for Efficiency)
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim, dtype=torch.float32)
        self.out_proj = nn.Linear(embed_dim, embed_dim, dtype=torch.float32)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Compute Q, K, V in a single pass
        qkv = self.qkv_proj(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        Q, K, V = qkv.unbind(dim=2)  # Split into separate tensors

        # Reshape for multi-head attention
        Q = Q.transpose(1, 2)  # Shape: (batch, num_heads, seq_len, head_dim)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        # Flash Attention (Optimized Scaled Dot-Product Attention)
        if mask is not None:
            mask = mask.to(dtype=x.dtype, device=x.device)  # Ensure mask is on the correct device and dtype
        output = F.scaled_dot_product_attention(Q, K, V, attn_mask=mask)

        # Reshape back to original shape
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

        # Apply output projection
        return self.out_proj(output)

# ✅ Chunked Attention (Fixed)
class ChunkedAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, chunk_size=32):
        super(ChunkedAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.chunk_size = chunk_size
        self.self_attention = MultiHeadSelfAttention(embed_dim, num_heads)  # Self-Attention

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.shape

        # Step 1: Pad input if necessary
        pad_len = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
        x = F.pad(x, (0, 0, 0, pad_len))  # Pad along seq_len
        seq_len += pad_len

        # Step 2: Divide input into chunks
        num_chunks = seq_len // self.chunk_size
        x = x.view(batch_size, num_chunks, self.chunk_size, embed_dim)  # (batch, num_chunks, chunk_size, embed_dim)

        # Step 3: Apply self-attention to each chunk
        x = x.reshape(batch_size * num_chunks, self.chunk_size, embed_dim)  # (batch * num_chunks, chunk_size, embed_dim)
        x = self.self_attention(x)  # Apply self-attention
        x = x.reshape(batch_size, num_chunks, self.chunk_size, embed_dim)  # Reshape back

        # Step 4: Combine chunks back into sequence
        x = x.reshape(batch_size, seq_len, embed_dim)  # Reshape back

        return x[:, :seq_len - pad_len, :]  # Remove padding

# ✅ LayerNorm Wrapper
class LayerNorm(nn.Module):
    def __init__(self, embed_dim):
        super(LayerNorm, self).__init__()
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        if x.dtype != torch.float32:
            x = x.to(torch.float32)  # Ensure input is float32
        return self.norm(x)

# ✅ Recurrent Memory (Fixed)
class RecurrentMemory(nn.Module):
    def __init__(self, embed_dim, memory_size, num_heads):  # Add num_heads as a parameter
        super(RecurrentMemory, self).__init__()
        self.memory_size = memory_size
        self.num_heads = num_heads  # Store num_heads as an attribute
        self.memory_bank = nn.Parameter(torch.zeros(memory_size, embed_dim, dtype=torch.float32))  # Use float32
        self.memory_norm = LayerNorm(embed_dim)  # Normalize memory output
        self.memory_attention = nn.MultiheadAttention(embed_dim, num_heads=num_heads, batch_first=True)  # Use num_heads

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.shape

        # Step 1: Memory Read (Multi-query attention)
        memory_expanded = self.memory_bank.unsqueeze(0).expand(batch_size, -1, -1)  # (batch, memory_size, embed_dim)
        memory_output, _ = self.memory_attention(x, memory_expanded, memory_expanded)  # (batch, seq_len, embed_dim)
        memory_output = self.memory_norm(memory_output)  # Normalize memory output

        # Step 2: Integrate memory output into main stream
        x = x + memory_output  # Add memory output to input

        # Step 3: Memory Write (Update memory bank)
        self.update_memory(x)

        return x

    def update_memory(self, x):
        with torch.no_grad():
            # Aggregate information for update
            update_value = x.mean(dim=1).mean(dim=0)

            # Create a new memory bank with shifted values
            updated_memory_bank = torch.roll(self.memory_bank, shifts=-1, dims=0)
            updated_memory_bank[-1] = update_value

            # Update the memory bank using a non-in-place operation
            self.memory_bank.data = updated_memory_bank  # Assign to .data to avoid in-place operation


# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
num_heads = 8
chunk_size = 32
memory_size = 10

# ✅ Initialize Modules
multi_head_attention = MultiHeadSelfAttention(embed_dim, num_heads).to(device)
chunked_attention = ChunkedAttention(embed_dim, num_heads, chunk_size).to(device)
layer_norm = LayerNorm(embed_dim).to(device)
# ✅ Initialize Recurrent Memory
recurrent_memory = RecurrentMemory(embed_dim, memory_size, num_heads).to(device)
# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply Multi-Head Self-Attention
output_multi_head_attention = multi_head_attention(input_embeddings)

# ✅ Apply Chunked Attention
output_chunked_attention = chunked_attention(input_embeddings)

# ✅ Apply LayerNorm
output_layer_norm = layer_norm(input_embeddings)

# ✅ Apply Recurrent Memory
output_recurrent_memory = recurrent_memory(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Multi-Head Self-Attention Output Shape:", output_multi_head_attention.shape)  # Expected: (8, 128, 512)
print("✅ Chunked Attention Output Shape:", output_chunked_attention.shape)  # Expected: (8, 128, 512)
print("✅ LayerNorm Output Shape:", output_layer_norm.shape)  # Expected: (8, 128, 512)
print("✅ Recurrent Memory Output Shape:", output_recurrent_memory.shape)  # Expected: (8, 128, 512)

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Multi-Head Self-Attention Output Shape: torch.Size([8, 128, 512])
✅ Chunked Attention Output Shape: torch.Size([8, 128, 512])
✅ LayerNorm Output Shape: torch.Size([8, 128, 512])
✅ Recurrent Memory Output Shape: torch.Size([8, 128, 512])


In [129]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# from FeedforwardNetwork import LayerNorm,MultiHeadSelfAttention,ResidualConnection,FeedforwardNetwork

device = 'cuda' if torch.cuda.is_available() else 'cpu'




class RewardModel(nn.Module):
    def __init__(self, embed_dim):
        super(RewardModel, self).__init__()
        self.linear1 = nn.Linear(embed_dim, 256)  # First linear layer
        self.linear2 = nn.Linear(256, 1)  # Second linear layer
        self.activation = nn.ReLU()  # Activation function
        self.dropout = nn.Dropout(0.1)  # Dropout for regularization

    def forward(self, x):
        x = self.activation(self.linear1(x))  # Apply first linear layer and activation
        x = self.dropout(x)  # Apply dropout
        x = self.linear2(x).float()  # Apply second linear layer and ensure float32 output
        return x  # Shape: (batch, seq_len, 1)
class PPOOptimizer:
    def __init__(self, model, reward_model, lr=1e-4, gamma=0.99, clip_epsilon=0.2, entropy_coef=0.01):
        self.model = model
        self.reward_model = reward_model
        self.optimizer = optim.Adam(model.parameters(), lr=lr)  # Adam optimizer
        self.gamma = gamma  # Discount factor
        self.clip_epsilon = clip_epsilon  # Clipping parameter for PPO
        self.entropy_coef = entropy_coef  # Entropy coefficient

    def compute_advantages(self, rewards, values):
        """
        Compute advantages using Generalized Advantage Estimation (GAE).
        """
        advantages = torch.zeros_like(rewards)  # Initialize advantages
        last_advantage = 0  # Initialize last advantage

        # Vectorized GAE computation
        for t in reversed(range(len(rewards))):
            next_value = values[t + 1] if t < len(rewards) - 1 else 0  # Handle last timestep
            delta = rewards[t] + self.gamma * next_value - values[t]  # Compute delta
            advantages[t] = delta + self.gamma * last_advantage  # Update advantages
            last_advantage = advantages[t]  # Update last advantage

        return advantages

    def update(self, states, actions, rewards, old_log_probs, values):
        print("🔍 PPO Update Start")

        advantages = self.compute_advantages(rewards, values)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        print(f"🔹 Advantages: {advantages.shape}")

        # Get outputs from the model
        output, new_log_probs, _, new_values, _ = self.model(states, actions)  # Unpack to match model output

        # No need to call model again
        # new_log_probs, new_values, entropy = self.model(states, actions)

        print(f"✅ Log Probs: {new_log_probs.shape}, ✅ Values: {new_values.shape}") # , ✅ Entropy: {entropy.shape}

        ratio = torch.exp(new_log_probs - old_log_probs)
        policy_loss = -torch.min(ratio * advantages,
                                 torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages).mean()

        returns = advantages + values
        value_loss = F.mse_loss(new_values, returns)

        # entropy_loss = -entropy.mean() # entropy not calculated
        loss = policy_loss + 0.5 * value_loss # + self.entropy_coef * entropy_loss # entropy_loss removed

        print(f"📉 Loss: {loss.item()}")
        self.loss = loss

        self.optimizer.zero_grad()
        loss.backward(retain_graph = True)
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
        self.optimizer.step()

        print("✅ PPO Update Done!")


# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512

# ✅ Initialize Reward Model
reward_model = RewardModel(embed_dim).to(device)

# ✅ Example Input (Random Token Embeddings)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply Reward Model
rewards = reward_model(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Rewards Shape:", rewards.shape)  # Expected: (8, 128, 1)
print("✅ Rewards dtype:", rewards.dtype)  # Expected: float32





✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Rewards Shape: torch.Size([8, 128, 1])
✅ Rewards dtype: torch.float32


In [130]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# from FeedforwardNetwork import LayerNorm, MultiHeadSelfAttention, ResidualConnection, FeedforwardNetwork
# from chunkMemory import ChunkedAttention, RecurrentMemory
# from Layer3.second_of_layer3 import PPOOptimizer

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set random seed for reproducibility
torch.manual_seed(42)


class Layer3WithContextAndRL(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_heads, memory_size, dropout=0.1, num_actions=10):
        super(Layer3WithContextAndRL, self).__init__()
        # Sublayers
        self.pre_layer_norm = LayerNorm(embed_dim)  # Pre-LayerNorm
        self.self_attention = MultiHeadSelfAttention(embed_dim, num_heads)  # Self-Attention
        self.residual1 = ResidualConnection(embed_dim, dropout)  # Residual Connection 1
        self.layer_norm1 = LayerNorm(embed_dim)  # LayerNorm after Self-Attention

        # Context Handling
        self.chunked_attention = ChunkedAttention(embed_dim, num_heads)  # Chunked Attention
        self.recurrent_memory = RecurrentMemory(embed_dim, memory_size, num_heads)  # Recurrent Memory

        # Feedforward Network
        self.ffn = FeedforwardNetwork(embed_dim, hidden_dim, dropout)  # FFN
        self.residual2 = ResidualConnection(embed_dim, dropout)  # Residual Connection 2
        self.layer_norm2 = LayerNorm(embed_dim)  # LayerNorm after FFN

        # RL Integration
        self.reward_model = nn.Linear(embed_dim, 1)  # Reward Model
        self.policy_network = nn.Linear(embed_dim, num_actions)  # Policy Network
        self.value_network = nn.Linear(embed_dim, 1)  # Value Network

        # PPO Optimizer
        self.ppo_optimizer = PPOOptimizer(model=self, reward_model=self.reward_model)  # Initialize PPO Optimizer

    def forward(self, x, actions=None):
        # Sublayer 1: Self-Attention with Residual Connection
        x = self.residual1(x, self.self_attention)
        x = self.layer_norm1(x)

        # Context Handling
        x = self.chunked_attention(x)
        x = self.recurrent_memory(x)

        # Sublayer 2: Feedforward Network with Residual Connection
        x = self.residual2(x, self.ffn)
        x = self.layer_norm2(x)

        # RL Integration: Compute Rewards
        rewards = self.reward_model(x)  # Shape: (batch_size, seq_len, 1)

        # Policy Network (Action Selection)
        last_hidden_state = x[:, -1, :]  # Use the last hidden state for action selection
        logits = self.policy_network(last_hidden_state)
        probs = F.softmax(logits, dim=-1)

        # If actions are not provided, sample from policy
        if actions is None:
            actions = torch.multinomial(probs, num_samples=1).squeeze(-1)

        # Compute log probs for selected actions
        new_log_probs = torch.log(probs.gather(-1, actions.unsqueeze(-1))).squeeze(-1)

        # Value Network (State Value Estimation)
        values = self.value_network(last_hidden_state).squeeze(-1)

        return x, new_log_probs, actions, values, rewards  # Return all five values





# ✅ Hyperparameters
batch_size = 8
seq_len = 128
embed_dim = 512
hidden_dim = 2048  # FFN hidden dimension
num_heads = 8
memory_size = 100  # Memory bank size
num_actions = 10  # Number of actions

# ✅ Initialize Layer 3 with Context and RL
layer3_with_context_and_rl = Layer3WithContextAndRL(
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_heads=num_heads,
    memory_size=memory_size,
    num_actions=num_actions
).to(device)

# ✅ Example Input (Layer 2 ka output)
input_embeddings = torch.randn(batch_size, seq_len, embed_dim, dtype=torch.float32, device=device)

# ✅ Apply Layer 3 with Context and RL
output, new_log_probs, actions, values, rewards = layer3_with_context_and_rl(input_embeddings)

# ✅ Debugging Info
print("✅ Input Embeddings Shape:", input_embeddings.shape)  # Expected: (8, 128, 512)
print("✅ Layer 3 Output Shape:", output.shape)  # Expected: (8, 128, 512)
print("✅ New Log Probs Shape:", new_log_probs.shape)  # Expected: (8,)
print("✅ Actions Shape:", actions.shape)  # Expected: (8,)
print("✅ Values Shape:", values.shape)  # Expected: (8,)
print("✅ Rewards Shape:", rewards.shape)  # Expected: (8, 128, 1)
print("✅ Output dtype:", output.dtype)  # Expected: float32

✅ Input Embeddings Shape: torch.Size([8, 128, 512])
✅ Layer 3 Output Shape: torch.Size([8, 128, 512])
✅ New Log Probs Shape: torch.Size([8])
✅ Actions Shape: torch.Size([8])
✅ Values Shape: torch.Size([8])
✅ Rewards Shape: torch.Size([8, 128, 1])
✅ Output dtype: torch.float32


In [131]:
import os
import torch
from torch.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
import json
from pathlib import Path

# Enable synchronous CUDA execution for better debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for training 🔥")

# Import custom modules
# from TokenizationLayer import TokenizationLayer
# from TokenEmbedding import TokenEmbedding
# from Layer1Stack import Layer1
# from Layer2WithMemory import Layer2WithMemory
# from Layer3.third_of_layer3_stacking import Layer3WithContextAndRL


# ----------------------------
# 1. Dataset Class - Handles both text and JSON files
# ----------------------------
class BhaiDataset(Dataset):
    def __init__(self, file_path):
        self.file_path = Path(file_path)
        self.data = self._load_data()

    def _load_data(self):
        # Load data based on file type
        if self.file_path.suffix == '.txt':
            return self._load_txt()
        elif self.file_path.suffix == '.json':
            return self._load_json()
        else:
            raise ValueError("Bhai, only .txt or .json files are supported!")

    def _load_txt(self):
        with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f:
            return [self._clean(line) for line in f if line.strip()]

    def _load_json(self):
        with open(self.file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return self._parse_json(data)

    def _clean(self, text):
        # Clean text by removing unwanted characters
        bad_chars = {'\x00', '\ufffd', '�', '\r'}
        return ''.join(c for c in text if c not in bad_chars).strip()

    def _parse_json(self, data):
        # Parse JSON data (supports both simple and nested JSON)
        texts = []
        if isinstance(data, list):
            for item in data:
                if 'text' in item:
                    texts.append(self._clean(item['text']))
                elif 'content' in item:
                    texts.append(self._clean(item['content']))
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, str):
                    texts.append(self._clean(value))
        return texts

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


# ----------------------------
# 2. Training Loop (GPU optimized)
# ----------------------------
def bhai_trainer(dataset_path, epochs=10):
    # Initialize dataset and tokenizer
    dataset = BhaiDataset(dataset_path)
    tokenizer = TokenizationLayer()

    # DataLoader with smart collate function
    def collate_fn(batch):
        tokens = tokenizer.tokenize(batch, max_length=128)  # Reduced sequence length
        return tokens.to(device)

    dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)  # Reduced batch size

    # Initialize model components
    embedder = TokenEmbedding(vocab_size=tokenizer.vocab_size, embed_dim=256).to(device)
    layer1 = Layer1(256, 1024, 8).to(device)
    layer2 = Layer2WithMemory(256, 1024, 8, 100).to(device)
    layer3 = Layer3WithContextAndRL(256, 1024, 8, 100).to(device)

    # Optimizer and AMP setup
    optimizer = layer3.ppo_optimizer.optimizer
    scaler = GradScaler()  # Correct initialization

    # Training loop
    for epoch in range(epochs):
        for batch_idx, inputs in enumerate(dataloader):
            with autocast(device_type = 'cuda'):  # Mixed precision
                # Forward pass
                emb = embedder(inputs)
                print(f"Embeddings shape: {emb.shape}")  # Debugging
                l1 = layer1(emb)
                print(f"Layer 1 output shape: {l1.shape}")  # Debugging
                l2 = layer2(l1)
                print(f"Layer 2 output shape: {l2.shape}")  # Debugging
                l3, log_probs, actions, values, rewards = layer3(l2)  # Unpack all five values
                print(f"Layer 3 output shape: {l3.shape}")  # Debugging

                # PPO update
                layer3.ppo_optimizer.update(
                    states=l2,
                    actions=actions,
                    rewards=rewards.mean(dim=1),  # Use rewards from layer3
                    old_log_probs=log_probs.detach(),
                    values=values
                )

            loss = layer3.ppo_optimizer.loss

            # Backpropagation
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            # Print progress
            if batch_idx % 10 == 0:
                gpu_mem = torch.cuda.memory_allocated() // 1024 ** 2
                print(f"Epoch {epoch + 1} | Batch {batch_idx} | Loss: {loss.item():.2f} | GPU Mem: {gpu_mem}MB")

        # Save model checkpoint
        torch.save({
            'layer1': layer1.state_dict(),
            'layer2': layer2.state_dict(),
            'layer3': layer3.state_dict(),
            'embedder': embedder.state_dict()
        }, f"bhai_llm_epoch_{epoch + 1}.pt")


# ----------------------------
# 3. Main Execution
# ----------------------------
if __name__ == "__main__":
    import argparse

    # Set up argument parser
    parser = argparse.ArgumentParser(description='Bhai ka LLM Trainer')
    parser.add_argument('--dataset', type=str, required=True, help='Path to .txt or .json file')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs')

    # For testing, hardcode the dataset path
    args = parser.parse_args([
        '--dataset', '/content/100.txt',  # Replace with your dataset path
        '--epochs', '10'
    ])

    # Start training
    print(f"\nBhai, training shuru kar raha hoon 🚀")
    print(f"Dataset: {args.dataset}")
    print(f"Epochs: {args.epochs}\n")

    bhai_trainer(args.dataset, args.epochs)

Using cpu for training 🔥

Bhai, training shuru kar raha hoon 🚀
Dataset: /content/100.txt
Epochs: 10



  value_loss = F.mse_loss(new_values, returns)


Embeddings shape: torch.Size([2, 128, 256])
Layer 1 output shape: torch.Size([2, 128, 256])
Layer 2 output shape: torch.Size([2, 128, 256])
Layer 3 output shape: torch.Size([2, 128, 256])
🔍 PPO Update Start
🔹 Advantages: torch.Size([2, 1])
✅ Log Probs: torch.Size([2]), ✅ Values: torch.Size([2])
📉 Loss: 0.5688914060592651
✅ PPO Update Done!


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [256, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
import torch
import torch.nn.functional as F

def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50, top_p=0.95):
    """
    Generate text using a trained model with temperature scaling, top-k filtering, and top-p sampling.

    Args:
        model: The trained model.
        tokenizer: The tokenizer for encoding/decoding text.
        prompt: The input prompt (string).
        max_length: Maximum length of the generated sequence.
        temperature: Temperature for scaling logits (higher = more random).
        top_k: Top-k filtering (0 to disable).
        top_p: Top-p (nucleus) sampling (0.0 to disable).

    Returns:
        generated_text: The generated text (string).
    """
    model.eval()  # Set model to evaluation mode
    device = next(model.parameters()).device  # Get the device of the model

    # Tokenize input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    generated_ids = input_ids.clone()  # Store generated tokens

    with torch.no_grad():  # Disable gradient calculation
        for _ in range(max_length):
            # Forward Pass
            outputs = model(input_ids)
            logits = outputs.logits[:, -1, :]  # Get logits for the last token

            # Apply temperature scaling
            if temperature != 1.0:
                logits = logits / temperature

            # Top-k Filtering
            if top_k > 0:
                top_k_values, _ = torch.topk(logits, top_k)
                logits[logits < top_k_values[:, -1].unsqueeze(-1)] = -float('inf')

            # Top-p (Nucleus) Sampling
            if top_p > 0.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = -float('inf')

            # Sample next token
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append generated token to input
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)

            # Stop if end-of-sequence token is generated
            if next_token.item() == tokenizer.eos_token_id:
                break

    # Decode generated tokens to text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Generate text
prompt = "Once upon a time"
generated_text = generate_text(model, tokenizer, prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.9)
print(generated_text)