In [None]:
!pip install gensim

In [None]:
! pip3 install tiktoken

In [None]:
import importlib
import tiktoken

In [None]:
# Read the whole training text into memory as a single string.
# NOTE(review): the absolute path looks like a Google Drive mount inside Colab —
# confirm Drive is mounted before this cell runs, or move the path to a config cell.
with open("/content/drive/MyDrive/Building a LLM/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# len() of a str counts characters, not tokens.
print("Total number of character:", len(raw_text))

## Building the Input-Target Pairs

---



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
# tiktoken encoding named "gpt2"; reused by later cells.
tokenizer = tiktoken.get_encoding("gpt2")
# Token IDs for the full text; len(encoded_text) is the number of tokens in the
# text (not the size of the tokenizer's vocabulary).
encoded_text = tokenizer.encode(raw_text)

class LLMDataSetVersion1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once. A window of ``max_length`` tokens is then moved
  across the token sequence in steps of ``stride``; every target window is the
  matching input window shifted one token to the right.

  Args:
    text: the raw text to tokenize.
    tokenizer: object exposing ``encode(text, allowed_special=...)`` that
      returns a sequence of integer token IDs.
    max_length: number of tokens in each input/target window.
    stride: distance, in tokens, between the starts of consecutive windows.
  """

  def __init__(self,text,tokenizer,max_length,stride):
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    # Only start positions that leave room for the one-token-shifted target.
    window_starts = range(0, len(ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(ids[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(ids[start + 1:start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``idx``-th (input, target) pair of token-ID tensors."""
    window = self.input_ids[idx]
    shifted_window = self.target_ids[idx]
    return window, shifted_window



def DataLoaderVersion1(text, batch_size = 4, max_length = 256, stride = 128,
                         shuffle = True, drop_last = True, num_workers = 0):
  """Build a DataLoader of sliding-window (input, target) batches for ``text``.

  The text is tokenized with the tiktoken "gpt2" encoding and wrapped in an
  ``LLMDataSetVersion1``.

  Args:
    text: raw text to tokenize.
    batch_size: forwarded to ``DataLoader``.
    max_length: window length, in tokens, forwarded to the dataset.
    stride: step between window starts, forwarded to the dataset.
    shuffle: forwarded to ``DataLoader``.
    drop_last: forwarded to ``DataLoader``.
    num_workers: forwarded to ``DataLoader``.

  Returns:
    The configured ``DataLoader``.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = LLMDataSetVersion1(text, gpt2_encoding, max_length, stride)

  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)


# Small demo: batch size 1, windows of 4 tokens, stride equal to the window
# length (so consecutive input windows do not overlap), and no shuffling.
dataloader = DataLoaderVersion1(raw_text, batch_size = 1, max_length = 4, stride = 4, shuffle = False)
data_iter = iter(dataloader)
# The first batch holds the batched input IDs and the matching target IDs.
first_batch = next(data_iter)
print(first_batch)

## Building the Input Embeddings




In [None]:
import gensim.downloader as api
# Load the "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook — confirm it is still needed; this is presumably a large download.
model = api.load("word2vec-google-news-300")

In [None]:
# The token embedding table needs one row per entry in the tokenizer's
# vocabulary, because token IDs are used as row indices. The number of tokens
# in the text (len(encoded_text)) is unrelated to that and is smaller than the
# largest GPT-2 token ID, so using it would make lookups index out of range.
vocab_size = tokenizer.n_vocab
output_dimension = 256
context_length = 4

# Non-overlapping windows of `context_length` tokens, 8 sequences per batch.
dataloader = DataLoaderVersion1(raw_text, batch_size = 8, max_length = context_length, stride = context_length, shuffle = False)
data_iter = iter(dataloader)
# NOTE: `input` shadows the Python builtin; the name is kept because later
# cells display it.
input,target = next(data_iter)

# One learned vector per token ID and one per position inside the window.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dimension)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dimension)

In [None]:
# What a sample input matrix looks like: one row of token IDs per sequence in the batch.
# The goal is to turn each of these token IDs into a 256-dimensional input vector for the network.
# That vector is the sum of the token_embedding_layer output and the pos_embedding_layer output.
input

In [None]:
# Look up one embedding vector for each position 0 .. context_length - 1.
pos_embedding = pos_embedding_layer(torch.arange(context_length))
pos_embedding

In [None]:
# Targets for the batch above: per LLMDataSetVersion1, each row is the matching
# `input` row shifted one token ahead.
target

In [None]:
# The six-word example sentence; its words label the rows of the toy `inputs`
# tensor in the simplified-attention cell.
print("Your journey starts with one step")

In [None]:
# Implementing a simplified attention mechanism

# One 3-dimensional embedding per word of the example sentence.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

x_2 = inputs[1, :]     # A: embedding of the second word
d_in = inputs.size(1)  # B: size of each input embedding
d_out = 2              # C: size of each projected vector

# Untrained random projection matrices, drawn in the order query, key, value.
W_query = torch.nn.Parameter(torch.rand((d_in, d_out)), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand((d_in, d_out)), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand((d_in, d_out)), requires_grad=False)

# Project every input row into query, key and value space.
queries = torch.matmul(inputs, W_query)
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)

In [None]:
# Inspect the randomly initialised query projection matrix (d_in x d_out).
W_query

In [None]:
# Raw attention scores: the dot product of every query with every key, giving a
# (num_tokens, num_tokens) matrix — 6 x 6 for the example inputs.
attn_scores = queries @ keys.T
attn_scores

In [None]:
# Divide the scores by the square root of the key dimension (keys.shape[-1]),
# then apply softmax along the last axis so each query's weights sum to 1.
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
print(attn_weights)

In [None]:
# Coding out the dropout functionality
import torch

In [None]:
# 6 x 6 matrix of ones; the dropout layer is applied to it in the next cell.
example = torch.ones(6,6)
print(example)

In [None]:
# Fix the seed so the printed dropout pattern is reproducible.
torch.manual_seed(23)
# Dropout layer with drop probability 0.5, applied to the all-ones matrix.
dropout= torch.nn.Dropout(0.5)
print(dropout(example))

**Implementing Multi-Head Attention**

In [None]:
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each produced by one linear projection of
    width ``d_out`` and then split into ``num_heads`` heads of size
    ``d_out // num_heads``. An upper-triangular mask stops every position from
    attending to later positions.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: side length of the causal mask buffer.
        dropout: dropout probability applied to the attention weights.
        num_heads: number of attention heads.
        qkv_bias: whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        # `torch.nn` is spelled out in full: the notebook imports `torch` but
        # never binds a bare `nn` name, so `nn.Linear` would raise NameError.
        self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = torch.nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        # Registered as a buffer so it moves with the module across devices.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: tensor of shape (b, num_tokens, d_in).

        Returns:
            Tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Masked (future) positions get -inf so softmax assigns them zero weight
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before normalising over the key axis
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [None]:
# NOTE(review): looks like leftover debugging output — this cell can probably be deleted.
print("HI")

In [None]:
# Hyperparameter dictionary for a GPT-style model.
# NOTE(review): no cell visible here reads this dict yet, so the per-key notes
# below are inferred from the key names — confirm against the model code.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # presumably the tokenizer vocabulary size
    context_length=1024,  # presumably the maximum sequence length
    emb_dim=768,          # presumably the embedding width
    n_heads=12,           # presumably attention heads per layer
    n_layers=12,          # presumably the number of transformer blocks
    drop_rate=0.1,        # presumably the dropout probability
    qkv_bias=False,       # presumably bias on the query/key/value projections
)

In [None]:
!git clone https://[REDACTED_TOKEN]@github.com/Zidane-Virani/lm.git

In [None]:
# Push the `attention-mechanism` branch to the `origin` remote.
# NOTE(review): no cell visible here changes into the cloned `lm` directory
# before this runs — confirm the working directory is the repository.
!git push origin attention-mechanism

In [None]:
class DummyTransformerBlock(nn.Module):
  def_init__(self, config):