# Simple GPT Structure Analysis

## Import libraries

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

## Hyperparameters

In [None]:
# --- Batching ---
batch_size = 64   # independent sequences processed in parallel
block_size = 256  # maximum context length used for a prediction

# --- Training / evaluation schedule ---
max_iters = 5000      # maximum number of training iterations
eval_interval = 500   # presumably iterations between evaluations; the loop using it is not shown here
eval_iters = 200      # batches averaged per split when estimating the loss
learning_rate = 3e-4

# --- Hardware ---
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Model size ---
n_embd = 384   # embedding dimension
n_head = 6     # attention heads running in parallel
n_layer = 6    # attention layers
dropout = 0.2  # dropout probability (regularisation against overfitting)

Set the random seed

In [None]:
# Seed torch's random number generator so random draws are reproducible across runs.
torch.manual_seed(1337)

## Input Text Processing
### Create encoder and decoder
    - Sort all the unique characters in the text and assign each character an integer index. Encoding maps every character of a string to its index; decoding reverses the mapping, turning a list of indices back into text.




In [None]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character in the corpus, sorted so that the
# character <-> index mapping is deterministic from run to run.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables backing encode/decode. They were referenced but never
# defined, which made the first call to encode() raise NameError.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises:
        KeyError: if ``s`` contains a character that does not occur in ``text``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled out by the iterable of integer ids ``l``.

    Raises:
        KeyError: if an id is outside ``range(vocab_size)``.
    """
    return ''.join(itos[i] for i in l)

### Train and test splits

In [None]:
# Train/validation split: encode the whole corpus as a 1-D tensor of
# character ids, then hold out the final 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
# `len` must be *called* on data; `0.9*len*data` multiplied the builtin
# function itself by the tensor and raised TypeError.
n = int(0.9 * len(data))  # index of the first validation element
train_data = data[:n]
val_data = data[n:]


### Create Batch

In [None]:
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: ``'train'`` draws from ``train_data``; any other value draws
            from ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape (batch_size, block_size)
        and moved to ``device``. ``y`` is ``x`` shifted one position to the
        right, i.e. ``y[b, t]`` is the element that follows ``x[b, t]``.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets in [0, len(source) - block_size). The exclusive
    # upper bound leaves room for the extra target element at i + block_size.
    # `size` must be a tuple: `(batch_size)` is a plain int and is rejected.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    # Targets always contain one 'future' element relative to x.
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


## Training Loop

### Estimate_Loss:
- In this case, the loss is estimated over 200 batches (`eval_iters`) for each split.
- The losses are averaged to give a more stable, less noisy estimate.

In [None]:
@torch.no_grad()  # evaluation only: do not build autograd graphs
def estimate_loss():
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with
    ``get_batch`` and their losses averaged. The model is put in eval mode
    for the duration and switched back to train mode before returning.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to the mean loss, each a
        0-dim tensor.
    """
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        # Call mean(); storing `losses.mean` kept the bound method, not the value.
        out[split] = losses.mean()
    model.train()
    return out

### Head
- **key, query, value** are defined by linear layers because they are learned transformations that project the input embeddings into different vector spaces; from these projections we can calculate the relationships (the 'attention scores') between tokens.

- **query**: ask questions about the input
- **key**: answers or information about each input element
- **value**: contains actual data or features that will be aggregated 
- **nn.Linear**: initialize each layer with its own set of weights
- **mask**: ensures that the model only attends to previous tokens and itself in the sequence.
- **softmax**: The reason why we apply softmax ***after mask*** is to ensure masked positions contribute zero probability

In [None]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output dimension of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        # Key, query and value are bias-free linear projections of the input.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask registered as a buffer: it is part of the
        # module's state but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (batch, time-step, channels), with
                time-step <= block_size.

        Returns:
            Tensor of shape (batch, time-step, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Attention scores, scaled by 1/sqrt(head_size).
        # transpose(-2, -1) swaps the last two dimensions of k.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Mask out future positions so each token attends only to itself
        # and earlier tokens; -inf becomes probability 0 after softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        # `self.dropout`, not `self,dropout`: the comma built a tuple and
        # called the module-level float.
        wei = self.dropout(wei)
        v = self.value(x)
        # Weighted average of the value vectors.
        return wei @ v

### MultiHead:


In [None]:
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    Args:
        num_heads: number of ``Head`` modules to create.
        head_size: output dimension of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # `for _ in range(...)`: the original `_in` was a syntax error.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Project the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate, project and apply dropout."""
        # Concatenate along the last dimension, giving head_size * num_heads features.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

### MLP layer (extract from memory):


In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

    Args:
        n_embd: input and output feature dimension; the hidden layer is
            four times as wide.
    """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # was misspelled `nn.Lienar`
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the shape is preserved."""
        return self.net(x)

### Block: binding together


In [None]:
class Block(nn.Module):
    """Transformer block: communication followed by computation"""
    
    def __init__(self, n_embd,n_head):
        head_size = n_embd // n_head
        self