<a href="https://colab.research.google.com/github/Zhehao-CUI/DL_projet/blob/main/DL_projet_Zhehao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Import
import math, time, random, os
import torch
import torch.nn as nn
import torch.nn.functional as F

# 2. Reproducibility + Device
# Seed every random number generator in use so runs are repeatable, then
# pick the compute device: 'cuda' when a GPU is available, otherwise 'cpu'.
seed = 1228
random.seed(seed)                 # Python's built-in random module
torch.manual_seed(seed)           # PyTorch random number generator on the CPU
torch.cuda.manual_seed_all(seed)  # PyTorch CUDA random number generators on all GPUs

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print("device:", device)

if device != 'cpu':
    print("GPU:", torch.cuda.get_device_name(0))
    # Allow TF32 for CUDA matmuls and for cuDNN.
    # NOTE(review): the recorded output of this cell ends with a stray
    # `self.setter(val)` line, presumably a warning raised by these flag
    # assignments on the installed PyTorch version - confirm and migrate if so.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


device: cuda


  self.setter(val)


In [2]:
# 3. Download the dataset
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O input.txt # ! -> run a shell command to download the text file from site -o: output of command is "input.text"
text = open("input.txt", "r", encoding="utf-8").read()
print("dataset chars:", len(text))
print(text[:200])

dataset chars: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [3]:
# 4. Char tokenizer
# Character-level tokenizer: every distinct character of the corpus gets an
# integer ID. `stoi` maps character -> ID and `itos` maps ID -> character.
chars = sorted(set(text))  # unique characters of the dataset, in sorted order
print(chars[:10])
vocab_size = len(chars)    # number of unique characters
itos = dict(enumerate(chars))                  # ID -> character
stoi = {ch: idx for idx, ch in itos.items()}   # character -> ID (inverse mapping)

def encode(s: str) -> torch.Tensor:
    """
    Turn a string into a 1D tensor of character IDs (dtype torch.long).

    Each character is looked up in `stoi`; a character that is not a key
    of `stoi` raises KeyError.
    """
    char_ids = list(map(stoi.__getitem__, s))
    return torch.tensor(char_ids, dtype=torch.long)

def decode(ids) -> str:
    """
    Turn a sequence of character IDs back into a string.

    `ids` may be a tensor (converted with .tolist() first) or any iterable
    of IDs; every ID is looked up in `itos` and the characters are joined.
    """
    id_list = ids.tolist() if isinstance(ids, torch.Tensor) else ids
    pieces = [itos[token_id] for token_id in id_list]
    return "".join(pieces)

# Encode the whole corpus once, then keep the first 90% for training and the
# remaining 10% for validation.
data = encode(text)        # 1D tensor of character IDs for the entire dataset
n = int(0.9 * len(data))   # index where the validation part starts
train_data, val_data = data[:n], data[n:]
print("vocab_size:", vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']
vocab_size: 65


In [7]:

# 5. Training hyperparams
# Hyperparameters for training the model. Each name is assigned exactly once
# so the value in effect is the one a reader sees.
block_size    = 128   # context length : the model looks at 128 chars at a time to predict the next char
batch_size    = 32    # sequences per training step (32 x 128 = 4096 tokens processed in parallel)
max_iters     = 12000 # number of optimizer updates during training
eval_interval = 500   # every 500 steps, evaluate the averaged train/val loss
eval_iters    = 100   # when evaluating, average the loss over 100 batches to reduce noise

learning_rate = 3e-4  # learning rate (how big each parameter update step is)


# model size : 12 layers, 8 heads, 768-dim embeddings
n_embd  = 768
n_head  = 8
n_layer = 12
dropout = 0.1

In [4]:
# 6. Batch sampling (sliding window)
def get_batch(split: str):
    """
    Sample a random batch of `batch_size` windows, each `block_size` tokens long.

    `split` selects the source: "train" reads train_data, anything else reads
    val_data. Returns (x, y), both of shape (batch_size, block_size) and moved
    to `device`. y is x shifted one position ahead: given the characters up to
    position t, the target is the next character.
    """
    src = val_data if split != "train" else train_data
    # random window starts; with this upper bound the very last token of src
    # is never part of a sampled window
    starts = torch.randint(0, len(src) - block_size - 1, (batch_size,)).tolist()
    x = torch.stack([src[s:s + block_size] for s in starts])          # inputs
    y = torch.stack([src[s + 1:s + 1 + block_size] for s in starts])  # targets, one character ahead
    return x.to(device), y.to(device)

In [19]:
# 7. GPT model
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention with a causal mask: the token at position t can
    only attend to positions <= t.
    """

    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head  # the embedding width C is split evenly across heads

        # single fused projection producing queries, keys and values: C -> 3C
        self.qkv = nn.Linear(n_embd, 3 * n_embd, bias=False)
        # output projection applied after the heads are merged back: C -> C
        self.proj = nn.Linear(n_embd, n_embd, bias=False)
        self.dropout = nn.Dropout(dropout)

    def _to_heads(self, t):
        """Reshape (B, T, C) into (B, n_head, T, head_dim)."""
        batch, seq_len, _ = t.shape
        return t.view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """
        Apply causal multi-head self-attention.

        x: (B, T, C) token embeddings
        returns: (B, T, C) transformed embeddings (after output projection and dropout)
        """
        batch, seq_len, width = x.shape

        # split the fused projection into three chunks of size C, then into heads
        q, k, v = (self._to_heads(part) for part in self.qkv(x).split(width, dim=-1))

        # attention dropout is only active in training mode
        attn_dropout = self.dropout.p if self.training else 0.0
        heads = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=None,
            dropout_p=attn_dropout,
            is_causal=True,
        )  # (B, n_head, T, head_dim)

        # merge the heads back into one (B, T, C) tensor
        merged = heads.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.dropout(self.proj(merged))

class MLP(nn.Module):
    """
    Feed-forward network: linear expansion to 4x width, GELU, linear
    projection back to the original width, then dropout.
    """

    def __init__(self, n_embd, dropout):
        super().__init__()
        hidden = 4 * n_embd
        self.fc1 = nn.Linear(n_embd, hidden)  # expand: C -> 4C
        self.fc2 = nn.Linear(hidden, n_embd)  # project back down: 4C -> C
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """x -> fc1 -> GELU -> fc2 -> dropout; output has the same shape as x."""
        h = self.fc1(x)
        h = F.gelu(h)
        h = self.fc2(h)
        return self.dropout(h)

class Block(nn.Module):
    """
    Transformer decoder block with two residual sub-layers, each normalised
    first: LayerNorm -> causal self-attention -> add, then
    LayerNorm -> MLP -> add.
    """

    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = MLP(n_embd, dropout)

    def forward(self, x):
        # required form: each sub-layer output is added back onto its own input
        attn_update = self.attn(self.ln1(x))
        x = x + attn_update                   # residual add: x + attention update
        mlp_update = self.mlp(self.ln2(x))
        return x + mlp_update                 # residual add: x + MLP update

class GPT(nn.Module):
    """
    Decoder-only Transformer language model: token + position embeddings,
    `n_layer` decoder blocks, a final LayerNorm and a linear head producing
    logits over the vocabulary.
    """

    def __init__(self, vocab_size, block_size, n_embd, n_head, n_layer, dropout):
        super().__init__()
        self.block_size = block_size
        self.wte = nn.Embedding(vocab_size, n_embd)      # token embeddings (WTE): token ID -> vector of length n_embd
        self.wpe = nn.Embedding(block_size, n_embd)      # position embeddings (WPE): one vector per position
        self.drop = nn.Dropout(dropout)

        # stack of n_layer Transformer decoder blocks with identical configuration
        self.blocks = nn.ModuleList([Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)                 # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)  # hidden state -> logits over the vocabulary

        self.apply(self._init_weights)  # run the custom init on every submodule

    def _init_weights(self, m):
        """Draw Linear and Embedding weights from N(0, 0.02^2); other parameters are left untouched."""
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Forward pass of the GPT model.

        idx: (B, T) token IDs with T <= block_size.
        targets: optional (B, T) token IDs to predict.

        Returns (logits, loss):
          1. logits (B, T, vocab_size): scores for the next token at every position
          2. loss: cross-entropy against `targets`, or None when no targets are given
        """
        B, T = idx.shape
        assert T <= self.block_size, "Sequence length exceeds block_size"

        pos = torch.arange(0, T, device=idx.device)  # position indices 0..T-1
        tok_emb = self.wte(idx)              # (B,T) -> (B,T,C)
        pos_emb = self.wpe(pos)              # (T,)  -> (T,C), broadcast over the batch
        x = self.drop(tok_emb + pos_emb)     # (B,T,C)

        for blk in self.blocks:
            x = blk(x)  # pass x through the decoder blocks in order

        x = self.ln_f(x)
        logits = self.lm_head(x)  # unnormalized score for each vocabulary token at each position

        loss = None
        if targets is not None:
            # flatten batch and time so every position counts as one classification example
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, greedy=False):
        """
        Autoregressively append `max_new_tokens` tokens to `idx`.

        idx: (B, T0) starting token IDs.
        temperature: divides the logits before softmax (floored at 1e-8).
        top_k: if given, only the k highest-scoring tokens per row stay eligible.
        greedy: take the argmax instead of sampling from the distribution.

        Returns the (B, T0 + max_new_tokens) tensor of token IDs.
        Raises ValueError if `top_k` is given but smaller than 1.

        Generation runs in evaluation mode; the training/eval mode the model
        had on entry is restored before returning, so calling this in the
        middle of training does not silently disable dropout afterwards.
        """
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be a positive integer or None, got {top_k}")

        was_training = self.training
        self.eval()  # evaluation mode while sampling
        try:
            for _step in range(max_new_tokens):
                idx_cond = idx[:, -self.block_size:]  # keep at most the last block_size tokens: (B, <=block_size)
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / max(temperature, 1e-8)  # last position only: (B, V)

                if top_k is not None:
                    # keep only the top k highest logits per batch row
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float("inf")

                probs = F.softmax(logits, dim=-1)  # raw scores -> probabilities
                # choose the next token
                if greedy:
                    next_id = torch.argmax(probs, dim=-1, keepdim=True)
                else:
                    next_id = torch.multinomial(probs, num_samples=1)

                idx = torch.cat([idx, next_id], dim=1)
        finally:
            self.train(was_training)  # put the model back in the mode it was in
        return idx


In [15]:
# 8. Init model + optimizer adam
model = GPT(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # REQUIRED

# AMP: autocast and gradient scaling are switched on only when running on CUDA
use_amp = device == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

@torch.no_grad()
def estimate_loss():
    """
    Average the loss over `eval_iters` random batches for each split.

    Puts the model in eval mode while measuring and switches it to train
    mode before returning. Returns a dict {"train": mean_loss, "val": mean_loss}.
    """
    model.eval()
    averages = {}
    for split in ("train", "val"):
        running, count = 0, 0
        for _ in range(eval_iters):
            xb, yb = get_batch(split)
            with torch.amp.autocast("cuda", enabled=use_amp):
                _, batch_loss = model(xb, yb)
            running += batch_loss.item()
            count += 1
        averages[split] = running / count
    model.train()
    return averages


In [21]:

# 9. Train
# Runs max_iters optimizer steps on random training batches and prints the
# averaged train/val loss plus validation perplexity every eval_interval steps.
# NOTE(review): in the recorded run below this cell, val loss is lowest near
# iter 3000 (1.5093) and climbs to 2.34 by iter 9000 while train loss keeps
# falling - the model overfits; consider stopping early or keeping the best
# checkpoint.
# NOTE(review): t0 is stored but never read again in this cell.
t0 = time.time()
for it in range(1, max_iters + 1):
    x, y = get_batch("train")

    optimizer.zero_grad(set_to_none=True) # clear old gradients
    with torch.amp.autocast("cuda", enabled=use_amp): # forward pass (autocast only when use_amp is True)
        _, loss = model(x, y)

    scaler.scale(loss).backward() # backprop on the scaled loss
    scaler.step(optimizer)   # parameter update, routed through the scaler
    scaler.update()          # update the scaler for the next iteration

    if it % eval_interval == 0:
        losses = estimate_loss()
        val_ppl = math.exp(losses["val"]) # perplexity = exp(mean validation loss)
        print(f"iter {it}/{max_iters} | train {losses['train']:.4f} | val {losses['val']:.4f} | ppl {val_ppl:.2f}")

iter 500/12000 | train 2.0636 | val 2.1295 | ppl 8.41
iter 1000/12000 | train 1.6241 | val 1.7974 | ppl 6.03
iter 1500/12000 | train 1.4255 | val 1.6289 | ppl 5.10
iter 2000/12000 | train 1.3143 | val 1.5742 | ppl 4.83
iter 2500/12000 | train 1.2500 | val 1.5217 | ppl 4.58
iter 3000/12000 | train 1.1824 | val 1.5093 | ppl 4.52
iter 3500/12000 | train 1.1330 | val 1.5292 | ppl 4.61
iter 4000/12000 | train 1.0670 | val 1.5245 | ppl 4.59
iter 4500/12000 | train 1.0079 | val 1.5529 | ppl 4.72
iter 5000/12000 | train 0.9350 | val 1.6171 | ppl 5.04
iter 5500/12000 | train 0.8594 | val 1.6398 | ppl 5.15
iter 6000/12000 | train 0.7672 | val 1.7129 | ppl 5.55
iter 6500/12000 | train 0.6767 | val 1.7675 | ppl 5.86
iter 7000/12000 | train 0.5797 | val 1.8740 | ppl 6.51
iter 7500/12000 | train 0.5010 | val 2.0120 | ppl 7.48
iter 8000/12000 | train 0.4216 | val 2.1060 | ppl 8.22
iter 8500/12000 | train 0.3644 | val 2.2090 | ppl 9.11
iter 9000/12000 | train 0.3098 | val 2.3424 | ppl 10.41
iter 9500/

In [23]:
# 10. test
# Sample a continuation of a short prompt from the model and print it.
seed_str = "O God, O God!"
context = encode(seed_str)[None, :].to(device)  # add a batch dimension: (1, T)
with torch.no_grad():
    out = model.generate(
        context,
        max_new_tokens=800,
        temperature=0.9,
        top_k=50,
        greedy=False,
    )
print(decode(out[0]))

O God, O God! that e'er this tongue
But that raised him to the king entertain,
This satisfaction made the day of Juliet.

JULIET:
I will confirm thee to the extremest point.

ROMEO:
That art thou worthy then well for this action.

MERCUTIO:
Nay, I'll conjure too.
Romeo! humours! madman! passion! lover!
Appear thou in the likeness of a sigh:
Speak but one rhyme, and I am satisfied;
Cry but 'Ay me!' pronounce but 'love' and 'dove;'
Speak to my gossip Venus one fair word,
One nick-name for her persons.

VIRGILIA:
No, good madam; I will not over the
threshold till my lord return from the wars.

VALERIA:
Fie, you confine yourself most unreasonably: come, lend you thither.

VIRGILIA:

MISTRESS OVERDONE:
Why, what's that?

BUCKINGHAM:
Marry, my lord, will not prove it.

PARIS:
Do not you come to make your sta
