In [None]:
%pip install fancy_einsum
%pip install einops

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Fetch the tiny-shakespeare corpus once. `-nc` (no-clobber) skips the download when
# input.txt already exists, so re-running this cell no longer piles up input.txt.1,
# input.txt.2, ... copies (the training script only ever reads `input.txt`).
!wget -nc https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-11-27 19:34:46--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-11-27 19:34:46 (25.0 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [None]:
%%writefile TASK_3_2DDP_Multiple_GPU.py


import os
# Set environment variables
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

import matplotlib.pyplot as plt
import einops
import datasets
import transformers
from fancy_einsum import einsum
import torch
import torch.nn as nn
import numpy as np
import math
import tqdm.auto as tqdm
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group,destroy_process_group
import os
import torch.distributed as dist

# Read the whole training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Configuration:
    """Plain container for the transformer hyperparameters used in this file.

    Args:
        dim_model: Width of the residual stream.
        debug: When True, the embedding / attention / MLP modules print the
            shapes of the tensors they receive.
        layer_norm_eps: Epsilon added to the variance inside layer norm.
        dim_vocab: Number of rows in the token-embedding table and number of
            output logits.
        init_range: Standard deviation used when initialising weights.
        n_ctx: Number of rows in the positional-embedding table.
        dim_head: Per-head query/key/value width.
        dim_mlp: Hidden width of the MLP.
        n_heads: Number of attention heads.
        n_layers: Number of decoder blocks.
    """

    def __init__(self, dim_model=768, debug=False, layer_norm_eps=1e-5,
                 dim_vocab=50257, init_range=0.02, n_ctx=1024,
                 dim_head=64, dim_mlp=3072, n_heads=12, n_layers=12):
        self.dim_model = dim_model
        # Store the caller's flag. It used to be hard-coded to False, which made
        # `Configuration(debug=True)` a silent no-op.
        self.debug = debug
        self.layer_norm_eps = layer_norm_eps
        self.dim_vocab = dim_vocab
        self.init_range = init_range
        self.n_ctx = n_ctx
        self.dim_head = dim_head
        self.dim_mlp = dim_mlp
        self.n_heads = n_heads
        self.n_layers = n_layers


    def __repr__(self):
        """Return a one-line, human-readable dump of every field."""
        return (f"Config(dim_model={self.dim_model}, debug={self.debug}, "
                f"layer_norm_eps={self.layer_norm_eps}, dim_vocab={self.dim_vocab}, "
                f"init_range={self.init_range}, n_ctx={self.n_ctx}, "
                f"d_head={self.dim_head}, dim_mlp={self.dim_mlp}, n_heads={self.n_heads}, "
                f"n_layers={self.n_layers})")


# Default-sized configuration (all constructor defaults).
cfg = Configuration()
# print(cfg)

class LayerNoralization(nn.Module):
    """Layer normalisation over the last axis with a learned gain and bias.

    Reads ``dim_model`` (size of the normalised axis) and ``layer_norm_eps``
    from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Gain starts at one and bias at zero, so the layer begins as a pure normaliser.
        self.w = nn.Parameter(torch.ones(cfg.dim_model))
        self.b = nn.Parameter(torch.zeros(cfg.dim_model))

    def forward(self, residual):
        """Normalise ``residual`` along its last axis, then scale and shift it."""
        # Make sure the learned parameters live where the activations live.
        target_device = residual.device
        gain = self.w.to(target_device)
        bias = self.b.to(target_device)

        # Zero-mean the features, then divide by the (biased) standard deviation.
        centered = residual - torch.mean(residual, dim=-1, keepdim=True)
        variance = torch.mean(centered ** 2, dim=-1, keepdim=True)
        std = torch.sqrt(variance + self.cfg.layer_norm_eps)

        return (centered / std) * gain + bias

class Embed(nn.Module):
    """Token-embedding lookup table of shape [dim_vocab, dim_model]."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Learnable embedding matrix, drawn from N(0, init_range^2).
        self.W_E = nn.Parameter(torch.empty((cfg.dim_vocab, cfg.dim_model)))
        nn.init.normal_(self.W_E, std=self.cfg.init_range)

    def forward(self, tokens):
        """Return the embedding row for every index in ``tokens``.

        The result has the shape of ``tokens`` with a trailing ``dim_model`` axis.
        """
        verbose = self.cfg.debug
        if verbose:
            print("Tokens:", tokens.shape)

        # Integer-tensor indexing gathers one row of W_E per token id.
        looked_up = self.W_E[tokens]

        if verbose:
            print("Embeddings:", looked_up.shape)
        return looked_up

class PositionalEmbedding(nn.Module):
    """Learned absolute positional embeddings, one row per position up to n_ctx."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Learnable table of shape [n_ctx, dim_model], drawn from N(0, init_range^2).
        self.W_pos = nn.Parameter(torch.empty((cfg.n_ctx, cfg.dim_model)))
        nn.init.normal_(self.W_pos, std=self.cfg.init_range)

    def forward(self, tokens):
        """Return positional embeddings broadcast to [batch, position, dim_model].

        Only the shape of ``tokens`` ([batch, position]) is used; its values are ignored.
        """
        verbose = self.cfg.debug
        if verbose:
            print("Tokens:", tokens.shape)

        n_batch, n_pos = tokens.size(0), tokens.size(1)
        # Take the first n_pos rows, then expand across the batch without copying.
        pos_embed = self.W_pos[:n_pos].unsqueeze(0).expand(n_batch, -1, -1)

        if verbose:
            print("pos_embed:", pos_embed.shape)
        return pos_embed

class SelfAttention(nn.Module):
    """Multi-head causal self-attention with explicit per-head weight tensors.

    Reads ``n_heads``, ``dim_model``, ``dim_head``, ``init_range`` and ``debug``
    from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        # Query / key / value projections: [n_heads, dim_model, dim_head] each,
        # with a [n_heads, dim_head] bias.
        self.W_Q = nn.Parameter(torch.randn(cfg.n_heads, cfg.dim_model, cfg.dim_head) * self.cfg.init_range)
        self.b_Q = nn.Parameter(torch.zeros((cfg.n_heads, cfg.dim_head)))

        self.W_K = nn.Parameter(torch.randn(cfg.n_heads, cfg.dim_model, cfg.dim_head) * self.cfg.init_range)
        self.b_K = nn.Parameter(torch.zeros((cfg.n_heads, cfg.dim_head)))

        self.W_V = nn.Parameter(torch.randn(cfg.n_heads, cfg.dim_model, cfg.dim_head) * self.cfg.init_range)
        self.b_V = nn.Parameter(torch.zeros((cfg.n_heads, cfg.dim_head)))

        # Output projection back to the residual stream: [n_heads, dim_head, dim_model].
        self.W_O = nn.Parameter(torch.randn(cfg.n_heads, cfg.dim_head, cfg.dim_model) * self.cfg.init_range)
        self.b_O = nn.Parameter(torch.zeros((cfg.dim_model)))

        # Score written into masked (future) positions before the softmax.
        # Deliberately created without device="cuda": a registered buffer already
        # follows the module through .to()/.cuda(), and hard-coding CUDA made the
        # module impossible to construct on CPU-only hosts.
        self.register_buffer("IGNORE", torch.tensor(-1e5, dtype=torch.float32))

    def forward(self, normalized_resid_pre):
        """Apply causal multi-head attention.

        Args:
            normalized_resid_pre: [batch, position, d_model] activations.

        Returns:
            [batch, position, d_model] attention output (summed over heads, plus b_O).
        """
        if self.cfg.debug: print("Normalized_resid_pre:", normalized_resid_pre.shape)

        q = einsum("batch query_pos d_model, n_heads d_model d_head -> batch query_pos n_heads d_head", normalized_resid_pre, self.W_Q) + self.b_Q

        k = einsum("batch key_pos d_model, n_heads d_model d_head -> batch key_pos n_heads d_head", normalized_resid_pre, self.W_K) + self.b_K

        attn_scores = einsum("batch query_pos n_heads d_head, batch key_pos n_heads d_head -> batch n_heads query_pos key_pos", q, k)

        # Scaled dot-product: divide by sqrt(d_head) before masking and softmax.
        attn_scores /= self.cfg.dim_head ** 0.5

        attn_scores = self.apply_causal_mask(attn_scores)

        pattern = attn_scores.softmax(dim=-1) # [batch, n_head, query_pos, key_pos]

        v = einsum("batch key_pos d_model, n_heads d_model d_head -> batch key_pos n_heads d_head", normalized_resid_pre, self.W_V) + self.b_V

        # Attention-weighted average of the values for every query position.
        z = einsum("batch n_heads query_pos key_pos, batch key_pos n_heads d_head -> batch query_pos n_heads d_head", pattern, v)

        attn_out = einsum("batch query_pos n_heads d_head, n_heads d_head d_model -> batch query_pos d_model", z, self.W_O) + self.b_O
        return attn_out

    def apply_causal_mask(self, attn_scores):
        """Overwrite scores of future keys (key_pos > query_pos) with ``IGNORE``.

        ``attn_scores`` ([..., query_pos, key_pos]) is modified in place and also returned.
        """
        n_query, n_key = attn_scores.shape[-2:]
        # A single [query_pos, key_pos] mask broadcasts over the leading axes, so
        # there is no need to materialise one copy per batch element and head.
        future_mask = torch.triu(
            torch.ones(n_query, n_key, device=attn_scores.device), diagonal=1
        ).bool()
        attn_scores.masked_fill_(future_mask, self.IGNORE)

        return attn_scores

class MultilayerPerceptron(nn.Module):
    """Two-layer feed-forward network with a tanh-approximated GELU in between.

    Reads ``dim_model``, ``dim_mlp``, ``init_range`` and ``debug`` from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        scale = self.cfg.init_range

        # Expansion: dim_model -> dim_mlp.
        self.W_in = nn.Parameter(torch.randn(cfg.dim_model, cfg.dim_mlp) * scale)
        self.b_in = nn.Parameter(torch.zeros((cfg.dim_mlp)))

        # Contraction: dim_mlp -> dim_model.
        self.W_out = nn.Parameter(torch.randn(cfg.dim_mlp, cfg.dim_model) * scale)
        self.b_out = nn.Parameter(torch.zeros((cfg.dim_model)))

    def forward(self, normalized_resid_mid):
        """Map [batch, position, d_model] activations through the MLP (same output shape)."""
        if self.cfg.debug:
            print("Normalized_resid_mid:", normalized_resid_mid.shape)

        hidden = einsum(
            "batch position dim_model, dim_model dim_mlp -> batch position dim_mlp",
            normalized_resid_mid,
            self.W_in,
        ) + self.b_in

        # GELU, tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        tanh_arg = np.sqrt(2.0 / np.pi) * (hidden + 0.044715 * torch.pow(hidden, 3.0))
        activated = 0.5 * hidden * (1.0 + torch.tanh(tanh_arg))

        return einsum(
            "batch position dim_mlp, dim_mlp dim_model -> batch position dim_model",
            activated,
            self.W_out,
        ) + self.b_out

class TransformerDecoderBlock(nn.Module):
    """One pre-layer-norm decoder block: attention then MLP, each added to the residual."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.ln1 = LayerNoralization(cfg)
        self.attn = SelfAttention(cfg)
        self.ln2 = LayerNoralization(cfg)
        self.mlp = MultilayerPerceptron(cfg)

    def forward(self, resid_pre):
        """Run the block on the residual stream.

        Args:
            resid_pre: [batch, position, d_model] residual stream entering the block.

        Returns:
            [batch, position, d_model] residual stream leaving the block.
        """
        # Attention sub-layer: normalise, attend, add back to the residual.
        attn_out = self.attn(self.ln1(resid_pre))
        resid_mid = resid_pre + attn_out

        # MLP sub-layer. `resid_mid` is reused here instead of recomputing
        # `resid_pre + attn_out` two more times; the arithmetic (and therefore the
        # result) is the same, (resid_pre + attn_out) + mlp_out.
        mlp_out = self.mlp(self.ln2(resid_mid))
        resid_post = resid_mid + mlp_out
        return resid_post


class GPT2_Custom_model(nn.Module):
    """Decoder-only transformer: embeddings -> n_layers blocks -> layer norm -> linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.embed = Embed(cfg)
        self.pos_embed = PositionalEmbedding(cfg)
        self.blocks = nn.ModuleList(
            TransformerDecoderBlock(cfg) for _ in range(cfg.n_layers)
        )
        self.ln_final = LayerNoralization(cfg)
        # Output head mapping the residual stream to one logit per vocabulary entry.
        self.linear_layer = nn.Linear(cfg.dim_model, cfg.dim_vocab)

    def forward(self, tokens):
        """Compute logits for a batch of token ids.

        Args:
            tokens: [batch, position] token indices.

        Returns:
            [batch, position, dim_vocab] logits.
        """
        # Token and positional embeddings are summed to seed the residual stream.
        residual = self.embed(tokens) + self.pos_embed(tokens)

        for decoder_block in self.blocks:
            residual = decoder_block(residual)

        return self.linear_layer(self.ln_final(residual))


# ---- Training hyperparameters ------------------------------------------------
# `batch_size` used to be assigned twice (5, then 8); only the second value was
# ever read, so the dead first assignment has been dropped.
batch_size = 8
num_epochs = 10
max_steps = 100
log_every = 10
lr = 1e-3
weight_decay = 1e-2
# NOTE(review): dim_vocab is 50257 while the tokenizer below is character-level
# (one id per distinct character of `text`), so most output logits can never be a
# target. Confirm whether len(chars) was intended before changing it.
model_cfg = Configuration(dim_model=768, n_heads=12, dim_head=64, dim_mlp=1024, n_layers=12, n_ctx=1024, dim_vocab=50257)


# ---- Character-level tokenizer -----------------------------------------------
# here are all the unique characters that occur in this text
chars = sorted(set(text))
# vocab_size = 2000
# create a mapping from characters to integers (and back)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a list of integer ids back to the string they spell."""
    return ''.join([itos[i] for i in l])


# ---- Dataset -------------------------------------------------------------------
# Encode the corpus once (it was previously encoded twice into the same variable),
# then keep the first 90% for training and the remainder for validation.
data = torch.tensor(encode(text), dtype=torch.long)
split_idx = int(0.9 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]





#####################################################################################################################################################
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '63000'

    # initialize the process group
    init_process_group(backend="nccl",rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group created by ``setup``."""
    # Same function as dist.destroy_process_group, via the name imported at the top of the file.
    destroy_process_group()



def get_batch(split, block_size=256, num_sequences=None):
    """Sample a random batch of (input, next-token target) windows.

    Args:
        split: Either a split name ('train' selects ``train_data``; any other
            non-tensor value selects ``val_data``, as before) or a 1-D tensor of
            token ids to sample from directly. Passing a tensor previously fell
            through the ``split == 'train'`` comparison and silently sampled
            from ``val_data``.
        block_size: Length of every sampled window (default 256, the value that
            used to be hard-coded).
        num_sequences: Number of windows to draw; defaults to the module-level
            ``batch_size``.

    Returns:
        ``(x, y)``, two ``[num_sequences, block_size]`` tensors, where ``y`` is
        ``x`` shifted one position to the right in the underlying data.

    Raises:
        ValueError: If the data holds fewer than ``block_size + 1`` tokens, i.e.
            not even one (input, target) window fits.
    """
    if torch.is_tensor(split):
        data = split
    else:
        data = train_data if split == 'train' else val_data

    n_windows = batch_size if num_sequences is None else num_sequences

    if len(data) <= block_size:
        raise ValueError(
            f"need at least {block_size + 1} tokens to sample a window of "
            f"length {block_size}, but got {len(data)}"
        )

    # randint's upper bound is exclusive, so the largest start is
    # len(data) - block_size - 1 and the target window ends exactly at len(data).
    starts = torch.randint(len(data) - block_size, (n_windows,)).tolist()
    x = torch.stack([data[i:i + block_size] for i in starts])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in starts])
    return x, y


#####################################################################################################################################################


def lm_cross_entropy_loss(logits, targets):
    """Mean cross-entropy between per-position logits and already-aligned targets.

    Args:
        logits: [..., vocab] unnormalised scores.
        targets: Integer class ids with the same leading shape as ``logits``;
            moved to ``logits.device`` if necessary.

    Returns:
        Scalar tensor: the loss averaged over every position.
    """
    targets = targets.to(logits.device)
    # reshape (not view) so that sliced, non-contiguous inputs are accepted too.
    logits_flat = logits.reshape(-1, logits.size(-1))
    targets_flat = targets.reshape(-1)
    # Calculate cross-entropy loss
    loss = F.cross_entropy(logits_flat, targets_flat)
    return loss

# Per-process history of training losses; train_loop appends one float per step.
losses = []
def train_loop(rank,model, optimizer, encoded_data, num_epochs, log_every, max_steps):
    """Train ``model`` with a next-token-prediction objective.

    Args:
        rank: Device (index or name) every batch is moved to.
        model: Callable mapping [batch, position] token ids to
            [batch, position, vocab] logits.
        optimizer: Optimizer over the model's parameters.
        encoded_data: Re-iterable collection of [batch, position] token tensors;
            it is iterated once per epoch.
        num_epochs: Number of passes over ``encoded_data``.
        log_every: Print the loss every this many steps within an epoch.
        max_steps: Maximum number of optimisation steps per epoch.

    Side effects:
        Appends each step's loss to the module-level ``losses`` list.
    """
    for epoch in range(num_epochs):
        for c, batch in enumerate(encoded_data):
            # Stop before the step, so at most `max_steps` steps run per epoch
            # (checking `c > max_steps` after the step allowed max_steps + 2).
            if c >= max_steps:
                break

            tokens = batch.to(rank)
            logits = model(tokens)

            # Next-token objective: the logits at position t are scored against
            # the token at t + 1. Scoring them against the token at t (as before)
            # only teaches the model to copy its input, which the causal mask lets
            # it see.
            loss = lm_cross_entropy_loss(logits[:, :-1], tokens[:, 1:])
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_value = loss.item()
            losses.append(loss_value)
            if c % log_every == 0:
                print(f"Epoch: {epoch}, Step: {c}, Loss: {loss_value:.4f}")


def Demo(rank,world_size):
    """Per-process entry point: train the model under DistributedDataParallel.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of participating processes.
    """
    print(f"Running basic DDP example on rank {rank}.")
    # Bind this process to its own GPU before any CUDA / NCCL work so that
    # default-device allocations do not all land on GPU 0.
    torch.cuda.set_device(rank)
    setup(rank, world_size)
    try:
        # One freshly sampled batch of training inputs per step. Previously the
        # (x, y) tuple of a single get_batch call was iterated as if it were a
        # two-batch dataset, and get_batch was handed a tensor where it expects
        # a split name.
        encoded_data = [get_batch('train')[0] for _ in range(max_steps)]
        model = GPT2_Custom_model(model_cfg).to(rank)
        ddp_model = DDP(model, device_ids=[rank])
        optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=lr, weight_decay=weight_decay)
        # Train through the DDP wrapper: calling the bare `model` skips DDP's
        # forward pass, so gradients are never all-reduced across ranks.
        train_loop(rank, ddp_model, optimizer, encoded_data, num_epochs, log_every, max_steps)
    #     print(f"rank {rank}, Loss: {loss.item():.4f}")
    finally:
        # Cleanly exit the distributed environment, even if training raised.
        cleanup()

def run_demo(demo_fn, world_size):
    """Spawn ``world_size`` worker processes running ``demo_fn`` and wait for them.

    ``mp.spawn`` supplies the process index as the first positional argument, so
    each worker is invoked as ``demo_fn(rank, world_size)``.
    """
    spawn_options = dict(args=(world_size,), nprocs=world_size, join=True)
    mp.spawn(demo_fn, **spawn_options)

if __name__ == "__main__":
    # Launch one worker process per visible GPU; the demo needs at least two.
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus
    run_demo(demo_fn=Demo, world_size=world_size)


Overwriting TASK_3_2DDP_Multiple_GPU.py


In [None]:
!python TASK_3_2DDP_Multiple_GPU.py

Running basic DDP example on rank 0.
Running basic DDP example on rank 1.
Epoch: 0, Step: 0, Loss: 11.0318
Epoch: 0, Step: 0, Loss: 11.0725
Epoch: 1, Step: 0, Loss: 6.8222
Epoch: 1, Step: 0, Loss: 6.8494
Epoch: 2, Step: 0, Loss: 4.9364
Epoch: 2, Step: 0, Loss: 4.9394
Epoch: 3, Step: 0, Loss: 3.7462
Epoch: 3, Step: 0, Loss: 3.7564
Epoch: 4, Step: 0, Loss: 3.4364
Epoch: 4, Step: 0, Loss: 3.4375
Epoch: 5, Step: 0, Loss: 3.3663
Epoch: 5, Step: 0, Loss: 3.3650
Epoch: 6, Step: 0, Loss: 3.3477
Epoch: 6, Step: 0, Loss: 3.3550
Epoch: 7, Step: 0, Loss: 3.3419
Epoch: 7, Step: 0, Loss: 3.3362
Epoch: 8, Step: 0, Loss: 3.3081
Epoch: 8, Step: 0, Loss: 3.2937
Epoch: 9, Step: 0, Loss: 3.2824
Epoch: 9, Step: 0, Loss: 3.2603
