In [3]:
!wget https://github.com/Infatoshi/fcc-intro-to-llms/blob/main/wizard_of_oz.txt

--2023-12-27 00:17:16--  https://github.com/Infatoshi/fcc-intro-to-llms/blob/main/wizard_of_oz.txt
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 277633 (271K) [text/plain]
Saving to: ‘wizard_of_oz.txt’


2023-12-27 00:17:16 (7.01 MB/s) - ‘wizard_of_oz.txt’ saved [277633/277633]



In [4]:
import json

with open("wizard_of_oz.txt", "r", encoding = "utf-8") as f:
    text = f.read()

text = json.loads(text)
text = text["payload"]["blob"]["rawLines"]
text = ''.join(line.strip('\r') for line in text)
text[:200]


'\ufeff  DOROTHY AND THE WIZARD IN OZ  BY  L. FRANK BAUM  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.  ILLUSTRATED BY JOHN R. NEILL  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK  [Il'

In [34]:
# with open("wizard_of_oz.txt", "r", encoding = "utf-8") as f:
#     text = f.read()

print("total length of text : ", len(text))
print(text[:200])

chars = sorted(list(set(text)))
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)
print()
print("total unique characters : ", vocab_size)

encode = lambda s : [stoi[c] for c in s if c in stoi]
decode = lambda l : "".join([itos[i] for i in l])

e = encode("Hello World")
d = decode(e)

print(e)
print(d)

total length of text :  226887
﻿  DOROTHY AND THE WIZARD IN OZ  BY  L. FRANK BAUM  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.  ILLUSTRATED BY JOHN R. NEILL  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK  [Il

total unique characters :  80
[31, 57, 64, 64, 67, 0, 46, 67, 70, 64, 56]
Hello World


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape)


torch.Size([226887])


In [7]:
# HYPER PARAMETER
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

device = "mps"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
block_size = 128
learning_rate = 3e-5
max_iters = 10000
eval_iters = 500
dropout = 0.2
n_emb = 384
n_layer = 16
n_head = 16

In [8]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x = x.to(device)
    y = y.to(device)
    return x, y

x,y = get_batch("train")
print("input")
print(x)
print("targets")
print(y)

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

input
tensor([[61, 67, 66,  ...,  0, 77, 67],
        [ 0, 71, 73,  ..., 53, 61, 65],
        [49, 57, 54,  ..., 68, 68, 57],
        ...,
        [ 0, 72, 60,  ..., 60,  4, 71],
        [70, 53, 66,  ..., 56, 57, 55],
        [ 2, 32,  0,  ..., 72, 60, 57]], device='cuda:0')
targets
tensor([[67, 66, 21,  ..., 77, 67, 73],
        [71, 73, 70,  ..., 61, 65, 57],
        [57, 54, 10,  ..., 68, 57, 56],
        ...,
        [72, 60, 57,  ...,  4, 71,  0],
        [53, 66, 59,  ..., 57, 55, 64],
        [32,  0, 55,  ..., 60, 57,  0]], device='cuda:0')


In [9]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)       # (vocab_size, vocab_size)
        self.position_embedding_table = nn.Embedding(vocab_size, vocab_size)    # (vocab_size, vocab_size)

    def forward(self, indx, targets=None):                                      # (B, T), (B, T)
        logits = self.token_embedding_table(indx)                               # (B, T, vocab_size)
        loss = None

        if targets is None:
            return logits, loss
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)                                        # Since cross entropy accepts the tensors in the shape (B, C, T) we are squeezing the dimentions
            targets = targets.view(B*T)                                          # squeezing the targets dimention into 1D

            loss = F.cross_entropy(logits, targets)

            return logits, loss

    def generate(self, indx, max_new_tokens):
        # index is a (B, T) tensor consisting of the inputs
        for _ in range(max_new_tokens):
            # get the output from the model
            logits, loss = self(indx)
            # focus only on the last time step
            logits = logits[:, -1, :]                                           # (B, T)
            # apply the softmax on the logits to get the probabilities on the last dim
            probs = F.softmax(logits, dim=-1)                                   # (B, T)
            # choose a index based on the above calculated probability
            indx_nxt = torch.multinomial(probs, num_samples=1)                  # (B, 1)
            # append the new index to the current index array
            indx = torch.cat((indx, indx_nxt), dim=1)                             #(B, T+1)
        return indx



In [10]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype = torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



 Bf?6FUwHsc]wzIAflBoXgJPdhfA;g5K-7TNBLhPw4kV!&LF&"EbIXx2JTuRpV8)3r&l.o6,3x[3PdOgmAp)oGSM5GcIUBnlBzSK;YK6UORhNB1lEA6FC﻿jrxN:mJTA1foQ'zIRLhC qx QTD6qO"(4ph,y'zo1d&R[tFQ"o1ZKOk4R?oowf)hO8U﻿7q?nC .u*(toTOtQoX7oXj?ewg6gr E;b(TzA1CO4qojh] d8DBN5G-]ngQd.o0VW)LvaEL3Lc35QTNC.!-j_(vO bipFzkk6Bbf)1wAvrwRF&s,[AO,ys0v*1SL.Zg&N5yL 8URdO!h0mn4U5P﻿gvEL*PEbcx!&mCw. Qma-,gPDWV8D2AzAb0.P﻿tJdwI]&49XTxj_LK6MgEp*y&vmS:9"EFEJWF6c;)LYVlfBzk[l.N,:""E-7BAwfdO10Q﻿:-7_wIncJNZn[KLFVLR3Bl7xc&4Vi"ELoYNyM9PS.QGH(qs)&8oby1Kf)"0N


In [11]:
# Now traning the model to generate non random outputs
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
losses = []

for iter in range(max_iters):
    # sample a batch of training data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

    # track stats
    if iter % eval_iters == 0:
        _loss = estimate_loss(model)
        print(f"{iter} / {max_iters} loss = {_loss}")
print(losses[-1])


0 / 10000 loss = {'train': 4.859058380126953, 'val': 4.8314008712768555}
500 / 10000 loss = {'train': 4.834041595458984, 'val': 4.808613300323486}
1000 / 10000 loss = {'train': 4.810758113861084, 'val': 4.786744594573975}
1500 / 10000 loss = {'train': 4.786906719207764, 'val': 4.764069557189941}
2000 / 10000 loss = {'train': 4.765634536743164, 'val': 4.74163818359375}
2500 / 10000 loss = {'train': 4.742489814758301, 'val': 4.719244003295898}
3000 / 10000 loss = {'train': 4.721006393432617, 'val': 4.696011543273926}
3500 / 10000 loss = {'train': 4.698643684387207, 'val': 4.674439430236816}
4000 / 10000 loss = {'train': 4.675430774688721, 'val': 4.652345180511475}
4500 / 10000 loss = {'train': 4.65268087387085, 'val': 4.6289753913879395}
5000 / 10000 loss = {'train': 4.631362438201904, 'val': 4.608608722686768}
5500 / 10000 loss = {'train': 4.608999729156494, 'val': 4.585960388183594}
6000 / 10000 loss = {'train': 4.587167739868164, 'val': 4.5646748542785645}
6500 / 10000 loss = {'train'

In [29]:
# Decoder transformer architecture
from tqdm import tqdm

class FeedForward(nn.Module):
    def __init__(self, n_emb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


class SelfAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias = False)                            # (C, C // head_size)
        self.query = nn.Linear(n_emb, head_size, bias = False)                          # (C, C // head_size)
        self.value = nn.Linear(n_emb, head_size, bias = False)                          # (C, C // head_suze)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape

        # get the 'key' representation
        k = self.key(x)             # (B, T, head_size)
        # get the 'query' representation
        q = self.query(x)           # (B, T, head_size)
        # get the 'value' representation
        v = self.value(x)           # (B, T, head_size)

        # mat mul of key and query
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5     # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        # mask fill the key-query pair
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))    # (B, T, T)
        # apply softmax
        wei = F.softmax(wei, dim=-1)                            # (B, T, T)
        wei = self.dropout(wei)

        # calculate the out matrix
        out = wei @ v               # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttention(head_size=head_size) for _ in range(n_head)])
        self.droupout = nn.Dropout(dropout)
        self.proj = nn.Linear(n_head * head_size, n_emb)

    def forward(self, x):
        # concatenate the output from all the heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # apply dropout
        out = self.droupout(out)
        # perform a linear transformation
        out = self.proj(out)

        return out


class Block(nn.Module):
    def __init__(self, n_emb, n_head):
        super().__init__()
        # distribute the calculations evenly based on the head size
        head_size = n_emb // n_head
        self.self_attn = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        # checkpoint for the residual connection
        y = self.self_attn(x)                                   # (B, T, C)
        # add the residual checkpoint to the current layer
        x = self.ln1(x + y)                                     # (B, T, C)
        # checkpoint for the residual connection
        y = self.ffwd(x)                                        # (B, T, C)
        # add the residual checkpoint to the current layer
        x = self.ln2(x + y)                                     # (B, T, C)

        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        # converting each character into a series of embedding eg: 'a' can be represented by [0, 1, 2, ... n_emb]
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)       # (vocab_size, n_emb)
        # convertng each position in the embedding table into eg: 0th element can be represented by [0, 0, 0, 0, 0 ... n_emb]
        self.position_embedding_table = nn.Embedding(block_size, n_emb)    # (vocab_size, vocab_size)
        # how many 'Decoder blocks' does this model contain ?
        self.blocks = nn.Sequential(*[Block(n_emb, n_head=n_head) for _ in range(n_layer)])
        # final layer norm
        self.ln_f = nn.LayerNorm(n_emb)
        # final linear layer for the language model
        self.lm_head = nn.Linear(n_emb, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

    def forward(self, indx, targets=None):                                  # (B, T), (B, T)
        logits = self.token_embedding_table(indx)                           # (B, T, n_emb)
        loss = None

        # indx and targets are (B, T) tensor of integers
        B, T = indx.shape
        # get the token emebddings
        tok_emb = self.token_embedding_table(indx)                          # (B, T, n_emb) -> this can also be represented by (B, T, C)
        # get the positional embeddings -> rows should have same dimention as 'T' each row is represented by 'n_emb' tensors
        pos_emb = self.position_embedding_table(torch.arange(T, device = device))   # (T, C) -> this can also be represented by (B, T, C)
        # Add both positional and token embedding as
        x = tok_emb + pos_emb                                               # (B, T, C)
        # pass the cocktail of pos and token embedding to the Transformer Blocks
        x = self.blocks(x)                                                  # (B, T, C)
        # apply layer norm on the final output from the Transformer Blocks
        x = self.ln_f(x)                                                    # (B, T, C)
        # calcualte the logits by applying a linear transformation
        logits = self.lm_head(x)                                            # from (B, T, C {n_emb}) to (B, T, vocab_size)

        if targets is None:
            return logits, loss
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)                                        # Since cross entropy accepts the tensors in the shape (B, C, T) we are squeezing the dimentions
            targets = targets.view(B*T)                                          # squeezing the targets dimention into 1D

            loss = F.cross_entropy(logits, targets)

            return logits, loss

    def generate(self, indx, max_new_tokens):
        # index is a (B, T) tensor consisting of the inputs
        for _ in tqdm(range(max_new_tokens)):
            # crop idx to the last block_size tokens
            indx_nxt = indx[:, -block_size:]
            # get the output from the model
            logits, loss = self(indx_nxt)
            # focus only on the last time step
            logits = logits[:, -1, :]                                           # (B, T)
            # apply the softmax on the logits to get the probabilities on the last dim
            probs = F.softmax(logits, dim=-1)                                   # (B, T)
            # choose a index based on the above calculated probability
            indx_nxt = torch.multinomial(probs, num_samples=1)                  # (B, 1)
            # append the new index to the current index array
            indx = torch.cat((indx, indx_nxt), dim=1)                             #(B, T+1)
        return indx

def count_params(model):
    s = 0
    for p in model.parameters():
        if p.requires_grad:
            s += p.numel()

    return s

In [11]:
import time

model = GPTLanguageModel(vocab_size=vocab_size)
m = model.to(device, non_blocking=True)
print(device)
print("Total parameters : ", count_params(model))

# Now traning the model to generate non random outputs
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
losses = []
start_time = time.time()

for iter in range(max_iters):
    # sample a batch of training data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

    # track stats
    if iter % eval_iters == 0:
        _loss = estimate_loss(model)
        print(f"{iter} / {max_iters} loss = {_loss} time = {time.time() - start_time}")
        start_time = time.time()

print(losses[-1])



cuda
Total parameters :  28484432
0 / 10000 loss = {'train': 4.111767292022705, 'val': 4.122021675109863} time = 206.4224352836609
500 / 10000 loss = {'train': 2.4038760662078857, 'val': 2.465719699859619} time = 565.4793167114258
1000 / 10000 loss = {'train': 2.2587549686431885, 'val': 2.3316476345062256} time = 569.9537487030029
1500 / 10000 loss = {'train': 1.9816352128982544, 'val': 2.0777335166931152} time = 568.9667274951935
2000 / 10000 loss = {'train': 1.7736042737960815, 'val': 1.8770356178283691} time = 571.6390836238861
2500 / 10000 loss = {'train': 1.640278935432434, 'val': 1.767066478729248} time = 563.2495341300964
3000 / 10000 loss = {'train': 1.532592535018921, 'val': 1.6915935277938843} time = 566.9160177707672
3500 / 10000 loss = {'train': 1.4463616609573364, 'val': 1.6289525032043457} time = 566.468992471695
4000 / 10000 loss = {'train': 1.3776922225952148, 'val': 1.5893322229385376} time = 564.5683524608612
4500 / 10000 loss = {'train': 1.316519856452942, 'val': 1.5

In [12]:
torch.save(model, 'model.pt')

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F


model_path = "./model.pt"

model = torch.load(model_path)
model.eval()
m = model.to(device)

In [35]:
prompt = "In the enchanting land of Oz, a spirited young girl named Srinidhi found herself whisked away from her ordinary life into a realm of magic and whimsy. Srinidhi, much like Dorothy in the classic tale, embarked on a fantastical journey where she encountered peculiar characters and faced extraordinary challenges. Accompanied by newfound friends—a wise scarecrow, a compassionate tin woman, and a courageous lion—Srinidhi navigated the vibrant landscapes of Oz, seeking the elusive Wizard who held the key to her return home. Along the Yellow Brick Road, she discovered the strength within herself, overcoming obstacles with resilience and kindness. As the echoes of her adventures reverberated through the Emerald City, Srinidhi's tale became a modern fable of courage, friendship, and the boundless possibilities that unfold when one dares to dream."

context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=1000)[0].tolist())

gen_ch_split = 50

for i, ch in enumerate(range(len(generated_chars))[:-gen_ch_split:gen_ch_split]):
  print(generated_chars[ch:ch+gen_ch_split])


In the enchanting land of Oz, a spirited young gir
l named Srinidhi found herself whisked away from h
er ordinary life into a realm of magic and whimsy.
 Srinidhi, much like Dorothy in the classic tale, 
embarked on a fantastical journey where she encoun
tered peculiar characters and faced extraordinary 
challenges. Accompanied by newfound friendsa wise 
scarecrow, a compassionate tin woman, and a courag
eous lionSrinidhi navigated the vibrant landscapes
 of Oz, seeking the elusive Wizard who held the ke
y to her return home. Along the Yellow Brick Road,
 she discovered the strength within herself, overc
oming obstacles with resilience and kindness. As t
he echoes of her adventures reverberated through t
he Emerald City, Srinidhi's tale became a modern f
able of courage, friendship, and the boundless pos
sibilities that unfold when one dares to dream.But
 they grew stood beginning to remainin the start, 
and purrise going into the house. How for a few mo
ment and the blood weresto any 

In [None]:
from tqdm import tqdm
import os
import lzma

def xz_files_in_dir(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".xz") and os.path.isfile(os.path.join(directory, filename)):
            files.append(filename)
    return files

folder_path = "./"
output_file_train = "output_train.txt"
output_file_val = "output_val.txt"
vocab_file = "vocab.txt"
# split_files = int(input("How many files would you like to split this into?"))

files = xz_files_in_dir(folder_path)
total_files = len(files)

# calculate the split index
split_index = int(total_files * 0.9)
files_train = files[:split_index]
files_val = files[split_index:]

# process the files for training and validation seperately
vocab = set()

# process the training files
with open(output_file_train, "w", encoding="utf-8") as outfile:
    for filename in tqdm(files_train, total=len(files_train)):
        file_path = os.path.join(folder_path, filename)

        with lzma.open(file_path, "rt", encoding="utf-8") as infile:
            text = infile.read()
            outfile.write(text)
            characters = set(text)
            vocab.update(characters)

# process the validation files

with open(output_file_val, "w", encoding="utf-8") as outfile:
    for filename in tqdm(files_val, total=len(files_val)):
        file_path = os.path.join(folder_path, filename)

        with lzma.open(file_path, "rt", encoding="utf-8") as infile:
            text = infile.read()
            outfile.write(text)
            characters = set(text)
            vocab.update(characters)

# write the vocablury to vocab.txt
with open(vocab_file, "w", encoding = "utf-8") as vfile:
    for char in vocab:
        vfile.write(char + "/n")