In [2]:
from torch.utils.data import Dataset
import os
import torch
import tiktoken
import sentencepiece as spm
import subprocess
import wandb
import nltk

import torch.nn as nn
import torch.nn.functional as F

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# !pip freeze > requirements.txt

In [4]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
def check_requirements() -> bool:
    try:
        if not os.path.exists("requirements.txt"):
            raise FileNotFoundError("requirements.txt not found")

        result = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=True,  # Raise an exception if the command fails
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)
        return True

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return False

    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e.stderr}")
        return False

In [6]:
# check_requirements()

In [7]:
# # hyperparameters
# batch_size = 32
# block_size = 8
# max_iters = 3000
# eval_interval = 300
# learning_rate = 1e-2
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embed = 32
# # ------------

In [4]:
if not os.path.exists("Dataset.txt"):
    os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    os.rename("input.txt", 'Dataset.txt')

In [5]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, data: str, mode: str = "normal"):

        self.tokens = set(nltk.word_tokenize(data))
        self.mode = mode

        if mode == "normal":
            self.chars = sorted(set(train_text))  # get characters from the input data

            self.stoi = {ch: i for i, ch in enumerate(self.chars)}  # map characters to integer indices
            self.itos = {i: ch for i, ch in enumerate(self.chars)}  # map integer indices to characters
            self.vocab_size = len(self.chars)

        elif mode == "sentencepiece":
            self.vocab_size = min(len(self.tokens), 10770)
            spm.SentencePieceTrainer.train(model_prefix='shakespeare', input='Dataset.txt',
                                           vocab_size=10770, unk_id=0, bos_id=1, eos_id=2, pad_id=3)


        elif mode == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            self.vocab_size = self.enc.max_token_value + 1

    def encode(self, text):
        if self.mode == "normal":
            return [self.stoi[s] for s in text]
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.encode(text)
        elif self.mode == "tiktoken":
            return self.enc.encode(text)

    def decode(self, tokens):
        if self.mode == "normal":
            return ''.join([self.itos[t] for t in tokens])
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.decode(tokens)
        elif self.mode == "tiktoken":
            return self.enc.decode(tokens)

    def get_vocab_size(self):
        return self.vocab_size

    def __len__(self):
        return self.vocab_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass

    # ```

In [6]:
with open("Dataset.txt", "r") as file:
    train_text = file.read()

print(train_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [7]:
normal_encoding = CharDataset(train_text, mode="normal")
sent_piece = CharDataset(train_text, mode="sentencepiece")
tiktoken_encoding = CharDataset(train_text, mode="tiktoken")

In [8]:
print(
    f"Normal encoding: Length of sequence = {len(normal_encoding.encode(train_text))}, Vocab size = {normal_encoding.get_vocab_size()}")

print(
    f"SentencePiece encoding: Length of sequence = {len(sent_piece.encode(train_text))}, Vocab size = {sent_piece.get_vocab_size()}")

print(
    f"TikToken encoding: Length of sequence = {len(tiktoken_encoding.encode(train_text))}, Vocab size = {tiktoken_encoding.get_vocab_size()}")

Normal encoding: Length of sequence = 1115394, Vocab size = 65
SentencePiece encoding: Length of sequence = 290364, Vocab size = 10770
TikToken encoding: Length of sequence = 338025, Vocab size = 50257


In [13]:
# data = torch.tensor(normal_encoding.encode(train_text), dtype=torch.long)
# print(data.shape, data.dtype)
# print(data[:1000])

In [14]:
# data2 = torch.tensor(sent_piece.encode(train_text), dtype=torch.long)
# print(data2.shape, data2.dtype)
# print(data2[:1000])

In [15]:
# n = int(0.9 * len(data))

# train_data = data[:n]
# val_data = data[n:]

In [16]:
# context_length = 8

# print(train_data[:context_length])
# print(normal_encoding.decode(train_data[:context_length].tolist()))

In [17]:
# train_data2 = data2[:n]
# val_data2 = data2[n:]

# print(train_data2[:context_length])
# print(sent_piece.decode(train_data2[:context_length].tolist()))

In [18]:
# x = train_data[:context_length]
# y = train_data[1:context_length + 1]

# for context in range(1, context_length):
#     print(f"context = {context}, input = {x[:context].tolist()}, target = {y[context - 1]}")

In [19]:
# xb, yb = get_batch(train_data)
# print("input")
# print(xb.shape)
# print(xb)
# print("target")
# print(yb.shape)
# print(yb)

In [9]:
def get_batch(data, context_length, batch_size, device):
    start_idx = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
    x = torch.stack([data[i: i + context_length] for i in start_idx])
    y = torch.stack([data[i + 1: i + 1 + context_length] for i in start_idx])
    x, y = x.to(device), y.to(device)
    return x, y

In [10]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size, context_length, n_embd, temperature, dropout, bias):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=bias)
        self.query = nn.Linear(n_embd, head_size, bias=bias)
        self.value = nn.Linear(n_embd, head_size, bias=bias)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))
        self.temperature = temperature

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B,T,C)
        q = self.query(x)  # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C ** -0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei / self.temperature, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [11]:
class MultiHead(nn.Module):
    """ a multi-head attention layer """

    def __init__(self, num_head, head_size, context_length, n_embd, temperature, dropout, bias):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size=head_size, context_length=context_length, n_embd=n_embd,
                                         temperature=temperature, dropout=dropout, bias=bias) for _ in range(num_head)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)

        return self.dropout(self.proj(out))

In [12]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [13]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    # num_head, head_size, context_length, n_embd, temperature, dropout, bias):
    def __init__(self, num_head, context_length, n_embd, temperature, dropout, bias):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // num_head
        self.sa = MultiHead(num_head, head_size, context_length, n_embd, temperature, dropout, bias)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


In [14]:
class LayerNorm1d:  # (used to be BatchNorm1d)

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True)  # batch mean
        xvar = x.var(1, keepdim=True)  # batch variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

In [15]:
torch.manual_seed(1337)


class BigramLangModel(nn.Module):
    def __init__(self, vocab_size, num_layer, num_head=8, head_size=16, context_length=8, n_embed=32, temperature=1.0,
                 dropout=0.0,
                 bias=False):
        # print all parameters:
        # print("THIS IS MODEL")
        # print(f"vocab_size = {vocab_size}, num_layer = {num_layer}, num_head = {num_head}, head_size = {head_size}")
        # print(f"context_length = {context_length}, n_embed = {n_embed}, temperature = {temperature}, dropout = {dropout}")
        # print(f"bias = {bias}")
        # print("_____________________")

        super().__init__()
        self.n_embed = n_embed
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position = nn.Embedding(context_length, n_embed)

        # self.self_attention_head = MultiHead(num_head=num_head, head_size=head_size, context_length=context_length,
        #                                      n_embd=n_embed, temperature=temperature, dropout=dropout, bias=bias)
        self.blocks = nn.Sequential(*[Block(num_head=num_head, context_length=context_length, n_embd=n_embed,
                                            temperature=temperature, dropout=dropout, bias=bias) for _ in
                                      range(num_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.feedforward = FeedFoward(n_embed, dropout)
        self.langhead = nn.Linear(n_embed, vocab_size)

    def forward(self, indices, targets=None):
        # T: sequence length (number of tokens) , B: batch size (number of sequences)
        B, T = indices.shape
        tok_embeds = self.token_embedding(indices)  # (B, T, n_embed)
        pos_embeds = self.position(torch.arange(T, device=indices.device))  # (T, n_embed)
        x = tok_embeds + pos_embeds  # (B, T, n_embed)
        # x = self.self_attention_head(x)  # (B, T, n_embed)
        x = self.blocks(x)
        x = self.feedforward(x)  # (B, T, n_embed)
        # logits = self.langhead(self.token_embedding(indices))  # (B, T, vocab_size)
        logits = self.langhead(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

        return logits, loss

    def generate(self, init_token, max_new_tokens, context_length):
        sequence = init_token
        for itr in range(max_new_tokens):
            sequence_cropped = sequence[:, -context_length:]
            logits, loss = self(sequence_cropped)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            # next_token = torch.argmax(probs, dim=-1)
            # next_token = next_token.unsqueeze(1)
            sequence = torch.cat((sequence, next_token), dim=1)
        return sequence

In [16]:
# # hyperparameters

# batch_size = 64
# context_length = 256
# max_iters = 5000
# eval_interval = 500
# learning_rate = 3e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embed = 384
# num_head = 4
# n_layer = 6
# dropout = 0.2
# temperature = 1.0
# epochs = 1

# head_size = n_embed // num_head

In [28]:
# print(f"max_iters: {max_iters}, epochs: {epochs}, steps: {steps}, eval_interval: {eval_interval}")
# print(f"learning_rate: {learning_rate}, device: {device}, eval_iters: {eval_iters}")
# print(f"n_embed: {n_embed}, num_head: {num_head}, num_layer: {n_layer}, dropout: {dropout}")
# print(f"temperature: {temperature}, context_length: {context_length}, vocab_size: {normal_encoding.get_vocab_size()}")
# print(f"head_size: {head_size}, barch_size = {batch_size}, train_rate ={0.9}" )

In [29]:
# model = BigramLangModel(vocab_size=normal_encoding.get_vocab_size(), num_layer=n_layer, n_embed=n_embed,
#                         context_length=context_length,
#                         temperature=temperature, dropout=dropout, num_head=4, head_size=head_size)

# m = model.to(device)
# logits, loss = m(indices=xb, targets=yb)
# print(logits.shape)
# print(loss)

# initial_token = torch.tensor(normal_encoding.encode('\n'), dtype=torch.long, device=device).unsqueeze(0)
# # 0 == new line char

# generated_text = normal_encoding.decode(m.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())

# print(f"Generated Sequence : {generated_text}")

In [30]:
# torch.manual_seed(1337)
# xb2, yb2 = get_batch(train_data2)
# model2 = BigramLangModel(vocab_size=sent_piece.get_vocab_size(), num_layer=n_layer, n_embed=n_embed,
#                          context_length=context_length, temperature=temperature, dropout=dropout, num_head=4,
#                          head_size=head_size)
# m2 = model2.to(device)
# logits, loss = m2(xb2, yb2)
# print(logits.shape)
# print(loss)
# initial_token = torch.tensor(sent_piece.encode('I Love You'), dtype=torch.long, device=device).unsqueeze(0)

# generated_text = sent_piece.decode(m2.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())

# print(f"Generated Sequence : {generated_text}")

In [16]:
@torch.no_grad()
def estimate_loss(model, train_data, val_data, eval_iters, context_length, batch_size, device):
    out = {}
    model.eval()
    for data in [train_data, val_data]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data, context_length, batch_size, device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

        out['train' if data is train_data else 'val'] = losses.mean()
    model.train()
    return out

In [17]:
def train(model, data, val_data, context_length, batch_size, device, max_iters=5000, epochs=10, steps=100,
          eval_iters=200, eval_interval=100, learning_rate=1e-3, wandb_log=True):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for iter in range(max_iters):
            # every once in a while evaluate the loss on train and val sets
            if iter % eval_interval == 0:
                losses = estimate_loss(model, data, val_data, eval_iters, context_length, batch_size, device)
                print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
                total_loss += losses['train']

                if wandb_log:
                    wandb.log({"Iteration": iter, "Train Loss": losses['train'], "Val Loss": losses['val']})

            xb, yb = get_batch(data, context_length, batch_size, device)
            logits, loss = model(xb, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"epoch {epoch}: avg loss: {total_loss * eval_interval / max_iters}")
        print("-" * 50)

        if wandb_log:
            wandb.log({"Epoch": epoch + 1, "Total Loss": total_loss * eval_interval / max_iters})


In [18]:
def generate_text(model, encoding, initial_text: str, max_new_tokens: int, device, context_length: int) -> str:
    initial_token = torch.tensor(encoding.encode(initial_text), dtype=torch.long, device=device).unsqueeze(0)

    generated_text = encoding.decode(
        model.generate(context_length=context_length, init_token=initial_token, max_new_tokens=max_new_tokens)[
            0].tolist())
    return generated_text

In [19]:
def save_model(model, encoding, parameters):
    i = 0
    path = f"./model{i}.pth"
    while os.path.exists(path):
        i += 1
        path = f"./model{i}.pth"

    torch.save(
        dict(
            model=model.state_dict(),
            encoding=encoding,
            parameters=parameters
        ), path)

In [24]:
def train_save(dataset_name: str, encoding: str, parameters: dict, wandb_log=False):
    global s
    with open(dataset_name, "r") as file:
        train_text = file.read()

    encoding_name = encoding
    encoding = CharDataset(train_text, mode=encoding)

    data = torch.tensor(normal_encoding.encode(train_text), dtype=torch.long)

    n = int(parameters["train_rate"] * len(data))
    train_data = data[:n]
    val_data = data[n:]
    xb, yb = get_batch(train_data, context_length=parameters['context_length'], batch_size=parameters['batch_size'],
                       device=parameters["device"])

    model = BigramLangModel(parameters["vocab_size"], parameters["num_layer"], n_embed=parameters["n_embed"],
                            context_length=parameters["context_length"], temperature=parameters["temperature"],
                            dropout=parameters["dropout"], num_head=parameters["num_head"],
                            head_size=parameters["head_size"])

    m = model.to(parameters["device"])

    logits, loss = m(indices=xb, targets=yb)

    if wandb_log:
        wandb.init(
            project="LLM",
            config={
                "learning_rate": parameters["learning_rate"],
                "architecture": "Transformers",
                "dataset": "Shakespeare",
            },

            name=encoding_name
        )

    train(model=m, data=train_data, val_data=val_data, context_length=parameters['context_length'],
          batch_size=parameters['batch_size'], device=parameters["device"], learning_rate=parameters["learning_rate"],
          max_iters=parameters["max_iters"],
          epochs=parameters["epochs"], steps=parameters["steps"], eval_interval=parameters["eval_interval"],
          wandb_log=wandb_log)
    # return m
    generated_text = generate_text(m, encoding, "I love", 100, parameters["device"], parameters['context_length'])

    if wandb_log:
        wandb.log({"Generated Text": generated_text})
        wandb.finish()

    save_model(m, encoding, parameters)

    return m

In [21]:
def load_model(path):
    if os.path.exists(path):
        # Explicitly map the model to the CPU
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        hyperparameters = checkpoint["parameters"]
        # Use the correct number of heads from the saved model
        num_head = hyperparameters["num_head"]
        # Calculate head_size based on num_head
        head_size = hyperparameters["n_embed"] // num_head
        model = BigramLangModel(
            vocab_size=hyperparameters["vocab_size"],
            num_layer=hyperparameters["num_layer"],
            n_embed=hyperparameters["n_embed"],
            context_length=hyperparameters["context_length"],
            temperature=hyperparameters["temperature"],
            dropout=hyperparameters["dropout"],
            num_head=hyperparameters["num_head"],
            head_size=hyperparameters["head_size"]
        )
        model.load_state_dict(checkpoint["model"])
        # Return the model and hyperparameters as separate elements.
        return model, hyperparameters
    else:
        print(f"Path does not exist: {path}")
        return None, None  # Return None for both when file not found.

In [22]:
def load_and_generate(model_path: str, encoding, initial_text: str, max_new_tokens: int):
    if not load_model(model_path):
        print("Model not found")
        return

    model, parameters = load_model(model_path)

    context_length = parameters["context_length"]
    # vocab_size = encoding.get_vocab_size()

    # # Correct the vocab_size in the parameters dictionary:
    # parameters["vocab_size"] = vocab_size

    # # Ensure model is using the correct vocabulary size:
    # model.token_embedding = nn.Embedding(vocab_size, model.n_embed)
    # model.langhead = nn.Linear(model.n_embed, vocab_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = model.to(device)

    generated_text = generate_text(model, encoding, initial_text, max_new_tokens, device, context_length)
    return generated_text

In [24]:
hyperparameters = {
    "batch_size": 64,
    "context_length": 256,
    "max_iters": 5000,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "eval_iters": 200,
    "n_embed": 384,
    "num_head": 4,
    "num_layer": 6,
    "dropout": 0.2,
    "temperature": 1.0,
    "epochs": 1,
    "train_rate": 0.9,
    "vocab_size": normal_encoding.get_vocab_size(),
    "steps": 500,
    "bias": False
}

hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [39]:
m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=hyperparameters, wandb_log=True)

[34m[1mwandb[0m: Currently logged in as: [33mariamosavefar[0m ([33mariamosavefar-universit-de-gen-ve[0m). Use [1m`wandb login --relogin`[0m to force relogin


step 0: train loss 4.2228, val loss 4.2235
step 500: train loss 1.8823, val loss 1.9928
step 1000: train loss 1.5255, val loss 1.7086
step 1500: train loss 1.3978, val loss 1.6077
step 2000: train loss 1.3216, val loss 1.5537
step 2500: train loss 1.2686, val loss 1.5147
step 3000: train loss 1.2196, val loss 1.5030
step 3500: train loss 1.1805, val loss 1.4883
step 4000: train loss 1.1417, val loss 1.4980
step 4500: train loss 1.1076, val loss 1.4947
epoch 0: avg loss: 1.6267995834350586
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁▂▃▃▄▅▆▆▇█
Total Loss,▁
Train Loss,█▃▂▂▁▁▁▁▁▁
Val Loss,█▂▂▁▁▁▁▁▁▁

0,1
Epoch,1
Generated Text,I love yourself them...
Iteration,4500
Total Loss,1.6268
Train Loss,1.10765
Val Loss,1.49474


In [40]:
generated_text = generate_text(m, normal_encoding, "I love", 100, hyperparameters["device"],
                               hyperparameters['context_length'])
generated_text

"I love Harry's life; he was never.\n\nLADY CAPULET:\nWitnom, thene on the bitted time-first of peace?\n\nNurse:"

In [41]:
load_and_generate(model_path="./model0.pth", encoding=normal_encoding, initial_text="I hate", max_new_tokens=100)

  checkpoint = torch.load(path, map_location=torch.device('cpu'))


'I hate, hour! down them! Come, my ladom!\n\nMERCUTIO:\nReady not amuse.\n\nHORTENSIA:\nNot the silent read Barna'

In [23]:
hyperparameters = {
    "batch_size": 8,
    "context_length": 256,
    "max_iters": 10,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "eval_iters": 200,
    "n_embed": 384,
    "num_head": 4,
    "num_layer": 6,
    "dropout": 0.2,
    "temperature": 1.0,
    "epochs": 1,
    "train_rate": 0.9,
    "vocab_size": sent_piece.get_vocab_size(),
    "steps": 500,
    "bias": False
}

hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [26]:
m2 = train_save(dataset_name="Dataset.txt", encoding="sentencepiece", parameters=hyperparameters, wandb_log=False)

step 0: train loss 9.2610, val loss 9.2661
epoch 0: avg loss: 463.05029296875
--------------------------------------------------


In [44]:
m2 = train_save(dataset_name="Dataset.txt", encoding="sentencepiece", parameters=hyperparameters, wandb_log=True)

step 0: train loss 4.1760, val loss 4.1788
step 500: train loss 1.8488, val loss 1.9655
step 1000: train loss 1.5113, val loss 1.6999
step 1500: train loss 1.3847, val loss 1.5913
step 2000: train loss 1.3139, val loss 1.5444
step 2500: train loss 1.2575, val loss 1.5163
step 3000: train loss 1.2087, val loss 1.4950
step 3500: train loss 1.1668, val loss 1.4887
step 4000: train loss 1.1312, val loss 1.4730
step 4500: train loss 1.0902, val loss 1.4947
epoch 0: avg loss: 1.6089099645614624
--------------------------------------------------


In [35]:
load_and_generate(model_path="./model_sent1.pth", encoding=sent_piece, initial_text="I Love", max_new_tokens=100)

  checkpoint = torch.load(path, map_location=torch.device('cpu'))


'I Love showersterate lambs unlawful gotten powder Meneazed stealtheldomills spread pratingded part cormorant Polixenesded accou sootheurel cause thousands shrill hangmen easesolation honeyIalleys glad cupbearer babe fools Florizelels wean sceptre innocentcame whet warrants cedar profanation eyeb arbitrateign fondly patron gulf appoint companions without inconstantfords familiar broke wayontempt Sebastian aidtwoulddoubledliar ali instructions Showif concludes ben pitchpolitmplespregnant disease chief hair habiliments mast thempt wrinkled proclaim eaten Cry Wr pursue dishonest kissBRpillars argument threat supposesnablehereby interrupteldomys steal'