# Aria Mousavifar

In [2]:
from torch.utils.data import Dataset
import os
import torch
import tiktoken
import sentencepiece as spm
import subprocess
import wandb
import nltk

import torch.nn as nn
import torch.nn.functional as F

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# !pip freeze > requirements.txt

In [4]:
# # Login to Weights & Biases for experiment tracking
wandb.login(key="2b242cad61896bc77d8053286a9c3e79f01c9127")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
def check_requirements() -> bool:
    """
    Check and install project requirements from requirements.txt.

    Returns:
        bool: True if requirements are successfully installed, False otherwise
    """
    try:
        if not os.path.exists("requirements.txt"):
            raise FileNotFoundError("requirements.txt not found")

        # Attempt to install requirements
        result = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=True,  # Raise an exception if the command fails
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)
        return True

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return False

    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e.stderr}")
        return False

In [6]:
# check_requirements()

In [7]:
# Download dataset if not exists
if not os.path.exists("Dataset.txt"):
    os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    os.rename("input.txt", 'Dataset.txt')

In [8]:
class CharDataset(Dataset):
    """
    Custom Dataset class for character-level and token-level encoding.
    Supports three encoding modes: normal, sentencepiece, and tiktoken.
    """

    def __init__(self, data: str, mode: str = "normal"):
        # Extract unique tokens and initialize encoding mode
        self.tokens = set(nltk.word_tokenize(data))
        self.mode = mode

        # Normal encoding: Character-based tokenization
        if mode == "normal":

            self.chars = sorted(set(train_text))

            self.stoi = {ch: i for i, ch in enumerate(self.chars)}
            self.itos = {i: ch for i, ch in enumerate(self.chars)}
            self.vocab_size = len(self.chars)

        # SentencePiece encoding: Subword-based tokenization
        elif mode == "sentencepiece":
            self.vocab_size = min(len(self.tokens), 10770)
            spm.SentencePieceTrainer.train(model_prefix='shakespeare', input='Dataset.txt',
                                           vocab_size=10770, unk_id=0, bos_id=1, eos_id=2, pad_id=3)

        # Tiktoken encoding: GPT-2 tokenization
        elif mode == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            self.vocab_size = self.enc.max_token_value + 1

    def encode(self, text):
        """
        Encode text into token IDs based on the selected mode.
        """
        if self.mode == "normal":
            return [self.stoi[s] for s in text]

        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.encode(text)

        elif self.mode == "tiktoken":
            return self.enc.encode(text)

    def decode(self, tokens):
        """
        Decode token IDs back into text based on the selected mode.
        """
        if self.mode == "normal":
            return ''.join([self.itos[t] for t in tokens])

        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.decode(tokens)

        elif self.mode == "tiktoken":
            return self.enc.decode(tokens)

    def get_vocab_size(self):
        """
        Return the vocabulary size of the dataset.
        """
        return self.vocab_size

    def __len__(self):
        """
        Return the size of the dataset.
        """
        return self.vocab_size

In [9]:
# Read dataset text
with open("Dataset.txt", "r") as file:
    train_text = file.read()

print(train_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [10]:
# Create datasets for different encoding modes
normal_encoding = CharDataset(train_text, mode="normal")
sent_piece = CharDataset(train_text, mode="sentencepiece")
tiktoken_encoding = CharDataset(train_text, mode="tiktoken")

In [11]:
print(
    f"Normal encoding: Length of sequence = {len(normal_encoding.encode(train_text))}, Vocab size = {normal_encoding.get_vocab_size()}")

print(
    f"SentencePiece encoding: Length of sequence = {len(sent_piece.encode(train_text))}, Vocab size = {sent_piece.get_vocab_size()}")

print(
    f"TikToken encoding: Length of sequence = {len(tiktoken_encoding.encode(train_text))}, Vocab size = {tiktoken_encoding.get_vocab_size()}")

Normal encoding: Length of sequence = 1115394, Vocab size = 65
SentencePiece encoding: Length of sequence = 290364, Vocab size = 10770
TikToken encoding: Length of sequence = 338025, Vocab size = 50257


In [12]:
def get_batch(data, context_length, batch_size, device):
    start_idx = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
    x = torch.stack([data[i: i + context_length] for i in start_idx])
    y = torch.stack([data[i + 1: i + 1 + context_length] for i in start_idx])
    x, y = x.to(device), y.to(device)
    return x, y

In [13]:
# Self-attention head definition
class Head(nn.Module):
    """
    One head of self-attention
    """

    def __init__(self, head_size, context_length, n_embd, temperature, dropout, bias):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=bias)
        self.query = nn.Linear(n_embd, head_size, bias=bias)
        self.value = nn.Linear(n_embd, head_size, bias=bias)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))  # Lower triangular matrix
        self.temperature = temperature

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass through a single self-attention head.
        """
        B, T, C = x.shape

        k = self.key(x)  # (B,T,C)
        q = self.query(x)  # (B,T,C)

        # Attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C ** -0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)

        # Masking upper triangle
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)

        # Softmax attention weights
        wei = F.softmax(wei / self.temperature, dim=-1)  # (B, T, T)

        wei = self.dropout(wei)

        # Weighted sum of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)

        return out

In [14]:
class MultiHead(nn.Module):
    def __init__(self, num_head, head_size, context_length, n_embd, temperature, dropout, bias):
        super().__init__()
        # Create heads
        heads = [Head(head_size=head_size, context_length=context_length, n_embd=n_embd,
                      temperature=temperature, dropout=dropout, bias=bias) for _ in range(num_head)]

        # Wrap heads in DataParallel if multiple GPUs are available
        self.heads = nn.DataParallel(nn.ModuleList(heads)) if torch.cuda.device_count() > 1 else nn.ModuleList(heads)

        # Final projection
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Parallel processing of heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

In [15]:
class FeedFoward(nn.Module):
    """
    A simple linear layer followed by a non-linearity
    """

    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [16]:
# Transformer block: Self-attention + Feed-forward network
class Block(nn.Module):
    """
    Transformer block: communication followed by computation
    """

    def __init__(self, num_head, context_length, n_embd, temperature, dropout, bias):
        super().__init__()
        head_size = n_embd // num_head
        self.sa = MultiHead(num_head, head_size, context_length, n_embd, temperature, dropout, bias)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """
        Forward pass through the transformer block.
        """

        # Add residual connection
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))

        return x


In [17]:
class LayerNorm1d:  # (used to be BatchNorm1d)
    """
    Implements Layer Normalization for 1D inputs.

    Unlike Batch Normalization, Layer Normalization normalizes across features for each input independently.
    This can be more effective for sequence data or when batch size is small.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        """
        Initialize the LayerNorm1d instance.
        """
        self.eps = eps  # Learnable parameters for scaling and shifting the normalized data
        self.gamma = torch.ones(dim)  # Scaling parameter
        self.beta = torch.zeros(dim)  # Scaling parameter

    def __call__(self, x):
        """
        Forward pass for LayerNorm1d.
        """

        # Calculate mean and variance along the feature dimension
        xmean = x.mean(1, keepdim=True)  # batch mean
        xvar = x.var(1, keepdim=True)  # batch variance

        # Normalize the input to have zero mean and unit variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)

        # Scale and shift using learnable parameters
        self.out = self.gamma * xhat + self.beta

        return self.out

    def parameters(self):
        """
        Return the list of learnable parameters.

        Returns:
            list: [gamma, beta]
        """

        return [self.gamma, self.beta]

In [18]:
torch.manual_seed(1337)


class BigramLangModel(nn.Module):
    """
    A Bigram Language Model with support for self-attention, feedforward layers, and token generation.
    """

    def __init__(self, vocab_size, num_layer, num_head=8, head_size=16, context_length=8, n_embed=32, temperature=1.0,
                 dropout=0.0,
                 bias=False):
        super().__init__()
        self.n_embed = n_embed

        # Token and positional embeddings
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position = nn.Embedding(context_length, n_embed)

        # Transformer blocks
        self.blocks = nn.Sequential(*[Block(num_head=num_head, context_length=context_length, n_embd=n_embed,
                                            temperature=temperature, dropout=dropout, bias=bias) for _ in
                                      range(num_layer)])

        # Layer normalization and feedforward layers
        self.ln_f = nn.LayerNorm(n_embed)
        self.feedforward = FeedFoward(n_embed, dropout)

        # Output head for generating logits
        self.langhead = nn.Linear(n_embed, vocab_size)

    def forward(self, indices, targets=None):
        """
        Forward pass for the model.
        """

        # T: sequence length (number of tokens) , B: batch size (number of sequences)
        B, T = indices.shape

        # Token and positional embeddings
        tok_embeds = self.token_embedding(indices)  # (B, T, n_embed)
        pos_embeds = self.position(torch.arange(T, device=indices.device))  # (T, n_embed)
        x = tok_embeds + pos_embeds  # (B, T, n_embed)

        # Transformer blocks and feedforward
        x = self.blocks(x)
        x = self.feedforward(x)  # (B, T, n_embed)

        # Output logits
        logits = self.langhead(x)  # (B, T, vocab_size)

        # Compute loss if targets are provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

        return logits, loss

    def generate(self, init_token, max_new_tokens, context_length):
        """
        Generate new tokens autoregressively.
        """

        sequence = init_token  # Initial sequence
        for itr in range(max_new_tokens):
            # Crop context to fit within context_length
            sequence_cropped = sequence[:, -context_length:]

            # Forward pass to compute logits
            logits, loss = self(sequence_cropped)

            # Focus on the last token in the sequence
            logits = logits[:, -1, :]

            # Convert logits to probabilities using softmax
            probs = F.softmax(logits, dim=-1)

            # Sample the next token from the probability distribution
            next_token = torch.multinomial(probs, num_samples=1)

            # Append the generated token to the sequence
            sequence = torch.cat((sequence, next_token), dim=1)

        return sequence

In [19]:
@torch.no_grad()
def estimate_loss(model, train_data, val_data, eval_iters, context_length, batch_size, device):
    """
    Estimate the training and validation loss of a model over a specified number of evaluation iterations.
    """

    out = {}
    model.eval()  # Set the model to evaluation mode

    for data in [train_data, val_data]:
        # Initialize a tensor to store losses for each evaluation iteration
        losses = torch.zeros(eval_iters)

        # Loop through the specified number of evaluation iterations
        for k in range(eval_iters):
            # Sample a batch of input-target pairs
            X, Y = get_batch(data, context_length, batch_size, device)

            # Forward pass to calculate loss
            logits, loss = model(X, Y)

            # Store the loss value
            losses[k] = loss.item()

        # Compute the average loss for the dataset (train or validation)
        out['train' if data is train_data else 'val'] = losses.mean()

    model.train()  # Reset the model to training mode

    return out

In [20]:
def train(model, data, val_data, context_length, batch_size, device, max_iters=5000, epochs=10, steps=100,
          eval_iters=200, eval_interval=100, learning_rate=1e-3, wandb_log=True):
    """
    Trains a language model on the given data.
    """

    # Initialize the Adam optimizer with the model parameters and specified learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        total_loss = 0

        for iter in range(max_iters):
            # Evaluate model performance on train and validation sets at specified intervals
            if iter % eval_interval == 0:
                losses = estimate_loss(model, data, val_data, eval_iters, context_length, batch_size, device)
                print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
                total_loss += losses['train']  # Add training loss for averaging

                if wandb_log:
                    # Log training and validation losses to Weights & Biases
                    wandb.log({"Iteration": iter, "Train Loss": losses['train'], "Val Loss": losses['val']})

            # Get a batch of training data
            xb, yb = get_batch(data, context_length, batch_size, device)

            # Perform a forward pass and compute loss
            logits, loss = model(xb, yb)

            # Zero out gradients from the previous step, backpropagate, and update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"epoch {epoch}: avg loss: {total_loss * eval_interval / max_iters}")
        print("-" * 50)

        if wandb_log:
            # Log the total loss for the epoch to Weights & Biases
            wandb.log({"Epoch": epoch + 1, "Total Loss": total_loss * eval_interval / max_iters})


In [53]:
def generate_text(model, encoding, initial_text: str, max_new_tokens: int, device, context_length: int) -> str:
    """
    Generates text from a given initial input using a trained language model.
    """

    # Step 1: Encode the initial input text into token IDs using the encoding object
    initial_token = torch.tensor(encoding.encode(initial_text), dtype=torch.long, device=device).unsqueeze(0)
    # `unsqueeze(0)` adds a batch dimension, turning the tensor into (1, T) where T is the token sequence length.

    # Step 2: Generate new tokens using the model
    generated_text = encoding.decode(
        model.generate(context_length=context_length, init_token=initial_token, max_new_tokens=max_new_tokens)[

            0].tolist())  # We only need the first sequence in the batch, hence the [0] and convert it to a list.

    generated_text.replace("\n", " \n")

    generated_text.replace("⁇", " \n ")

    return generated_text

In [22]:
def save_model(model, encoding, parameters):
    """
   Saves the trained model, encoding object, and additional parameters to a file.
   """
    i = 0
    path = f"./model{i}.pth"
    # Check if the path already exists. If it does, increment 'i' to create a unique file name.
    while os.path.exists(path):
        i += 1
        path = f"./model{i}.pth"

    # Step 2: Save the model state, encoding object, and parameters to the determined file path
    torch.save(
        dict(
            model=model.state_dict(),
            encoding=encoding,
            parameters=parameters
        ), path)

In [23]:
def train_save(dataset_name: str, encoding: str, parameters: dict, wandb_log=False):
    """
     Trains a Bigram Language Model on a given dataset and saves the model. Optionally logs training progress to Weights & Biases.
     """

    # Load the training data from the dataset file
    with open(dataset_name, "r") as file:
        train_text = file.read()

    # Initialize encoding (used for tokenizing the text)
    encoding_name = encoding
    encoding = CharDataset(train_text, mode=encoding)

    # Encode the text into numeric format
    data = torch.tensor(normal_encoding.encode(train_text), dtype=torch.long)

    # Split the data into training and validation sets
    n = int(parameters["train_rate"] * len(data))  # Use a fraction of the data for training
    train_data = data[:n]
    val_data = data[n:]

    # Prepare a batch of data for training
    xb, yb = get_batch(train_data, context_length=parameters['context_length'], batch_size=parameters['batch_size'],
                       device=parameters["device"])

    # Initialize the model
    model = BigramLangModel(
        parameters["vocab_size"],
        parameters["num_layer"],
        n_embed=parameters["n_embed"],
        context_length=parameters["context_length"],
        temperature=parameters["temperature"],
        dropout=parameters["dropout"],
        num_head=parameters["num_head"],
        head_size=parameters["head_size"])

    # Move the model to the specified device (CPU or GPU)
    m = model.to(parameters["device"])

    # Perform a forward pass and calculate the loss (for debugging/checking model initialization)
    logits, loss = m(indices=xb, targets=yb)

    # Initialize Weights & Biases logging if enabled
    if wandb_log:
        wandb.init(
            project="LLM",
            config={
                "learning_rate": parameters["learning_rate"],
                "architecture": "Transformers",
                "dataset": "Shakespeare",
            },

            name=encoding_name
        )
        wandb.log(parameters)

    # Train the model
    train(
        model=m,
        data=train_data,
        val_data=val_data,
        context_length=parameters['context_length'],
        batch_size=parameters['batch_size'],
        device=parameters["device"],
        learning_rate=parameters["learning_rate"],
        max_iters=parameters["max_iters"],
        epochs=parameters["epochs"],
        steps=parameters["steps"],
        eval_interval=parameters["eval_interval"],
        wandb_log=wandb_log)

    # Generate some text after training
    generated_text = generate_text(m, encoding, "I love", 100, parameters["device"], parameters['context_length'])

    # og generated text to Weights & Biases if enabled
    if wandb_log:
        wandb.log({"Generated Text": generated_text})
        wandb.finish()

    # Save the model, encoding, and parameters
    save_model(m, encoding, parameters)

    return m

In [24]:
def load_model(path):
    """
    Loads a pre-trained model from the specified checkpoint file and returns the model along with the hyperparameters.
    """

    if os.path.exists(path):
        # Load the file, mapping the model to the CPU
        loaded_file = torch.load(path, map_location=torch.device('cpu'))

        # Extract the hyperparameters from the file
        hyperparameters = loaded_file["parameters"]

        # Calculate head_size based on the number of heads and embedding size
        num_head = hyperparameters["num_head"]
        head_size = hyperparameters["n_embed"] // num_head

        model = BigramLangModel(
            vocab_size=hyperparameters["vocab_size"],
            num_layer=hyperparameters["num_layer"],
            n_embed=hyperparameters["n_embed"],
            context_length=hyperparameters["context_length"],
            temperature=hyperparameters["temperature"],
            dropout=hyperparameters["dropout"],
            num_head=hyperparameters["num_head"],
            head_size=hyperparameters["head_size"]
        )

        # Load the saved model state dict into the model
        model.load_state_dict(loaded_file["model"])

        return model, hyperparameters

    else:
        # If the checkpoint file does not exist,
        print(f"Path does not exist: {path}")
        return None, None

In [50]:
def load_and_generate(model_path: str, encoding, initial_text: str, max_new_tokens: int):
    """
    Load a pre-trained model from the specified checkpoint file and generate text using the model.
    """

    # Load the model and hyperparameters
    if not load_model(model_path):
        print("Model not found")
        return

    # Load the model and hyperparameters
    model, parameters = load_model(model_path)

    # Ensure the model is using the correct context length
    context_length = parameters["context_length"]
    # vocab_size = encoding.get_vocab_size()

    # # Correct the vocab_size in the parameters dictionary:
    # parameters["vocab_size"] = vocab_size

    # # Ensure model is using the correct vocabulary size:
    # model.token_embedding = nn.Embedding(vocab_size, model.n_embed)
    # model.langhead = nn.Linear(model.n_embed, vocab_size)

    # Move the model to the appropriate device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Generate text using the loaded model
    generated_text = generate_text(model, encoding, initial_text, max_new_tokens, device, context_length)

    return generated_text

In [26]:
hyperparameters = {
    "batch_size": 64,  # Batch size for training
    "context_length": 256,  # Number of tokens in the context for language modeling
    "max_iters": 5000,  # Maximum number of iterations to train the model
    "eval_interval": 500,  # Interval between evaluations during training
    "learning_rate": 3e-4,  # Learning rate for the optimizer
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',  # Whether to use GPU or CPU
    "eval_iters": 200,  # Number of iterations for evaluation per eval interval
    "n_embed": 384,  # Size of the token embedding vector
    "num_head": 4,  # Number of attention heads in the multi-head attention mechanism
    "num_layer": 6,  # Number of transformer layers (blocks)
    "dropout": 0.2,  # Dropout rate for regularization
    "temperature": 1.0,  # Temperature scaling for softmax during generation (controls randomness)
    "epochs": 1,  # Number of training epochs
    "train_rate": 0.9,  # Fraction of data to be used for training (remaining goes for validation)
    "vocab_size": normal_encoding.get_vocab_size(),  # Size of the vocabulary, retrieved from encoding
    "steps": 500,  # Number of steps for each training iteration
    "bias": False  # Whether to use bias in the linear layers
}

# Calculate the head size based on the number of heads and embedding size
hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [27]:
m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=hyperparameters, wandb_log=True)

[34m[1mwandb[0m: Currently logged in as: [33mariamosavefar[0m ([33mariamosavefar-universit-de-gen-ve[0m). Use [1m`wandb login --relogin`[0m to force relogin


step 0: train loss 4.2228, val loss 4.2235
step 500: train loss 1.8823, val loss 1.9928
step 1000: train loss 1.5255, val loss 1.7086
step 1500: train loss 1.3978, val loss 1.6077
step 2000: train loss 1.3216, val loss 1.5537
step 2500: train loss 1.2686, val loss 1.5147
step 3000: train loss 1.2196, val loss 1.5030
step 3500: train loss 1.1805, val loss 1.4883
step 4000: train loss 1.1417, val loss 1.4980
step 4500: train loss 1.1076, val loss 1.4947
epoch 0: avg loss: 1.6267995834350586
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁▂▃▃▄▅▆▆▇█
Total Loss,▁
Train Loss,█▃▂▂▁▁▁▁▁▁
Val Loss,█▂▂▁▁▁▁▁▁▁

0,1
Epoch,1
Generated Text,I love yourself them...
Iteration,4500
Total Loss,1.6268
Train Loss,1.10765
Val Loss,1.49474


In [35]:
generated_text = generate_text(m, normal_encoding, "I love", 100, hyperparameters["device"],
                               hyperparameters['context_length'])
generated_text

'I love my way sort than you were was near\nto be achieved as our answer conner\nFrom our bodies nobly.\n\nCAMI'

In [38]:
hyperparameters = {
    "batch_size": 128,
    "context_length": 256,
    "max_iters": 5000,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "eval_iters": 200,
    "n_embed": 384,
    "num_head": 4,
    "num_layer": 6,
    "dropout": 0.2,
    "temperature": 1.0,
    "epochs": 1,
    "train_rate": 0.9,
    "vocab_size": sent_piece.get_vocab_size(),
    "steps": 500,
    "bias": False
}

hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [39]:
m2 = train_save(dataset_name="Dataset.txt", encoding="sentencepiece", parameters=hyperparameters, wandb_log=True)

step 0: train loss 9.2600, val loss 9.2597
step 500: train loss 1.9208, val loss 2.0134
step 1000: train loss 1.5322, val loss 1.7147
step 1500: train loss 1.3860, val loss 1.5924
step 2000: train loss 1.2971, val loss 1.5370
step 2500: train loss 1.2347, val loss 1.5010
step 3000: train loss 1.1765, val loss 1.4785
step 3500: train loss 1.1322, val loss 1.4786
step 4000: train loss 1.0803, val loss 1.4818
step 4500: train loss 1.0337, val loss 1.4939
epoch 0: avg loss: 2.1053457260131836
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁▂▃▃▄▅▆▆▇█
Total Loss,▁
Train Loss,█▂▁▁▁▁▁▁▁▁
Val Loss,█▁▁▁▁▁▁▁▁▁

0,1
Epoch,1
Generated Text,I love! is? your to ...
Iteration,4500
Total Loss,2.10535
Train Loss,1.03374
Val Loss,1.49389


In [54]:
load_and_generate(model_path="./model2.pth", encoding=sent_piece, initial_text="I Love", max_new_tokens=100)

  loaded_file = torch.load(path, map_location=torch.device('cpu'))


"I Love my ⁇  ⁇  your To thy To- are To- we we thy- thee by but-OO what by our- do what by our but- thee To alling. To- are To will by ' thys ⁇  ⁇  you; for my your of him to ⁇  not- will-ing. but by thee thyIO thy thee what- ' will To whating thyO thy do do no all are"

In [37]:
hyperparameters = {
    "batch_size": 64,
    "context_length": 256,
    "max_iters": 5000,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "eval_iters": 200,
    "n_embed": 384,
    "num_head": 4,
    "num_layer": 6,
    "dropout": 0.2,
    "temperature": 1.0,
    "epochs": 1,
    "train_rate": 0.9,
    "vocab_size": tiktoken_encoding.get_vocab_size(),
    "steps": 500,
    "bias": False
}

hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [31]:
m3 = train_save(dataset_name="Dataset.txt", encoding="tiktoken", parameters=hyperparameters, wandb_log=True)

step 0: train loss 10.8308, val loss 10.8309
step 500: train loss 2.0079, val loss 2.0825
step 1000: train loss 1.5936, val loss 1.7549
step 1500: train loss 1.4414, val loss 1.6365
step 2000: train loss 1.3539, val loss 1.5620
step 2500: train loss 1.2885, val loss 1.5210
step 3000: train loss 1.2377, val loss 1.5003
step 3500: train loss 1.1958, val loss 1.4908
step 4000: train loss 1.1568, val loss 1.4703
step 4500: train loss 1.1209, val loss 1.4684
epoch 0: avg loss: 2.322713851928711
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁▂▃▃▄▅▆▆▇█
Total Loss,▁
Train Loss,█▂▁▁▁▁▁▁▁▁
Val Loss,█▁▁▁▁▁▁▁▁▁

0,1
Epoch,1
Generated Text,"I loveH]L'""HUK""I`![O..."
Iteration,4500
Total Loss,2.32271
Train Loss,1.12088
Val Loss,1.46842


In [38]:
generated_text = generate_text(m3, tiktoken_encoding, "I love", 100, hyperparameters["device"],
                               hyperparameters['context_length'])

generated_text

'I loveSL";LNLY+!3YVT"JVYY\\W[PVU"Z^HSSV^Z"HUK"T\\YKLY"[OPZ!UPULZ[\'"IYLHR"OLY"NVPUN"[OHU"HUK"PU"HSS\'"[OL"YHJR'

# Testing for different parameters

## Testing for different batches

In [39]:
hyperparameters = {
    "batch_size": 64,
    "context_length": 256,
    "max_iters": 100,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "eval_iters": 200,
    "n_embed": 384,
    "num_head": 4,
    "num_layer": 6,
    "dropout": 0.2,
    "temperature": 1.0,
    "epochs": 1,
    "train_rate": 0.9,
    "vocab_size": normal_encoding.get_vocab_size(),
    "steps": 500,
    "bias": False
}
hyperparameters["head_size"] = hyperparameters["n_embed"] // hyperparameters["num_head"]

In [41]:
parameters = hyperparameters.copy()

for batch_size in range(8, 128, 8):
    parameters["batch_size"] = batch_size
    m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=parameters, wandb_log=True)

generated_text = generate_text(m, normal_encoding, "I love", 100, parameters["device"],
                               parameters['context_length'])

generated_text

step 0: train loss 4.2207, val loss 4.2232
epoch 0: avg loss: 21.10334014892578
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovethlis; Galoris...
Iteration,0
Total Loss,21.10334
Train Loss,4.22067
Val Loss,4.22319
batch_size,8
bias,False
context_length,256
device,cuda


step 0: train loss 4.1844, val loss 4.1866
epoch 0: avg loss: 20.922191619873047
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,"I love, asThiss er w..."
Iteration,0
Total Loss,20.92219
Train Loss,4.18444
Val Loss,4.1866
batch_size,16
bias,False
context_length,256
device,cuda


step 0: train loss 4.2291, val loss 4.2283
epoch 0: avg loss: 21.145376205444336
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lover ple!aicees f...
Iteration,0
Total Loss,21.14538
Train Loss,4.22907
Val Loss,4.22826
batch_size,24
bias,False
context_length,256
device,cuda


step 0: train loss 4.1718, val loss 4.1714
epoch 0: avg loss: 20.858999252319336
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovechak STh hy ot...
Iteration,0
Total Loss,20.859
Train Loss,4.1718
Val Loss,4.17135
batch_size,32
bias,False
context_length,256
device,cuda


step 0: train loss 4.2542, val loss 4.2509
epoch 0: avg loss: 21.270767211914062
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,"I love, fa fan s fto..."
Iteration,0
Total Loss,21.27077
Train Loss,4.25415
Val Loss,4.25095
batch_size,40
bias,False
context_length,256
device,cuda


step 0: train loss 4.1948, val loss 4.1937
epoch 0: avg loss: 20.97412109375
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,"I loved rer, Byor br..."
Iteration,0
Total Loss,20.97412
Train Loss,4.19482
Val Loss,4.19374
batch_size,48
bias,False
context_length,256
device,cuda


step 0: train loss 4.1671, val loss 4.1685
epoch 0: avg loss: 20.835275650024414
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovell adiri s ves...
Iteration,0
Total Loss,20.83528
Train Loss,4.16706
Val Loss,4.16845
batch_size,56
bias,False
context_length,256
device,cuda


step 0: train loss 4.1968, val loss 4.1959
epoch 0: avg loss: 20.98401641845703
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love othe PUU Wug ...
Iteration,0
Total Loss,20.98402
Train Loss,4.1968
Val Loss,4.19591
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.1882, val loss 4.1875
epoch 0: avg loss: 20.940937042236328
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovet. FOfonth; ...
Iteration,0
Total Loss,20.94094
Train Loss,4.18819
Val Loss,4.18754
batch_size,72
bias,False
context_length,256
device,cuda


step 0: train loss 4.2108, val loss 4.2064
epoch 0: avg loss: 21.05377960205078
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I loveanee cel f ll ...
Iteration,0
Total Loss,21.05378
Train Loss,4.21076
Val Loss,4.20645
batch_size,80
bias,False
context_length,256
device,cuda


step 0: train loss 4.1507, val loss 4.1519
epoch 0: avg loss: 20.753629684448242
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovephys brat Sis ...
Iteration,0
Total Loss,20.75363
Train Loss,4.15073
Val Loss,4.15188
batch_size,88
bias,False
context_length,256
device,cuda


step 0: train loss 4.1989, val loss 4.2026
epoch 0: avg loss: 20.994491577148438
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love peow y ou fan...
Iteration,0
Total Loss,20.99449
Train Loss,4.1989
Val Loss,4.20258
batch_size,96
bias,False
context_length,256
device,cuda


step 0: train loss 4.2032, val loss 4.2011
epoch 0: avg loss: 21.015939712524414
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love! henece hadso...
Iteration,0
Total Loss,21.01594
Train Loss,4.20319
Val Loss,4.20111
batch_size,104
bias,False
context_length,256
device,cuda


step 0: train loss 4.1885, val loss 4.1884
epoch 0: avg loss: 20.942453384399414
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love thio th h be-...
Iteration,0
Total Loss,20.94245
Train Loss,4.18849
Val Loss,4.18841
batch_size,112
bias,False
context_length,256
device,cuda


step 0: train loss 4.1911, val loss 4.1919
epoch 0: avg loss: 20.955425262451172
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I loven therga; Yon ...
Iteration,0
Total Loss,20.95543
Train Loss,4.19108
Val Loss,4.19185
batch_size,120
bias,False
context_length,256
device,cuda


'I love wiODe t pis o s whebr wice thyoreellt, t I ars,\nAnesay pheno tus sthoreresd:-\nMps h! WI fcandbefano'

## Testing for different learning rates

In [44]:
parameters = hyperparameters.copy()

for learning_rate in [1e-3, 1e-4, 1e-5]:
    parameters["learning_rate"] = learning_rate
    parameters["head_size"] = parameters["n_embed"] // parameters["num_head"]
    m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=parameters, wandb_log=True)

generated_text = generate_text(m, normal_encoding, "I love", 100, parameters["device"],
                               parameters['context_length'])

generated_text

step 0: train loss 4.1840, val loss 4.1841
epoch 0: avg loss: 20.919912338256836
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,"I lovewit Mor wenng,..."
Iteration,0
Total Loss,20.91991
Train Loss,4.18398
Val Loss,4.18409
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.1480, val loss 4.1520
epoch 0: avg loss: 20.73981475830078
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovenomel pr owf H...
Iteration,0
Total Loss,20.73981
Train Loss,4.14796
Val Loss,4.15204
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.1845, val loss 4.1853
epoch 0: avg loss: 20.922496795654297
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovet eninain woae...
Iteration,0
Total Loss,20.9225
Train Loss,4.1845
Val Loss,4.18532
batch_size,64
bias,False
context_length,256
device,cuda


'I love jc berk woineAaraundref? im Aay c n a$b\ng tshess Ookbg h und si mylrer dth3toomnd nowriouutht ?d\n o'

## Testing for different number of heads

In [45]:
parameters = hyperparameters.copy()

for num_head in range(2, 10, 2):
    parameters["num_head"] = num_head
    parameters["head_size"] = parameters["n_embed"] // parameters["num_head"]
    m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=parameters, wandb_log=True)

generated_text = generate_text(m, normal_encoding, "I love", 100, parameters["device"],
                               parameters['context_length'])

generated_text

step 0: train loss 4.1661, val loss 4.1662
epoch 0: avg loss: 20.830724716186523
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love irthow? I mee...
Iteration,0
Total Loss,20.83072
Train Loss,4.16614
Val Loss,4.16624
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.2128, val loss 4.2102
epoch 0: avg loss: 21.063806533813477
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I loven Gthithea'trx...
Iteration,0
Total Loss,21.06381
Train Loss,4.21276
Val Loss,4.21016
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.1615, val loss 4.1633
epoch 0: avg loss: 20.807466506958008
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love shese mes; An...
Iteration,0
Total Loss,20.80747
Train Loss,4.16149
Val Loss,4.16332
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.1585, val loss 4.1627
epoch 0: avg loss: 20.79261016845703
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I love CAcaksond ato...
Iteration,0
Total Loss,20.79261
Train Loss,4.15852
Val Loss,4.16271
batch_size,64
bias,False
context_length,256
device,cuda


"I loveth foren t manentoue se\nPENGoat ooue your rven f t of furora thistherereshrsn'd or ath,\nLA top insth"

## Testing for different number of embedings

In [46]:
parameters = hyperparameters.copy()

for n_embed in range(128, 768, 128):
    parameters["n_embed"] = n_embed
    parameters["head_size"] = parameters["n_embed"] // parameters["num_head"]
    m = train_save(dataset_name="Dataset.txt", encoding="normal", parameters=parameters, wandb_log=True)

generated_text = generate_text(m, normal_encoding, "I love", 100, parameters["device"],
                               parameters['context_length'])

generated_text

step 0: train loss 4.1713, val loss 4.1718
epoch 0: avg loss: 20.856454849243164
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,"I loved, it $hilonda..."
Iteration,0
Total Loss,20.85645
Train Loss,4.17129
Val Loss,4.17176
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.2038, val loss 4.2028
epoch 0: avg loss: 21.018957138061523
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovear yelisore o ...
Iteration,0
Total Loss,21.01896
Train Loss,4.20379
Val Loss,4.20277
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.2243, val loss 4.2242
epoch 0: avg loss: 21.121376037597656
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lover thing me Fim...
Iteration,0
Total Loss,21.12138
Train Loss,4.22428
Val Loss,4.22424
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.2012, val loss 4.2018
epoch 0: avg loss: 21.00592613220215
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lover love aveas o...
Iteration,0
Total Loss,21.00593
Train Loss,4.20119
Val Loss,4.20184
batch_size,64
bias,False
context_length,256
device,cuda


step 0: train loss 4.2205, val loss 4.2146
epoch 0: avg loss: 21.102275848388672
--------------------------------------------------


0,1
Epoch,▁
Iteration,▁
Total Loss,▁
Train Loss,▁
Val Loss,▁
batch_size,▁
context_length,▁
dropout,▁
epochs,▁
eval_interval,▁

0,1
Epoch,1
Generated Text,I lovese liches de w...
Iteration,0
Total Loss,21.10228
Train Loss,4.22046
Val Loss,4.21463
batch_size,64
bias,False
context_length,256
device,cuda


"I love:\nByonoust. Spats;\nLou jondsur; atw\nd w wis Luk kint's;\nWlle mcoighizuif, wame w--xind ums fr tof ou"