In [1]:
import numpy as np 
import tiktoken as tk
import torch
import os 

In [2]:
with open ('data.txt', 'r', encoding='utf-8') as f:
    data = f.read()

print('length of dataset', len(data))

length of dataset 1115394


In [3]:
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# extract the unique characters that occur in this text 
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(''.join(chars))
print('vocab size', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size 65


In [5]:
# develope a strategy to tokenize the input text 
# create a mapping from characters to integers
char_to_int = {ch:i for i,ch in enumerate(chars)} # create a dictionary that maps characters to integers
int_to_char = {i:ch for i,ch in enumerate(chars)} # create a dictionary that maps integers to characters
encode = lambda x: [char_to_int[ch] for ch in x] # take a string and convert it to a list of integers
decode = lambda x: ''.join([int_to_char[ch] for ch in x]) # take the list of integers and convert it back to a string

print('encoded:', encode('hello world'))
print('decoded:', decode(encode('hello world')))


encoded: [46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
decoded: hello world


In [6]:
# Using tiktoken instead of the above simple tokenizer
enc = tk.get_encoding('gpt2')
print(enc.encode('hello world'))
print(enc.decode(enc.encode('hello world')))

# get number of classes from the tokenizer for the data.txt file 
print('number of classes:', enc.n_vocab)



[31373, 995]
hello world
number of classes: 50257


In [7]:
# data = torch.tensor(enc.encode(data), dtype=torch.long)
# print('data shape', data.shape, '| data type', data.dtype)
# print('data example', data[:1000])

In [8]:
# train test split 80%
# n = int(0.8*len(data))
# train_data, val_data = data[:n], data[n:]

In [9]:
# block_size = 8 # the length of each sequence
# train_data[:block_size+1]

# x = train_data[:block_size]
# y = train_data[1:block_size+1]
# for t in range(block_size):
#     context, target = x[:t+1], y[t]
#     print(f'when input is {context} the target is {target}')

In [10]:
# torch.manual_seed(69)
# batch_size = 4 # independent sequences in parallel 
# block_size = 8 # length of each sequence

# def get_batch(split):
#     # generates a small batch of data for inputs x and targets y
#     data = train_data if split == 'train' else val_data
#     ix = torch.randint(len(data)- block_size, (batch_size,)) # starting index of each sequence
#     x = torch.stack([data[i:i+block_size] for i in ix]) # batch of input sequences
#     y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # batch of target sequences
#     return x, y

# x_batch, y_batch = get_batch('train')
# print('inputs', x_batch.shape, x_batch.dtype, x_batch)
# print('targets', y_batch.shape, y_batch.dtype, y_batch)

# for b in range(batch_size):
#     for t in range(block_size):
#         context, target = x_batch[b,:t+1], y_batch[b,t]
#         print(f'batch {b} | time {t} | input {context.tolist()} | target {target}')

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import tiktoken  # Ensure tiktoken is installed

class LLModel(nn.Module):
    def __init__(self, block_size=128, batch_size=32, lr=5e-5):
        """
        Initializes the LLModel class with a transformer model.
        """
        super().__init__()  # Ensure the nn.Module is properly initialized

        # Device setup
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Tokenizer setup using tiktoken
        self.tokenizer = tiktoken.get_encoding('gpt2')
        self.token_encoder = self.tokenizer.encode
        self.token_decoder = self.tokenizer.decode
        self.vocab_size = self.tokenizer.n_vocab
        self.num_classes = 32  # Assuming token-level classification

        # Hyperparameters
        self.block_size = block_size
        self.batch_size = batch_size

        # Transformer model
        self.model = TransformerModel(
            vocab_size=self.vocab_size,
            embed_dim=128,
            num_heads=4,
            hidden_dim=256,
            num_layers=2,
            num_classes=self.num_classes
        ).to(self.device)

        # Loss function & optimizer
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(self.model.parameters(), lr=lr)

    def get_batch(self, data, split='train'):
        """
        Generates a batch of input sequences.
        """
        # Tokenize all data
        tokenized_data = torch.tensor(self.token_encoder(data), dtype=torch.long, device=self.device)
        
        # Ensure enough data points are available
        if len(tokenized_data) <= self.block_size:
            raise ValueError("Not enough data for the given block size.")

        # Select train or validation set
        data = tokenized_data if split == 'train' else tokenized_data  # Adjust if needed

        ix = torch.randint(len(data) - self.block_size, (self.batch_size,))
        x = torch.stack([data[i : i + self.block_size] for i in ix])
        y = torch.stack([data[i + 1 : i + self.block_size + 1] for i in ix])
        return x, y

    def train_model(self, text_data, epochs=100, print_freq=10, train_val_split=0.8):
        """
        Trains the model on input text data.
        """
        best_loss = float('inf')
        n = int(train_val_split*len(data))
        train_data, val_data = text_data[:n], text_data[n:]
        x, y = self.get_batch(train_data, 'train')
        x_val, y_val = self.get_batch(val_data, 'train')
        for epoch in range(epochs):
            self.model.train()
            self.optimizer.zero_grad()
            logits = self.model(x)
            loss = self.criterion(logits.view(-1, self.num_classes), y.view(-1))
            loss.backward()
            self.optimizer.step()

            val_loss = self.criterion(self.model(x_val).view(-1, self.num_classes), y_val.view(-1))

            # Save the best model
            if loss.item() < best_loss:
                best_loss = loss.item()
                torch.save(self.model.state_dict(), 'best_model.pth')

            if epoch % print_freq == 0:
                print(f"Epoch {epoch} | Loss: {loss.item():.4f} | Val Loss: {val_loss:.4f}")

        return best_loss

    def evaluate(self, x, y):
        """
        Evaluates the model on a validation batch.
        """
        self.model.eval()
        with torch.no_grad():
            logits = self.model(x)
            loss = self.criterion(logits.view(-1, self.num_classes), y.view(-1))
        return loss.item()

    def predict(self, text):
        """
        Predicts tokens for input text.
        """
        encoded_data = self.tokenize(text).unsqueeze(0)  # Add batch dimension
        self.model.eval()
        with torch.no_grad():
            logits = self.model(encoded_data)
            predicted_tokens = torch.argmax(logits, dim=-1)
            return self.token_decoder(predicted_tokens.squeeze().tolist())


class TransformerModel(nn.Module):
    def __init__(self, vocab_size: int, embed_dim=128, num_heads=4, hidden_dim=256, num_layers=2, num_classes=50257):
        """
        Custom Transformer-based text classification model.
        """
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)  # Word embeddings
        self.positional_encoding = PositionalEncoding(embed_dim)

        self.encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=num_layers)

        self.dropout = nn.Dropout(0.3)  # Regularization
        self.classifier = nn.Linear(embed_dim, num_classes)  # Final classification layer

    def forward(self, input_ids):
        """
        Forward pass of the model.
        """
        x = self.embedding(input_ids)  # Convert tokens to embeddings
        x = self.positional_encoding(x)  # Add positional encoding
        x = self.transformer_encoder(x)  # Pass through Transformer encoder
        x = x.mean(dim=1)  # Global average pooling over sequence
        x = self.dropout(x)  # Apply dropout
        logits = self.classifier(x)  # Get class logits
        return logits


class PositionalEncoding(nn.Module):
    """
    Implements positional encoding for Transformer models.
    """
    def __init__(self, embed_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:, : x.size(1), :]


In [12]:
# Initialize the model
model = LLModel()
model.train_model(data, epochs=100, print_freq=10)



ValueError: Expected input batch_size (32) to match target batch_size (4096).