# Working with Text Data

In [1]:
import re
from typing import List, Tuple

import tiktoken
from torch import arange, manual_seed, nn, tensor, Tensor
from torch.utils.data import DataLoader, Dataset

In [2]:
# Seed PyTorch's global random number generator so that the shuffled batches and the
# randomly initialised embedding layers later in the notebook are repeatable across runs.
manual_seed(123)

<torch._C.Generator at 0x70a780d19750>

## Tokenizing Text

In [3]:
def load_text(file_path: str) -> str:
    """
    Read an entire UTF-8 encoded text file into memory

    Args:
        - file_path (str): location of the file to read

    Returns:
        - str: the complete contents of the file
    """
    # The context manager closes the handle even if reading fails
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

In [4]:
# Load the whole text file into a single string.
# Note: len() below counts characters, not tokens.
raw_text = load_text("../data/the-verdict.txt")
print(f"Total number of character: {len(raw_text)}")

Total number of character: 20479


In [5]:
# Tokenize the raw text: split on punctuation, double dashes and whitespace.
# The capture group keeps the delimiters as tokens of their own; pieces that are
# empty or whitespace-only after stripping are discarded.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(f"Total number of tokens: {len(preprocessed)}")

Total number of tokens: 4690


In [6]:
# Build the vocabulary: drop duplicate tokens, then sort so that the token order
# (and therefore every token ID derived from it) is deterministic.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1130


In [7]:
class SimpleTokenizer:
    """
    A simple word-level tokenizer that converts text to token IDs and vice versa.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; every non-empty piece becomes one token. Tokens
    that are not part of the vocabulary are encoded as ``<|unk|>``.

    Attributes:
        - token_to_id (Dict[str, int]): mapping from token to ID
        - id_to_token (Dict[int, str]): mapping from ID to token

    Note:
        - This one includes code from V1 and V2 of SimpleTokenizer in the book
    """
    # Split pattern for encode(). It has to split on the same characters that were
    # used to build the vocabulary (including ':' and ';'); otherwise a word followed
    # by ':' or ';' stays glued to it and is encoded as <|unk|>.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that decode() removes because it sits directly before punctuation.
    _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, words: List[str]) -> None:
        """
        Initialize the tokenizer

        The special tokens ``<|endoftext|>`` and ``<|unk|>`` are added to the end
        of the vocabulary when they are not already present.

        Args:
            - words (List[str]): list of words in the text. The list is copied,
              so the caller's list is never modified.
        """
        # Work on a copy: appending the special tokens to `words` itself would
        # silently change the vocabulary list owned by the caller.
        vocabulary = list(words)
        for special_token in ["<|endoftext|>", "<|unk|>"]:
            if special_token not in vocabulary:
                vocabulary.append(special_token)
        self.token_to_id = {token: integer for integer, token in enumerate(vocabulary)}
        self.id_to_token = {integer: token for token, integer in self.token_to_id.items()}

    def encode(self, input_text: str) -> List[int]:
        """
        Convert text to token IDs

        Args:
            - input_text (str): input text

        Returns:
            - List[int]: list of token IDs; out-of-vocabulary tokens map to the
              ID of ``<|unk|>``
        """
        pieces = self._SPLIT_PATTERN.split(input_text)
        # Drop empty and whitespace-only pieces produced by the split
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # Replace OOV words with <|unk|>
        unknown_id = self.token_to_id["<|unk|>"]
        return [self.token_to_id.get(token, unknown_id) for token in tokens]

    def decode(self, ids: List[int]) -> str:
        """
        Convert token IDs back to text

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed. The result is therefore not guaranteed to be
        identical to the text that was originally encoded.

        Args:
            - ids (List[int]): list of token IDs

        Returns:
            - str: text

        Raises:
            - KeyError: if an ID is not part of the vocabulary
        """
        text = " ".join([self.id_to_token[i] for i in ids])
        text = self._SPACE_BEFORE_PUNCTUATION.sub(r'\1', text)     # Remove space before punctuation
        return text

# Build the word-level tokenizer from the sorted vocabulary; the constructor makes sure
# the special tokens <|endoftext|> and <|unk|> are part of its ID mappings.
tokenizer = SimpleTokenizer(all_words)

In [8]:
# Encode a sample sentence into vocabulary IDs with the word-level tokenizer
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [9]:
# Decode the IDs back to text. The result is not identical to the input: tokens are
# re-joined with spaces and only the space *before* punctuation is removed, so a space
# is left after the opening quote and after the apostrophe.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [10]:
# Case - out-of-vocabulary (OOV) word: "Hello" is not in the vocabulary,
# so it is encoded as <|unk|> and shows up as such after decoding
ids = tokenizer.encode(input_text="Hello, do you like tea?")
decoded_text = tokenizer.decode(ids)
print(decoded_text)

<|unk|>, do you like tea?


In [11]:
# Case - OOV words and special tokens: two independent texts are joined with the
# <|endoftext|> marker, which the tokenizer keeps as a single token
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(ids)
print(decoded_text)

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair Encoding

In [12]:
# Switch to tiktoken's GPT-2 byte pair encoding.
# NOTE: this rebinds `tokenizer`; from here on it no longer refers to the SimpleTokenizer instance.
tokenizer = tiktoken.get_encoding("gpt2")

In [13]:
# Encode text that contains a made-up word and the <|endoftext|> marker.
# allowed_special opts in to encoding the literal marker as a special token
# (it appears as the single ID 50256 in the output).
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunkownPalace."
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 617, 2954, 593, 11531, 558, 13]


In [14]:
# Decoding the BPE IDs reproduces the input string, including the made-up word
decoded_text = tokenizer.decode(ids)
print(decoded_text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the someunkownPalace.


In [15]:
# An unfamiliar word is split into several sub-word token IDs rather than being
# replaced by an unknown token; decode each ID on its own to see the individual pieces
unknown_word = "Akwirw ier"
ids = tokenizer.encode(unknown_word)
print(f"Token IDs for unknown word: {ids}")
for entry in ids:
    decoded_text = tokenizer.decode([entry])
    print(f"ID: {entry}, Token: {decoded_text}")

Token IDs for unknown word: [33901, 86, 343, 86, 220, 959]
ID: 33901, Token: Ak
ID: 86, Token: w
ID: 343, Token: ir
ID: 86, Token: w
ID: 220, Token:  
ID: 959, Token: ier


## Data Sampling with a Sliding Window

In [16]:
class GPTDatasetV1(Dataset):
    """
    A PyTorch Dataset of (input, target) token-ID windows cut from one text

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token sequence in steps of ``stride``; for every window the target
    is the same window shifted one token to the right (next-token prediction).
    Trailing tokens that cannot fill a complete window plus its shifted target
    are dropped, so a text with ``max_length`` tokens or fewer yields an empty
    dataset.

    Attributes:
        - input_ids (List[Tensor]): List of input token IDs
        - target_ids (List[Tensor]): List of target token IDs
    """
    # The tokenizer annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, txt: str, tokenizer: "tiktoken.core.Encoding", max_length: int, stride: int) -> None:
        """
        Initialize the dataset

        Args:
            - txt (str): Text data; may contain the ``<|endoftext|>`` marker
            - tokenizer (tiktoken.core.Encoding): Tokenizer object from TikToken
            - max_length (int): Maximum length of the input sequence
            - stride (int): Stride for the sliding window

        Raises:
            - ValueError: if ``stride`` is 0 (raised by ``range``)
        """
        self.input_ids = []
        self.target_ids = []

        # Explicitly allow the document-boundary marker: by default tiktoken raises a
        # ValueError when the text contains a special token such as <|endoftext|>.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The last valid start index is len(token_ids) - max_length - 1, because the
        # target window needs one extra token to the right of the input window.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(tensor(input_chunk))
            self.target_ids.append(tensor(target_chunk))

    def __len__(self) -> int:
        """
        Get the number of samples

        Returns:
            - int: Number of samples
        """
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """
        Get a sample from the dataset

        Args:
            - idx (int): Index of the sample

        Returns:
            - Tuple[Tensor, Tensor]: input and target token IDs
        """
        return self.input_ids[idx], self.target_ids[idx]

In [17]:
def create_dataloader_v1(
    txt: str,
    batch_size: int = 4,
    max_length: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
) -> DataLoader:
    """
    Build a DataLoader that serves sliding-window samples of a text

    The text is tokenized with the GPT-2 encoding from tiktoken and wrapped in
    a GPTDatasetV1 before being handed to the DataLoader.

    Args:
        - txt (str): Text to sample from
        - batch_size (int): Number of samples per batch
        - max_length (int): Number of tokens in each input sequence
        - stride (int): Step between the starts of consecutive windows
        - shuffle (bool): Whether the samples are shuffled
        - drop_last (bool): Whether a final, incomplete batch is dropped

    Returns:
        - DataLoader: loader over the windowed dataset
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(
        txt=txt,
        tokenizer=gpt2_encoding,
        max_length=max_length,
        stride=stride,
    )
    return DataLoader(windows, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

In [18]:
# Reload the raw text (same file as in the tokenization section) so that `raw_text`
# holds the unprocessed string for the data-loading cells below
raw_text = load_text("../data/the-verdict.txt")

In [19]:
# Create a DataLoader and get the first batch.
# stride == max_length, so consecutive windows do not overlap. shuffle is left at its
# default (True), so which windows end up in this first batch depends on the random seed.
dataloader = create_dataloader_v1(txt=raw_text, batch_size=8, max_length=256, stride=256)
inputs, targets = next(iter(dataloader))
print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets.shape}")

Input shape: torch.Size([8, 256])
Target shape: torch.Size([8, 256])


## Creating Token Embeddings and Encoding Positional Information

In [20]:
# `tokenizer` is the tiktoken GPT-2 encoding at this point (it was rebound in the Byte Pair
# Encoding section), so this overwrites the word-level `vocab_size` computed earlier.
vocab_size = tokenizer.n_vocab
output_dim = 768    # Dimension of the output embeddings in GPT-2
context_length = inputs.shape[1]    # Tokens per sequence in the batch loaded above
print(f"Vocabulary size: {vocab_size}, Output dimension: {output_dim}, Context length: {context_length}")

Vocabulary size: 50257, Output dimension: 768, Context length: 256


In [21]:
# Create token embeddings to map token IDs to a dense vector representation.
# The layer holds one output_dim-sized vector per vocabulary entry; looking up a
# (batch, context_length) tensor of IDs yields (batch, context_length, output_dim).
token_embedding_layer = nn.Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)
print(f"Token embeddings shape: {token_embeddings.shape}")

Token embeddings shape: torch.Size([8, 256, 768])


In [22]:
# Create positional embeddings to add positional awareness.
# One vector per position 0 .. context_length - 1; arange() supplies those position
# indices, so the result has shape (context_length, output_dim) with no batch dimension.
positional_embedding_layer = nn.Embedding(context_length, output_dim)
positional_embeddings = positional_embedding_layer(arange(context_length))
print(f"Positional embeddings shape: {positional_embeddings.shape}")

Positional embeddings shape: torch.Size([256, 768])


In [23]:
# Add token and positional embeddings to create input embeddings.
# Positional embeddings are broadcast across the batch dimension:
# (batch, context_length, output_dim) + (context_length, output_dim).
input_embeddings = token_embeddings + positional_embeddings
print(f"Input embeddings shape: {input_embeddings.shape}")

Input embeddings shape: torch.Size([8, 256, 768])
