In [2]:
import argparse
import os
import tiktoken
import torch
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [3]:
class PreTokenizedGPTDataset(Dataset):
    def __init__(self, tokenized_file_path):
        data = torch.load(tokenized_file_path)
        self.input_ids = data['input_ids']
        self.target_ids = data['target_ids']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


# Function to create a dataloader from a pre-tokenized file
def create_dataloader_from_tokenized(tokenized_file_path, batch_size=4, 
                                    shuffle=True, drop_last=True, num_workers=0):
    dataset = PreTokenizedGPTDataset(tokenized_file_path)
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        drop_last=drop_last, 
        num_workers=num_workers
    )
    return dataloader

In [4]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
import os

# Define the PreTokenizedGPTDataset class
class PreTokenizedGPTDataset(Dataset):
    def __init__(self, tokenized_file_path):
        data = torch.load(tokenized_file_path)
        self.input_ids = data['input_ids']
        self.target_ids = data['target_ids']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

# Function to create a dataloader from a pre-tokenized file
def create_dataloader_from_tokenized(tokenized_file_path, batch_size=4, 
                                    shuffle=True, drop_last=True, num_workers=0):
    dataset = PreTokenizedGPTDataset(tokenized_file_path)
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        drop_last=drop_last, 
        num_workers=num_workers
    )
    return dataloader

# Example usage:
# 1. Initialize with a specific tokenized file
tokenized_file_path = "../03_bonus_pretraining_on_gutenberg/tokenized_data/combined_1.pt"  # Replace with your actual file path

# 2. Create a dataset instance
dataset = PreTokenizedGPTDataset(tokenized_file_path)
print(f"Dataset contains {len(dataset)} sequences")

# 3. Let's examine a sample from the dataset
sample_input, sample_target = dataset[0]  # Get the first sample
print(f"Sample input shape: {sample_input.shape}")
print(f"Sample target shape: {sample_target.shape}")

# 4. Check the first few tokens of the sample
print("First 10 tokens of input:", sample_input[:10].tolist())
print("First 10 tokens of target:", sample_target[:10].tolist())

# 5. Create a dataloader
batch_size = 4
dataloader = create_dataloader_from_tokenized(
    tokenized_file_path,
    batch_size=batch_size,
    shuffle=True
)
print(f"Dataloader contains {len(dataloader)} batches")

# 6. Iterate through a batch to see the structure
for batch_idx, (input_batch, target_batch) in enumerate(dataloader):
    print(f"Batch {batch_idx}:")
    print(f"Input batch shape: {input_batch.shape}")  # Should be [batch_size, sequence_length]
    print(f"Target batch shape: {target_batch.shape}")
    
    # Only look at the first batch
    break


IndexError: too many indices for tensor of dimension 1

In [5]:
# First, let's check the actual structure of your tokenized file
import torch

# Replace with your actual file path
tokenized_file_path = "../03_bonus_pretraining_on_gutenberg/tokenized_data/combined_1.pt"

# Load the file and inspect its structure
data = torch.load(tokenized_file_path)

# Check what type of data it is
print(f"Data type: {type(data)}")

# If it's a tensor, check its shape
if isinstance(data, torch.Tensor):
    print(f"Tensor shape: {data.shape}")
    print(f"First 10 elements: {data[:10]}")
    
# If it's a dictionary, check its keys
elif isinstance(data, dict):
    print(f"Dictionary keys: {list(data.keys())}")
    
# Now, let's define a more flexible dataset class that can handle different formats

class FlexibleTokenizedDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_file_path, context_length=1024, stride=768):
        # Load the tokenized data
        data = torch.load(tokenized_file_path)
        
        self.context_length = context_length
        self.stride = stride
        
        # Handle different possible formats
        if isinstance(data, dict) and 'input_ids' in data and 'target_ids' in data:
            # The data is already in input/target format
            self.input_ids = data['input_ids']
            self.target_ids = data['target_ids']
            self.preformatted = True
        elif isinstance(data, torch.Tensor):
            # The data is just a tensor of tokens, so we need to create the sliding windows
            self.tokens = data
            self.preformatted = False
        else:
            raise ValueError(f"Unsupported data format: {type(data)}")
    
    def __len__(self):
        if self.preformatted:
            return len(self.input_ids)
        else:
            # Number of possible windows
            return max(0, len(self.tokens) - self.context_length)
    
    def __getitem__(self, idx):
        if self.preformatted:
            # Return the pre-formatted input/target pair
            return self.input_ids[idx], self.target_ids[idx]
        else:
            # Create the input/target pair on-the-fly
            start_idx = idx
            end_idx = start_idx + self.context_length
            
            input_ids = self.tokens[start_idx:end_idx]
            target_ids = self.tokens[start_idx+1:end_idx+1]
            
            return input_ids, target_ids

# Function to create a dataloader with the flexible dataset
def create_flexible_dataloader(tokenized_file_path, batch_size=4, 
                             context_length=1024, stride=768,
                             shuffle=True, drop_last=True, num_workers=0):
    dataset = FlexibleTokenizedDataset(
        tokenized_file_path,
        context_length=context_length,
        stride=stride
    )
    
    dataloader = torch.utils.data.DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        drop_last=drop_last, 
        num_workers=num_workers
    )
    
    return dataloader

# Let's try the new class
try:
    dataset = FlexibleTokenizedDataset(tokenized_file_path)
    print(f"Dataset contains {len(dataset)} sequences")
    
    # Get a sample
    if len(dataset) > 0:
        sample_input, sample_target = dataset[0]
        print(f"Sample input shape: {sample_input.shape}")
        print(f"Sample target shape: {sample_target.shape}")
        print(f"First 10 tokens of input: {sample_input[:10].tolist()}")
        print(f"First 10 tokens of target: {sample_target[:10].tolist()}")
    
    # Create a dataloader
    batch_size = 4
    dataloader = create_flexible_dataloader(
        tokenized_file_path,
        batch_size=batch_size,
        shuffle=True
    )
    print(f"Dataloader contains {len(dataloader)} batches")
    
    # Check a batch
    for batch_idx, (input_batch, target_batch) in enumerate(dataloader):
        print(f"Batch {batch_idx}:")
        print(f"Input batch shape: {input_batch.shape}")
        print(f"Target batch shape: {target_batch.shape}")
        break
        
except Exception as e:
    print(f"Error: {e}")

Data type: <class 'torch.Tensor'>
Tensor shape: torch.Size([28909072])
First 10 elements: tensor([ 220,  220,  220,  220,  220, 4091,  642, 3104, 4310,   12])
Dataset contains 28908048 sequences
Sample input shape: torch.Size([1024])
Sample target shape: torch.Size([1024])
First 10 tokens of input: [220, 220, 220, 220, 220, 4091, 642, 3104, 4310, 12]
First 10 tokens of target: [220, 220, 220, 220, 4091, 642, 3104, 4310, 12, 71]
Dataloader contains 7227012 batches
Batch 0:
Input batch shape: torch.Size([4, 1024])
Target batch shape: torch.Size([4, 1024])


In [9]:
# Import necessary libraries
import torch
from torch.utils.data import DataLoader
import tiktoken

# Flexible dataset class with token decoding capability
class FlexibleTokenizedDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_file_path, context_length=1024, stride=768):
        # Load the tokenized data
        data = torch.load(tokenized_file_path)
        
        self.context_length = context_length
        self.stride = stride
        
        # Initialize tokenizer for decoding
        self.tokenizer = tiktoken.get_encoding("gpt2")
        
        # Handle different possible formats
        if isinstance(data, dict) and 'input_ids' in data and 'target_ids' in data:
            # The data is already in input/target format
            self.input_ids = data['input_ids']
            self.target_ids = data['target_ids']
            self.preformatted = True
        elif isinstance(data, torch.Tensor):
            # The data is just a tensor of tokens, so we need to create the sliding windows
            self.tokens = data
            self.preformatted = False
        else:
            raise ValueError(f"Unsupported data format: {type(data)}")
    
    def __len__(self):
        if self.preformatted:
            return len(self.input_ids)
        else:
            # Number of possible windows
            return max(0, len(self.tokens) - self.context_length)
    
    def __getitem__(self, idx):
        if self.preformatted:
            # Return the pre-formatted input/target pair
            return self.input_ids[idx], self.target_ids[idx]
        else:
            # Create the input/target pair on-the-fly
            start_idx = idx
            end_idx = start_idx + self.context_length
            
            input_ids = self.tokens[start_idx:end_idx]
            target_ids = self.tokens[start_idx+1:end_idx+1]
            
            return input_ids, target_ids
    
    def decode_tokens(self, tokens):
        """Decode tokens back to text"""
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.tolist()
        return self.tokenizer.decode(tokens)

# Function to create a dataloader with the flexible dataset
def create_flexible_dataloader(tokenized_file_path, batch_size=4, 
                             context_length=1024, stride=768,
                             shuffle=True, drop_last=True, num_workers=0):
    dataset = FlexibleTokenizedDataset(
        tokenized_file_path,
        context_length=context_length,
        stride=stride
    )
    
    dataloader = torch.utils.data.DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        drop_last=drop_last, 
        num_workers=num_workers
    )
    
    return dataloader, dataset

# Demonstration of using the decoder
# Replace with your actual file path
tokenized_file_path = "../03_bonus_pretraining_on_gutenberg/tokenized_data/combined_4.pt"

# Create dataset and dataloader
dataset = FlexibleTokenizedDataset(tokenized_file_path)
print(f"Dataset contains {len(dataset)} sequences")

# Get a sample
if len(dataset) > 0:
    sample_input, sample_target = dataset[0]
    print(f"Sample input shape: {sample_input.shape}")
    
    # Decode a portion of the input tokens
    print("\n--- Sample input text (first 100 tokens) ---")
    decoded_input = dataset.decode_tokens(sample_input[:100])
    print(decoded_input)
    
    print("\n--- Sample target text (first 100 tokens) ---")
    decoded_target = dataset.decode_tokens(sample_target[:100])
    print(decoded_target)
    
    # Show the relationship between input and target
    print("\n--- Input/target relationship (first 10 tokens) ---")
    print(f"Input tokens: {sample_input[:10].tolist()}")
    print(f"Target tokens: {sample_target[:10].tolist()}")
    print("Notice how the target is shifted one position relative to the input (next-token prediction)")

# Create a dataloader
batch_size = 4
dataloader, _ = create_flexible_dataloader(
    tokenized_file_path,
    batch_size=batch_size,
    shuffle=True
)
print(f"\nDataloader contains {len(dataloader)} batches")

# Function to examine a batch with decoding
def examine_batch(batch, dataset):
    input_batch, target_batch = batch
    print(f"Batch shapes: input {input_batch.shape}, target {target_batch.shape}")
    
    # Decode the first example in the batch
    example_idx = 0
    print(f"\n--- Example {example_idx} from batch ---")
    input_example = input_batch[example_idx]
    target_example = target_batch[example_idx]
    
    # Decode the first 50 tokens
    print("Beginning of input sequence:")
    print(dataset.decode_tokens(input_example[:50]))
    
    print("\nBeginning of target sequence:")
    print(dataset.decode_tokens(target_example[:50]))

# Check a batch
for batch_idx, batch in enumerate(dataloader):
    print(f"\nBatch {batch_idx}:")
    examine_batch(batch, dataset)
    break  # Just examine the first batch

# Utility function to analyze token distribution
def analyze_token_distribution(dataset, num_samples=10):
    """Analyze the distribution of tokens in a dataset"""
    token_counts = {}
    
    # Sample from the dataset
    for i in range(min(num_samples, len(dataset))):
        idx = i  # You could make this random: random.randint(0, len(dataset)-1)
        input_ids, _ = dataset[idx]
        
        # Count tokens
        for token in input_ids.tolist():
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1
    
    # Get most common tokens
    sorted_tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
    most_common = sorted_tokens[:20]  # Top 20 most common tokens
    
    print("\n--- Most common tokens ---")
    for token, count in most_common:
        decoded = dataset.decode_tokens([token])
        print(f"Token {token}: '{decoded}' (count: {count})")

# Analyze token distribution
analyze_token_distribution(dataset)

Dataset contains 30268742 sequences
Sample input shape: torch.Size([1024])

--- Sample input text (first 100 tokens) ---

                                LONDON
                        FREDERICK WARNE AND CO.
                             AND NEW YORK


--- Sample target text (first 100 tokens) ---
                                LONDON
                        FREDERICK WARNE AND CO.
                             AND NEW YORK
 

--- Input/target relationship (first 10 tokens) ---
Input tokens: [198, 220, 220, 220, 220, 220, 220, 220, 220, 220]
Target tokens: [220, 220, 220, 220, 220, 220, 220, 220, 220, 220]
Notice how the target is shifted one position relative to the input (next-token prediction)

Dataloader contains 7567185 batches

Batch 0:
Batch shapes: input torch.Size([4, 1024]), target torch.Size([4, 1024])

--- Example 0 from batch ---
Beginning of input sequence:
 chamber; but their tenure of
life is independent of that of their objects, since thought may be
prophetic or remini