In [1]:
pip install huggingface_hub datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm
import psutil
import os

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in GB (1024**3 bytes)."""
    bytes_per_gb = 1024 ** 3
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_gb

def log_memory(message):
    """Print ``message`` followed by the process's current memory usage in GB."""
    usage_gb = get_memory_usage()
    print(f"{message} - Memory usage: {usage_gb:.2f} GB")

def process_dataset(dataset, column_name, chunk_size=1000, image_column='image'):
    """Collect the image and LaTeX columns of a dataset into two Python lists.

    Rows are read in contiguous chunks of at most ``chunk_size`` rows via
    ``dataset.select`` while a progress bar tracks the number of rows handled.
    The returned lists hold every row of both columns in memory.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``; the
            selected chunk must be indexable by column name.
        column_name: Name of the column holding the LaTeX text.
        chunk_size: Maximum number of rows read per iteration; must be positive.
        image_column: Name of the column holding the images.

    Returns:
        Tuple ``(all_images, all_latex)`` of two lists, in dataset order.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make the range below empty and silently return no
    # data; a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    total_len = len(dataset)
    all_images = []
    all_latex = []

    with tqdm(total=total_len, desc="Processing dataset") as pbar:
        for start in range(0, total_len, chunk_size):
            # The last chunk may be shorter than chunk_size.
            stop = min(start + chunk_size, total_len)
            chunk = dataset.select(range(start, stop))

            all_images.extend(chunk[image_column])
            all_latex.extend(chunk[column_name])

            pbar.update(stop - start)

            # Drop the chunk reference before the next iteration.
            del chunk
            gc.collect()

    return all_images, all_latex

def create_splits(images, latex, train_size=0.6, val_size=0.2, test_size=0.2, random_state=42):
    """Split paired image/LaTeX lists into train, validation and test datasets.

    The test split is carved out first as ``test_size`` of all samples; the
    remainder is then divided so that validation receives
    ``val_size / (train_size + val_size)`` of it.

    Args:
        images: Sequence of images.
        latex: Sequence of LaTeX strings, aligned with ``images``.
        train_size: Fraction of samples for the training split.
        val_size: Fraction of samples for the validation split.
        test_size: Fraction of samples for the test split.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``DatasetDict`` with keys ``'train'``, ``'validation'`` and ``'test'``,
        each a ``Dataset`` with ``'image'`` and ``'latex'`` columns.

    Raises:
        ValueError: If ``images`` and ``latex`` differ in length, if any
            fraction is not positive, or if the fractions do not sum to 1.
    """
    if len(images) != len(latex):
        raise ValueError(
            f"images and latex must have the same length, got {len(images)} and {len(latex)}"
        )

    fractions = (train_size, val_size, test_size)
    if any(fraction <= 0 for fraction in fractions):
        raise ValueError(
            f"train_size, val_size and test_size must all be positive, got {fractions}"
        )
    # Without this check, fractions that do not sum to 1 would silently yield
    # splits whose proportions differ from what was requested.
    if abs(sum(fractions) - 1.0) > 1e-8:
        raise ValueError(
            f"train_size + val_size + test_size must equal 1, got {sum(fractions)}"
        )

    with tqdm(total=3, desc="Creating splits") as pbar:
        # Step 1: hold out the test split from the full data.
        train_val_images, test_images, train_val_latex, test_latex = train_test_split(
            images, latex, test_size=test_size, random_state=random_state
        )
        pbar.update(1)

        # Step 2: split the remainder; the validation fraction is rescaled
        # because it is now relative to train + validation only.
        val_size_adjusted = val_size / (train_size + val_size)
        train_images, val_images, train_latex, val_latex = train_test_split(
            train_val_images, train_val_latex, test_size=val_size_adjusted, random_state=random_state
        )
        pbar.update(1)

        # Step 3: wrap each split in a Dataset.
        splits = DatasetDict({
            'train': Dataset.from_dict({'image': train_images, 'latex': train_latex}),
            'validation': Dataset.from_dict({'image': val_images, 'latex': val_latex}),
            'test': Dataset.from_dict({'image': test_images, 'latex': test_latex})
        })
        pbar.update(1)

    return splits

def main():
    """Build the combined image/LaTeX dataset, split it, and push it to the Hub.

    Loads the two source datasets, flattens each into image and LaTeX lists,
    concatenates them, creates train/validation/test splits and uploads the
    result as a private dataset repository. Memory usage is logged after
    every stage.
    """
    rows_per_chunk = 20000  # Adjust based on your memory constraints

    # (repository id, configuration name, column holding the LaTeX text)
    sources = (
        ("linxy/LaTeX_OCR", "synthetic_handwrite", "text"),
        ("OleehyO/latex-formulas", "cleaned_formulas", "latex_formula"),
    )

    log_memory("Initial memory usage")

    # Load each source in turn and keep only its extracted columns.
    image_parts = []
    latex_parts = []
    for number, (repo_id, config_name, latex_column) in enumerate(sources, start=1):
        print(f"\nProcessing dataset {number}...")
        source = load_dataset(repo_id, name=config_name, split="train")
        part_images, part_latex = process_dataset(source, latex_column, rows_per_chunk)
        image_parts.append(part_images)
        latex_parts.append(part_latex)
        del source, part_images, part_latex
        gc.collect()
        log_memory(f"After processing dataset {number}")

    # Concatenate the per-source lists, first source first.
    print("\nCombining datasets...")
    all_images = image_parts[0] + image_parts[1]
    all_latex = latex_parts[0] + latex_parts[1]
    del image_parts, latex_parts
    gc.collect()
    log_memory("After combining datasets")

    # Split with the default fractions of create_splits.
    print("\nCreating and processing splits...")
    dataset_splits = create_splits(all_images, all_latex)
    log_memory("After creating splits")

    # Upload all splits to a private Hub repository.
    print("\nPushing to Hugging Face Hub...")
    dataset_splits.push_to_hub("anindya-hf-2002/pix2tex", private=True)
    log_memory("After pushing to hub")

if __name__ == "__main__":
    main()

Initial memory usage - Memory usage: 0.17 GB

Processing dataset 1...


Processing dataset:   0%|          | 0/76266 [00:00<?, ?it/s]

After processing dataset 1 - Memory usage: 10.49 GB

Processing dataset 2...


Processing dataset:   0%|          | 0/552340 [00:00<?, ?it/s]

After processing dataset 2 - Memory usage: 59.33 GB

Combining datasets...
After combining datasets - Memory usage: 59.33 GB

Creating and processing splits...


Creating splits:   0%|          | 0/3 [00:00<?, ?it/s]

After creating splits - Memory usage: 63.56 GB

Pushing to Hugging Face Hub...


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/94291 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/943 [00:00<?, ?ba/s]

Map:   0%|          | 0/94291 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/943 [00:00<?, ?ba/s]

Map:   0%|          | 0/94291 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/943 [00:00<?, ?ba/s]

Map:   0%|          | 0/94290 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/943 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/62861 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/629 [00:00<?, ?ba/s]

Map:   0%|          | 0/62860 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/629 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/62861 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/629 [00:00<?, ?ba/s]

Map:   0%|          | 0/62861 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/629 [00:00<?, ?ba/s]

After pushing to hub - Memory usage: 64.46 GB


In [3]:
# Run one more garbage-collection pass; the value shown below the cell is the
# return value of gc.collect().
gc.collect()

328