Notebook for manipulating and interacting with data

In [1]:
from lib.pythia.utils.mmap_dataset import MMapIndexedDataset 

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import torch
import numpy as np

In [16]:
# Open the indexed dataset stored under the given path prefix.
# skip_warmup=True is forwarded to MMapIndexedDataset unchanged.
mmap_dataset = MMapIndexedDataset(
    path='/home/rd654/rds/rds-personal-3CBQLhZjXbU/data/document',
    skip_warmup=True,
)

    reading sizes...
    reading pointers...
    reading document index...
    creating numpy buffer of mmap...
    creating memory view of numpy buffer...


In [20]:
def get_sample(step: int):
    """Return the dataset entry at position ``step`` with its leading axis removed.

    Reads from the module-level ``mmap_dataset`` using a one-element slice
    (``step`` to ``step + 1``) and then calls ``.squeeze(0)`` on the result.

    Args:
        step: index of the entry to fetch.

    Returns:
        Whatever ``.squeeze(0)`` yields for the one-element slice
        (presumably a 1-D array of token ids -- TODO confirm against
        MMapIndexedDataset).
    """
    window = slice(step, step + 1)
    return mmap_dataset[window].squeeze(0)

In [21]:
import pyarrow as pa
import pyarrow.parquet as pq

In [22]:
import multiprocessing as mp 

In [23]:
import os 

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single worker so range(cpu_count) and the shard arithmetic
# further down keep working.
cpu_count = os.cpu_count() or 1

In [24]:
# Display the number of entries in the dataset; this length is what gets
# divided into shards in the parquet conversion below.
len(mmap_dataset)


146432000

In [25]:
# Display the core count; it is used below as the number of worker
# processes (and therefore parquet shards).
cpu_count

76

### Converting dataset into smaller parquet chunks that can be uploaded

In [10]:
def worker(rank: int, world_size: int, mmap_dataset: MMapIndexedDataset):
    """Write one contiguous slice of the dataset to its own parquet shard.

    The dataset is cut into ``world_size`` slices of
    ``len(mmap_dataset) // world_size`` entries each, and this call exports
    slice number ``rank``. Entries beyond
    ``world_size * (len(mmap_dataset) // world_size)`` are not covered by any
    rank. If the shard file already exists, a message is printed and nothing
    is written.

    Args:
        rank: zero-based index of the slice to export.
        world_size: total number of slices the dataset is divided into.
        mmap_dataset: dataset to read from; it is indexed with one-element
            slices and ``.squeeze(0)`` is called on each result.
    """
    file_name = f'/home/rd654/rds/rds-personal-3CBQLhZjXbU/data/pile/parquet/shard_{rank}.parquet'

    if os.path.exists(file_name):
        print("File already exists: ", file_name)
        return

    shard_size = len(mmap_dataset) // world_size
    start_index = shard_size * rank
    end_index = shard_size * (rank + 1)

    print("Process: ", rank, "Start: ", start_index, "End: ", end_index)

    # Read from the dataset that was passed in rather than from a module-level
    # global, so the function honours its own argument.
    data = [
        mmap_dataset[i: (i + 1)].squeeze(0)
        for i in range(start_index, end_index)
    ]

    table = pa.table({'ids': data})
    # Reuse file_name so the existence check and the write target cannot drift apart.
    pq.write_table(table, file_name)

In [None]:
# Launch one worker process per shard.
# Every worker is started before any is joined: calling join() inside the
# start loop blocks on each worker before the next one is launched, which
# runs the shards one at a time.
# NOTE(review): each worker buffers its whole shard in a list before writing;
# confirm the machine has enough memory for cpu_count workers at once.
processes = [
    mp.Process(target=worker, args=(rank, cpu_count, mmap_dataset)) for rank in range(cpu_count)
]

for process in processes:
    process.start()

for process in processes:
    process.join()

# A worker that crashed leaves its shard missing; surface that instead of
# finishing silently.
failed_ranks = [rank for rank, process in enumerate(processes) if process.exitcode != 0]
if failed_ranks:
    print("Workers exited with a non-zero code for ranks: ", failed_ranks)


In [28]:
# Export the entries left over after the equal-sized worker shards.
# The workers cover indices [0, (len_dataset // cpu_count) * cpu_count); whatever
# remains up to len_dataset goes into one extra shard numbered cpu_count.
# cpu_count is used instead of a hard-coded core count so this range always
# lines up with the shards produced by the workers.
len_dataset = len(mmap_dataset)
start_index = len_dataset // cpu_count * cpu_count
end_index = len_dataset

data = [get_sample(i) for i in range(start_index, end_index)]

# Skip the write when the dataset length divides evenly: there is nothing
# left over and an empty shard would only add a file with no rows.
if data:
    table = pa.table({'ids': data})
    pq.write_table(table, f'/home/rd654/rds/rds-personal-3CBQLhZjXbU/data/pile/parquet/shard_{cpu_count}.parquet')

#### Creating dataset of batches only at the checkpoint locations 

In [9]:
# Checkpoint steps: the powers of two from 1 to 512, followed by every
# multiple of 1000 from 1000 up to and including 143000.
checkpoint_steps = [2 ** exponent for exponent in range(10)]
checkpoint_steps += list(range(1000, 144_000, 1000))
checkpoint_steps

[1,
 2,
 4,
 8,
 16,
 32,
 64,
 128,
 256,
 512,
 1000,
 2000,
 3000,
 4000,
 5000,
 6000,
 7000,
 8000,
 9000,
 10000,
 11000,
 12000,
 13000,
 14000,
 15000,
 16000,
 17000,
 18000,
 19000,
 20000,
 21000,
 22000,
 23000,
 24000,
 25000,
 26000,
 27000,
 28000,
 29000,
 30000,
 31000,
 32000,
 33000,
 34000,
 35000,
 36000,
 37000,
 38000,
 39000,
 40000,
 41000,
 42000,
 43000,
 44000,
 45000,
 46000,
 47000,
 48000,
 49000,
 50000,
 51000,
 52000,
 53000,
 54000,
 55000,
 56000,
 57000,
 58000,
 59000,
 60000,
 61000,
 62000,
 63000,
 64000,
 65000,
 66000,
 67000,
 68000,
 69000,
 70000,
 71000,
 72000,
 73000,
 74000,
 75000,
 76000,
 77000,
 78000,
 79000,
 80000,
 81000,
 82000,
 83000,
 84000,
 85000,
 86000,
 87000,
 88000,
 89000,
 90000,
 91000,
 92000,
 93000,
 94000,
 95000,
 96000,
 97000,
 98000,
 99000,
 100000,
 101000,
 102000,
 103000,
 104000,
 105000,
 106000,
 107000,
 108000,
 109000,
 110000,
 111000,
 112000,
 113000,
 114000,
 115000,
 116000,
 117000,
 11800

### Loading in Generated Dataset from HuggingFace

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the 'checkpoints' configuration of the dataset published on the
# HuggingFace hub.
dataset = load_dataset(
    'rdiehlmartinez/pythia-pile-presampled',
    'checkpoints',
)

In [2]:
# Load every local parquet shard matching shard_*.parquet as a single 'train'
# split, using a custom cache directory and 8 loader processes.
dataset = load_dataset(
    'parquet',
    data_files='/home/rd654/rds/rds-personal-3CBQLhZjXbU/data/pile/parquet/shard_*.parquet',
    split='train',
    cache_dir='/rds-d7/user/rd654/hpc-work/hf_custom_cache/huggingface/datasets',
    num_proc=8,
)

Generating train split: 84399440 examples [09:01, 160953.41 examples/s]