In [2]:
from datasets import concatenate_datasets, load_dataset
import re
from tqdm import tqdm

In [3]:
# Load the "bookcorpus" dataset without selecting a split.
# NOTE(review): a later cell reloads it with split="train" and rebinds
# `bookcorpus`, so this load looks redundant -- confirm before removing.
bookcorpus = load_dataset("bookcorpus")

In [4]:
# Load the "20220301.en" configuration of the "wikipedia" dataset without
# selecting a split.
# NOTE(review): a later cell reloads it with split="train" and rebinds `wiki`,
# so this load looks redundant -- confirm before removing.
wiki = load_dataset("wikipedia", "20220301.en")

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [5]:
# Load the "train" split of both corpora.
bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")

# Drop every Wikipedia column other than "text" so that both corpora expose
# the same single-column schema.
wiki = wiki.remove_columns(
    [name for name in wiki.column_names if name != "text"]
)

# The feature types must match before the two corpora are concatenated.
assert bookcorpus.features.type == wiki.features.type
datasets = concatenate_datasets([bookcorpus, wiki])

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [15]:
def count_words(dataset, text_column="text"):
    r"""Count the word tokens across all rows of ``dataset``.

    A word is any maximal run of characters matched by ``\w``. For ``str``
    input, Python's ``\w`` is Unicode-aware by default: it matches letters and
    digits from any script plus the underscore, so it is *not* limited to
    ``[a-zA-Z0-9_]``.

    Args:
        dataset: Iterable of rows that can be indexed by ``text_column``
            (each row is read as ``row[text_column]``), where the value is a
            string.
        text_column: Key of the field whose words are counted. Defaults to
            ``"text"``, the column used by the callers in this notebook.

    Returns:
        int: Total number of ``\w+`` matches summed over every row; ``0`` for
        an empty dataset.
    """
    # Build the pattern once, outside the per-row loop.
    word_pattern = re.compile(r'\w+')
    num_words = 0
    for row in tqdm(dataset):
        num_words += len(word_pattern.findall(row[text_column]))
    return num_words

def format_number(number):
    """Abbreviate a number with an "M" (millions) or "B" (billions) suffix.

    Args:
        number: Integer or float to format; the sign is preserved.

    Returns:
        str: ``number`` scaled to two decimals followed by "B" when its
        magnitude is at least one billion, or by "M" when it is at least one
        million. Smaller magnitudes are returned unchanged as ``str(number)``.
        Values that would round up to "1000.00M" are reported in billions
        instead (for example 999_999_999 gives "1.00B").
    """
    magnitude = abs(number)
    if magnitude >= 1_000_000_000:
        return f"{number / 1_000_000_000:.2f}B"
    if magnitude >= 1_000_000:
        millions = f"{number / 1_000_000:.2f}"
        # Two-decimal rounding can lift a value just under one billion to
        # "1000.00"; promote it to the billions unit rather than emit that.
        if abs(float(millions)) >= 1000:
            return f"{number / 1_000_000_000:.2f}B"
        return f"{millions}M"
    return str(number)

In [11]:
# Count the words in the Wikipedia dataset (one full pass over `wiki`).
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# defining count_words (15); re-run top to bottom on a fresh kernel to confirm
# the stored value comes from the current definition.
wiki_num_words = count_words(wiki)

In [17]:
# Count the words in BookCorpus, then add the Wikipedia count computed in the
# previous cell to get the size of the combined corpus.
bookcorpus_num_words = count_words(bookcorpus)
total_num_words = wiki_num_words + bookcorpus_num_words
# Report each corpus and the combined total, abbreviated by format_number.
print(f"BooksCorpus # words: {format_number(bookcorpus_num_words)}")
print(f"Wikipedia (English) # words: {format_number(wiki_num_words)}")
print(f"Total # words: {format_number(total_num_words)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 74004228/74004228 [08:22<00:00, 147396.76it/s]

BooksCorpus # words: 840.01M
Wikipedia (English) # words: 3.17B
Total # words: 4.01B





# Inspect the number of rows in the tokenized datasets

In [1]:
# `load_from_disk` is the function the cell below calls to read the cached
# tokenized dataset, so it is the name that must be imported here.
from datasets import load_from_disk

In [6]:
# Display the concatenated BookCorpus + Wikipedia dataset built earlier with
# concatenate_datasets (shows its features and row count).
datasets

Dataset({
    features: ['text'],
    num_rows: 80462898
})

In [8]:
# Load the previously saved tokenized dataset from disk.
# NOTE(review): the directory name suggests sequence length 512 and seed 0 --
# confirm against the script that wrote this cache.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable constant so the notebook runs on other machines.
tokenized_dataset512 = load_from_disk("/media/bryan/ssd01/expr/bert_from_scratch/dataset_cache_seq512_seed0") 

Loading dataset from disk:   0%|          | 0/479 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/54 [00:00<?, ?it/s]

In [9]:
# Display the loaded object; the recorded output lists "train" and "test"
# splits with their features and row counts.
tokenized_dataset512

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 77641805
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8624032
    })
})

In [10]:
# Number of rows in the concatenated (untokenized) BookCorpus + Wikipedia dataset.
num_sequences = len(datasets)

In [11]:
# Size of a 10% test portion; int() truncates any fractional row.
test_length = int(0.1*num_sequences)
# The remaining rows form the training portion, so the two lengths always sum
# to num_sequences.
train_length = num_sequences - test_length
print("train_length: ", train_length, " test_length: ", test_length)

train_length:  72416609  test_length:  8046289
