# Datasets to the rescue!

## Pile

In [1]:
# !pip install zstandard

In [2]:
from datasets import load_dataset, DownloadConfig

In [26]:
urls = [
    "https://huggingface.co/datasets/EleutherAI/pile/resolve/refs%2Fconvert%2Fparquet/hacker_news/pile-train-00000-of-00004.parquet",
    "https://huggingface.co/datasets/EleutherAI/pile/resolve/refs%2Fconvert%2Fparquet/hacker_news/pile-train-00001-of-00004.parquet",
    "https://huggingface.co/datasets/EleutherAI/pile/resolve/refs%2Fconvert%2Fparquet/hacker_news/pile-train-00002-of-00004.parquet",
    "https://huggingface.co/datasets/EleutherAI/pile/resolve/refs%2Fconvert%2Fparquet/hacker_news/pile-train-00003-of-00004.parquet"
]


In [27]:
pile_dataset = load_dataset("parquet", data_files=urls, split="train")

In [28]:
pile_dataset[1]

 'meta': "{'id': '685596'}"}

In [12]:
pile_dataset.dataset_size / (1024 ** 3)

1.596497755497694

## Memory Mapping

In [10]:
# !pip install psutil

In [13]:
import psutil

In [14]:
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 49.93 MB


In [15]:
print(f"Number of files in dataset: {pubmed_dataset.dataset_size}")
size_gb = pubmed_dataset.dataset_size / (1024 ** 3)
print(f"Dataset size (cache file): {size_gb: .2f} GB")

Number of files in dataset: 1714226412
Dataset size (cache file):  1.60 GB


## Iterating over data

In [16]:
import timeit

In [21]:
code_snippet = """batch_size = 1000
for idx in range(0, len(pile_dataset), batch_size):
    _ = pile_dataset[idx: idx + batch_size]
"""

In [22]:
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())

In [23]:
print(
    f"Iterated over {len(pile_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 373027 examples (about 1.6 GB) in 22.5s i.e. 0.071 GB/s


## Stream Datasets

In [29]:
large_dataset_streamed = load_dataset(
    "parquet", data_files=urls, split="train", streaming=True
)

In [30]:
next(iter(large_dataset_streamed))

 'meta': "{'id': '19979654'}"}

In [31]:
type(large_dataset_streamed)

datasets.iterable_dataset.IterableDataset

In [32]:
from transformers import AutoTokenizer

In [34]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x["text"]))

In [40]:
next(iter(tokenized_dataset))['input_ids'][:10]

[101, 2859, 17210, 6221, 8398, 1011, 28549, 22407, 2620, 2683]

In [38]:
next(iter(tokenized_dataset))['attention_mask'][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [42]:
dataset_head = large_dataset_streamed.take(5)

In [46]:
len(list(dataset_head))

5

In [47]:
train_dataset = large_dataset_streamed.skip(1000)
validation_dataset = large_dataset_streamed.take(1000)

In [50]:
sum(1 for _ in train_dataset), sum(1 for _ in validation_dataset)

(372027, 1000)

## Download dataset

In [84]:
ultrachat_dataset = load_dataset("stingning/ultrachat", split="train", save_infos=True)

In [85]:
ultrachat_dataset

Dataset({
    features: ['id', 'data'],
    num_rows: 1468352
})

In [87]:
ultrachat_dataset.dataset_size / (1024 ** 3)

8.565430543385446

In [88]:
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 92.02 MB


In [90]:
code_snippet = """batch_size = 1000

for idx in range(0, len(ultrachat_dataset), batch_size):
    _ = ultrachat_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(ultrachat_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s")

Iterated over 1468352 examples (about 1.6 GB) in 205.8s, i.e. 0.008 GB/s


## Streaming Dataset

In [94]:
mc4_dataset = load_dataset("mc4", "en", split="train", streaming=True)

In [95]:
next(iter(mc4_dataset))

{'text': 'Posts 4,362\tMore Info\nOkay so to those of you that were very helpful this is not to you but for those of you that laugh when I ask about ohms or powering LSi15\'s this is to you. If you know a book, website, or someone to talk to to get more info that I seek so I know what some of you are talking about please share it with me. I ask questions to gain more info on audio thats all. Not to get laughed at when asking it. So if anyone has any good info they would like to share with me please do. Otherwise leave you smart coments to yourself. Thank You Your Freind Matt :)\nLast edited by bigaudiofanatic; 10-19-2007 at 02:23 AM.\nReply With Quote 10-19-2007 04:50 AM\nPosts 5,247\tI am not sure if I qualify but - Click the search button at the top, put in "LSi15 power 4 ohm" without the quotes and see what you come up with. You might also try "LSi15 AVR" - you will have alot more to filter through but look for threads talking about "what reciever or AVR should I buy".\nLast edited 

In [96]:
from transformers import AutoTokenizer

In [102]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [104]:
%timeit tokenized_dataset = mc4_dataset.map(lambda x: tokenizer(x['text']))

175 µs ± 61 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [99]:
from operator import itemgetter

In [101]:
itemgetter('input_ids', 'attention_mask')(next(iter(tokenized_dataset)))

([101,
  8466,
  1018,
  1010,
  4029,
  2475,
  2062,
  18558,
  3100,
  2061,
  2000,
  2216,
  1997,
  2017,
  2008,
  2020,
  2200,
  14044,
  2023,
  2003,
  2025,
  2000,
  2017,
  2021,
  2005,
  2216,
  1997,
  2017,
  2008,
  4756,
  2043,
  1045,
  3198,
  2055,
  2821,
  5244,
  2030,
  2373,
  2075,
  1048,
  5332,
  16068,
  1005,
  1055,
  2023,
  2003,
  2000,
  2017,
  1012,
  2065,
  2017,
  2113,
  1037,
  2338,
  1010,
  4037,
  1010,
  2030,
  2619,
  2000,
  2831,
  2000,
  2000,
  2131,
  2062,
  18558,
  2008,
  1045,
  6148,
  2061,
  1045,
  2113,
  2054,
  2070,
  1997,
  2017,
  2024,
  3331,
  2055,
  3531,
  3745,
  2009,
  2007,
  2033,
  1012,
  1045,
  3198,
  3980,
  2000,
  5114,
  2062,
  18558,
  2006,
  5746,
  2008,
  2015,
  2035,
  1012,
  2025,
  2000,
  2131,
  4191,
  2012,
  2043,
  4851,
  2009,
  1012,
  2061,
  2065,
  3087,
  2038,
  2151,
  2204,
  18558,
  2027,
  2052,
  2066,
  2000,
  3745,
  2007,
  2033,
  3531,
  2079,
  1012,
  4

In [105]:
%timeit tokenized_dataset = mc4_dataset.map(lambda x: tokenizer(x['text']), batched=True)

149 µs ± 37 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [106]:
shuffled_dataset = mc4_dataset.shuffle(buffer_size=10_000, seed=42)

In [107]:
next(iter(shuffled_dataset))

{'text': 'Broad set for knee surgery (From Hampshire Chronicle)\nStuart Broad will have knee surgery next month\nEngland bowler Stuart Broad is to undergo surgery on his right knee on September 4, the England and Wales Cricket Board has announced.\nThe ECB revealed earlier this month that the 28-year-old paceman, who suffered a broken nose in England\'s innings and 54-run fourth Test victory over India, was to have surgery to address a long-standing patella tendonitis problem and a date has now been confirmed for the procedure.\nA statement issued by the ECB on Wednesday said: "The ECB medical team confirmed that England pace bowler Stuart Broad is to have surgery on his right knee on Thursday 4 September. A rehabilitation and recovery programme will be determined following the operation."\nBroad has been troubled by the condition in his right knee for well over a year but has consistently put off going under the knife until now with a relatively quiet winter period coming up.\nHis reh

In [108]:
dataset_head = mc4_dataset.take(5)

In [109]:
list(dataset_head)

[{'text': 'Posts 4,362\tMore Info\nOkay so to those of you that were very helpful this is not to you but for those of you that laugh when I ask about ohms or powering LSi15\'s this is to you. If you know a book, website, or someone to talk to to get more info that I seek so I know what some of you are talking about please share it with me. I ask questions to gain more info on audio thats all. Not to get laughed at when asking it. So if anyone has any good info they would like to share with me please do. Otherwise leave you smart coments to yourself. Thank You Your Freind Matt :)\nLast edited by bigaudiofanatic; 10-19-2007 at 02:23 AM.\nReply With Quote 10-19-2007 04:50 AM\nPosts 5,247\tI am not sure if I qualify but - Click the search button at the top, put in "LSi15 power 4 ohm" without the quotes and see what you come up with. You might also try "LSi15 AVR" - you will have alot more to filter through but look for threads talking about "what reciever or AVR should I buy".\nLast edited

In [110]:
train_dataset = mc4_dataset.skip(1000)
validation_dataset = mc4_dataset.take(1000)

In [111]:
wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)

Downloading builder script:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

In [112]:
next(iter(wiki_dataset))

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latte

In [113]:
from itertools import islice

In [114]:
from datasets import interleave_datasets

In [115]:
combined_datasets = interleave_datasets([mc4_dataset, wiki_dataset])

In [116]:
list(islice(combined_datasets, 2))

[{'text': 'Posts 4,362\tMore Info\nOkay so to those of you that were very helpful this is not to you but for those of you that laugh when I ask about ohms or powering LSi15\'s this is to you. If you know a book, website, or someone to talk to to get more info that I seek so I know what some of you are talking about please share it with me. I ask questions to gain more info on audio thats all. Not to get laughed at when asking it. So if anyone has any good info they would like to share with me please do. Otherwise leave you smart coments to yourself. Thank You Your Freind Matt :)\nLast edited by bigaudiofanatic; 10-19-2007 at 02:23 AM.\nReply With Quote 10-19-2007 04:50 AM\nPosts 5,247\tI am not sure if I qualify but - Click the search button at the top, put in "LSi15 power 4 ohm" without the quotes and see what you come up with. You might also try "LSi15 AVR" - you will have alot more to filter through but look for threads talking about "what reciever or AVR should I buy".\nLast edited

In [124]:
indian_languages = ["hi", "bn", "gu", "kn", "ml", "te", "ta", "or", "bh", "mr", "pa"]
oscar_indian_language_dataset = {}
for lang in indian_languages:
    oscar_indian_language_dataset[lang] = load_dataset("oscar", f"unshuffled_deduplicated_{lang}", split="train", streaming=True)

In [125]:
combined_dataset = interleave_datasets(oscar_indian_language_dataset.values())

In [126]:
list(combined_dataset.take(11))

[{'id': 0,
  'text': "'आइटम गर्ल' बनकर हिट हुई थीं राखी सावंत, आज करीना-कटरीना तक फॉलो कर रही हैं ट्रेंड नक्\u200dसलियों का दम निकालेगा बाइक ग्रेनेड लॉन्चर, एक जवान 100 नक्सलियों पर पड़ेगा भारी ICC रैंकिंग: भारत नंबर 2 पर बरकरार, कुलदीप की बेस्ट रेटिंग आरोपों से अक्षय का इंकार, गुरुग्रंथ साहिब बेअदबी मामले में दी सफाई\nजबलपुर। सृजनशील एवं प्रगतिशील युवक-युवतियों हेतु खाद्य प्रसंस्करण, सूचना प्रौद्योगिकी स्मार्ट फोन मरम्मत पर आधारित तकनीकी उद्यमिता विकास कार्यक्रम के आयोजन हेतु उद्यमिता विकास केन्द्र म.प्र. (सेडमैप) क्षेत्रीय प्रशिक्षण कार्यालय उद्योग भवन कटंगा में १६ अगस्त तक इच्छुक व्यक्ति आवेदन जमा कर सकता है। इस तकनीकी पर आधारित उद्यमिता विकास प्रशिक्षण कार्यक्रम के प्रायोजक विज्ञान एवं तकनीकी विभाग भारत सरकार नई दिल्ली हैं जबकि सहयोग जिला व्यापार एवं उद्योग केन्द्र जबलपुर का है।\nइस कार्यक्रम का उद्देश्य नई तकनीकी एवं नवीन प्रौेद्योगिकी के आधार पर कृृषि से प्राप्त होने वाले खाद्यान दालें, फल-फूल, सब्जियां एवं कृषि से सम्बद्ध उत्पादों के आधार पर युवाओं की योग्यता एवं क्षमता अनुसर वि