# The 🤗 Datasets library

## What if my dataset isn’t on the Hub?

In [None]:
# Download the gzipped SQuAD-it train and test files with the Python `wget` module.
!python -m wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!python -m wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
import gzip
import shutil
from pathlib import Path


def gunzip_keep(pattern, directory="."):
    """Decompress every ``.gz`` file matching ``pattern`` and keep the archives.

    Pure-Python replacement for ``gzip -dk``: the command-line interface of
    Python's ``gzip`` module has no ``-k``/``-v`` flags, and shell globbing is
    not available on every platform.

    Args:
        pattern: Glob pattern of the archives, e.g. ``"SQuAD_it-*.json.gz"``.
        directory: Folder in which to look for the archives.

    Returns:
        The list of decompressed file paths, sorted by name.
    """
    extracted = []
    for archive in sorted(Path(directory).glob(pattern)):
        target = archive.with_suffix("")  # drop the trailing ".gz"
        with gzip.open(archive, "rb") as src, open(target, "wb") as dst:
            shutil.copyfileobj(src, dst)
        extracted.append(target)
    return extracted


gunzip_keep("SQuAD_it-*.json.gz")

In [None]:
# Sample verbose output from decompressing the archives (console text, not a command):
# SQuAD_it-test.json.gz:	   87.4% -- replaced with SQuAD_it-test.json
# SQuAD_it-train.json.gz:	   82.2% -- replaced with SQuAD_it-train.json

### Loading a local dataset

In [None]:
from datasets import load_dataset

# First attempt: load the decompressed training file with the "json" builder,
# without passing a `field` argument.
squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json")
# Display the loaded object
squad_it_dataset

In [None]:
from datasets import load_dataset

# Same file, this time pointing the "json" builder at the "data" field.
squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")
# Display the loaded object
squad_it_dataset

In [None]:
# Look at the first record of the "train" split.
squad_it_dataset["train"][0]

In [None]:
# Map each split name to its decompressed file so both splits are loaded in one call.
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
# Same load, but pointing directly at the gzip-compressed files.
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
# Display the dataset loaded from the compressed files.
squad_it_dataset

In [None]:
# Load straight from the remote repository: each split is a full URL built from a common prefix.
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
# Display the dataset loaded from the remote URLs.
squad_it_dataset

## Time to slice and dice

### Slicing and dicing our data

In [None]:
!python -m wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

In [None]:
from datasets import load_dataset

# Train/test TSV files, expected inside the drugsCom_raw/ folder.
data_files = {"train": "drugsCom_raw/drugsComTrain_raw.tsv", "test": "drugsCom_raw/drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
# Reproducible random sample: shuffle the train split with a fixed seed, then keep 1,000 rows.
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

In [None]:
# Sanity check: in every split, "Unnamed: 0" must have as many distinct values as there are rows.
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [None]:
# Give the unnamed index column a meaningful name.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

In [None]:
def lowercase_condition(example):
    """Return a column update holding the example's condition in lower case."""
    lowered = example["condition"].lower()
    return {"condition": lowered}


# First attempt at normalizing the condition column; the result is not assigned.
# NOTE(review): presumably fails if any condition is None (None has no .lower()) —
# later cells filter such rows out before mapping again; confirm this error is intended.
drug_dataset.map(lowercase_condition)

In [None]:
def filter_nones(x):
    """Tell whether an example should be kept, i.e. its "condition" is not None."""
    condition = x["condition"]
    if condition is None:
        return False
    return True

In [None]:
# An anonymous function that squares its argument (defined here, not called).
lambda x : x * x

In [None]:
# Wrapping the lambda in parentheses lets us call it right away: evaluates to 9.
(lambda x: x * x)(3)

In [None]:
# Lambdas can take several arguments; this one computes 0.5 * base * height = 16.0.
(lambda base, height: 0.5 * base * height)(4, 8)

In [None]:
# Drop rows whose condition is missing, using an inline lambda (same test as filter_nones).
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [None]:
# With the None conditions filtered out, lower-case the condition column in every split.
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

### Creating new columns

In [None]:
def compute_review_length(example):
    """Return a new "review_length" value: the number of whitespace-separated words in the review."""
    words = example["review"].split()
    return {"review_length": len(words)}

In [None]:
# Add the review_length column to every split.
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

In [None]:
# Sort the train split by review_length and show the first three rows.
drug_dataset["train"].sort("review_length")[:3]

In [None]:
# Keep only reviews longer than 30 words, then report the remaining row counts.
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

In [None]:
import html

# Demo: html.unescape turns HTML character references such as &#039; back into plain characters.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Unescape HTML character references in every review, one example at a time.
drug_dataset = drug_dataset.map(lambda x: {"review" : html.unescape(x["review"])})

### The map() method’s superpowers

In [None]:
# Batched variant of the same cleanup: with batched=True, x["review"] is iterated as a
# collection of reviews, hence the list comprehension.
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

> List comprehensions are usually faster than executing the same code in a `for` loop.

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize_function(examples):
    """Run the notebook-level `tokenizer` on the "review" field with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [None]:
%time

tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

In [None]:
%%time

# %%time is a cell magic: it times the whole cell, here the batched tokenization.
tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

In [None]:
# Same checkpoint, but explicitly requesting the non-fast tokenizer implementation.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

In [None]:
%%time
def slow_tokenize_function(examples):
    """Tokenize the "review" field with `slow_tokenizer`, truncation enabled."""
    return slow_tokenizer(examples["review"], truncation=True)
# Batched map with num_proc=8; the %%time magic above times the whole cell.
tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
def tokenize_and_split(examples):
    """Tokenize the "review" field, capped at 128 tokens, also returning the overflowing tokens."""
    reviews = examples["review"]
    encoded = tokenizer(
        reviews, truncation=True, max_length=128, return_overflowing_tokens=True
    )
    return encoded

In [None]:
# Try the function on the first training example and list the length of each
# returned input_ids sequence.
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

In [None]:
# NOTE(review): one review can yield several tokenized chunks, so the new columns may not
# line up with the existing ones — presumably why the following cell drops the original
# columns; confirm whether this cell is expected to raise.
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
# Same map, but removing every original column so only the tokenizer outputs remain.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

In [None]:
# Compare the number of training rows after and before tokenization.
len(tokenized_dataset["train"]), len(drug_dataset["train"])

In [None]:
def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens and repeat the original fields per chunk.

    The tokenizer's "overflow_to_sample_mapping" entry gives, for each produced
    chunk, the index of the example it came from. It is removed from the output
    and used to copy every original field once per chunk, so all returned
    columns have the same length.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Chunk index -> index of the originating example
    origin_of_chunk = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        repeated = []
        for source_idx in origin_of_chunk:
            repeated.append(column_values[source_idx])
        encoded[column] = repeated
    return encoded

In [None]:
%%time
# Time the batched map with the column-preserving tokenize_and_split, then display the result.
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

In [None]:
# Switch the output format of the dataset to pandas (in place).
drug_dataset.set_format("pandas")

In [None]:
# Show the first three training rows under the current output format.
drug_dataset["train"][:3]

In [None]:
# Slice the entire train split into `train_df`.
train_df = drug_dataset["train"][:]

In [None]:
# Count how often each condition occurs and expose the result as a two-column frame
# ("condition", "frequency"). Naming the index and the values explicitly keeps the column
# labels correct regardless of the pandas version: since pandas 2.0, value_counts() already
# names its result "count" and its index "condition", so renaming "index"/"condition"
# after reset_index() would mislabel the columns.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

In [None]:
from datasets import Dataset

# Turn the frequency table back into a Dataset object.
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

In [None]:
# Undo the earlier set_format("pandas") so the splits created (and later saved) below use
# the default output format again instead of returning pandas objects.
drug_dataset.reset_format()
# Hold out 20% of the training data, with a fixed seed for reproducibility.
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

In [None]:
# Persist all splits under the "drug-reviews" directory.
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
from datasets import load_from_disk

# Reload what was saved under "drug-reviews" and display it.
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

In [None]:
# Export each split to its own .jsonl file, named after the split.
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

In [None]:
# Print the first line of the exported training file (requires the `head` shell command).
!head -n 1 drug-reviews-train.jsonl

In [None]:
# Reload the three exported .jsonl files, one per split, with the "json" builder.
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

## Big data? 🤗 Datasets to the rescue!

### What is the Pile?

In [None]:
# !pip install zstandard

In [None]:
from datasets import load_dataset

# Load the train split of the PubMed title/abstract dataset from its Hub repository.
pubmed_dataset = load_dataset("hwang2006/PUBMED_title_abstracts_2020_baseline", split="train")
pubmed_dataset

In [None]:
# !pip install psutil

In [None]:
import psutil

In [None]:
# `dataset_size` is a size in bytes, not a file count, so label it accordingly.
print(f"Dataset size in bytes: {pubmed_dataset.dataset_size}")
# Convert bytes to gigabytes (1 GB = 1024**3 bytes).
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file): {size_gb:.2f} GB")

In [None]:
import timeit

# Statement to benchmark: walk the whole dataset in slices of 1,000 rows.
code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

# number=1 runs the snippet once; globals() lets it see `pubmed_dataset`.
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
# Report the elapsed seconds and the resulting throughput in GB/s.
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

### Streaming datasets

In [None]:
# Stream the PubMed abstracts instead of downloading them. The Hub repository is named
# explicitly: at this point `data_files` still holds the local drug-review .jsonl paths,
# so loading from it would stream the wrong dataset (one without a "text" field).
pubmed_dataset_streamed = load_dataset(
    "hwang2006/PUBMED_title_abstracts_2020_baseline", split="train", streaming=True
)

In [None]:
# Pull the first example from the streamed dataset.
next(iter(pubmed_dataset_streamed))

In [None]:
from transformers import AutoTokenizer

# Note: this rebinds `tokenizer` and `tokenized_dataset` used earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Apply the tokenizer to the "text" field of the streamed dataset, then pull the first result.
tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [None]:
# Shuffle the stream with a 10,000-element buffer and a fixed seed, then pull one example.
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
# Take the first five examples of the stream and materialize them as a list.
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

In [None]:
# Build training and validation sets from the shuffled stream.
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [None]:
# Stream a second corpus (FreeLaw opinions) from a remote zstd-compressed JSON Lines file
# and pull its first example.
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))

In [None]:
from itertools import islice
from datasets import interleave_datasets

# Interleave the two streamed datasets and look at the first two combined examples.
combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
list(islice(combined_dataset, 2))

In [None]:
# Stream the whole corpus: 30 training shards (00 to 29) plus one validation and one test file.
base_url = "https://the-eye.eu/public/AI/pile/"
data_files = {
    "train": [base_url + "train/" + f"{idx:02d}.jsonl.zst" for idx in range(30)],
    "validation": base_url + "val.jsonl.zst",
    "test": base_url + "test.jsonl.zst",
}
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))

## Creating your own dataset

### Getting the data

In [5]:
import requests

# Request the first page of issues of the huggingface/datasets repository, one issue per page.
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [6]:
# HTTP status code of the response.
response.status_code

200

In [7]:
# Decode the response body as JSON.
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7314',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7314/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7314/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7314/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/7314',
  'id': 2727502630,
  'node_id': 'PR_kwDODunzps6EkCi5',
  'number': 7314,
  'title': 'Resolved for empty datafiles',
  'user': {'login': 'sahillihas',
   'id': 20582290,
   'node_id': 'MDQ6VXNlcjIwNTgyMjkw',
   'avatar_url': 'https://avatars.githubusercontent.com/u/20582290?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/sahillihas',
   'html_url': 'https://github.com/sahillihas',
   'followers_url': 'https://api.github.com/users/sahillihas/followers',
   'following_url': 'https://api.g

In [None]:
import os

# Read the token from the GITHUB_TOKEN environment variable (export it before starting
# Jupyter) rather than pasting it into the notebook, so it never ends up in a saved or
# shared copy.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Without a token, send no Authorization header and fall back to unauthenticated requests.
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}