# Datasets
The Hugging Face Datasets library provides an API to quickly download many public datasets and preprocess them.



## Downloading a dataset


In [3]:
!pip install transformers evaluate datasets accelerate -q
from huggingface_hub import list_datasets,dataset_info
from datasets import load_dataset, DatasetInfo
all_datasets = list_datasets(sort="downloads",direction=-1,limit=5)

In [5]:
next(all_datasets)

DatasetInfo(id='hails/mmlu_no_train', author='hails', sha='7f9d4f237bd7496914f430fa600c73017331885f', created_at=datetime.datetime(2023, 10, 31, 17, 25, 54, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 1, 22, 20, 46, 30, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=8539552, likes=7, paperswithcode_id=None, tags=['task_categories:question-answering', 'language:en', 'license:mit', 'region:us'], card_data=None, siblings=None)

In [8]:
raw_datasets = load_dataset("glue",'mrpc') # MRPC dataset from the GLUE benchmark; the task is to decide whether two sentences are paraphrases of each other

In [9]:
raw_datasets # A DatasetDict: a dict-like object holding one Dataset per split (train / validation / test)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [10]:
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [12]:
# Integer indexing returns one example as a dict mapping column name to value
raw_datasets['train'][600]

{'sentence1': "There will be no vote on the issue but those opposed to Robinson 's appointment are thought to outnumber those who accept it by around 20 to 17 .",
 'sentence2': "There will be no vote on the issue but those opposed to Robinson 's appointment are thought to be in the majority .",
 'label': 1,
 'idx': 673}

In [14]:
# Slicing returns a dict of lists: one list of values per column
raw_datasets['test'][:5]

{'sentence1': ["PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .",
  "The world 's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more of an industry backlash than expected .",
  'According to the federal Centers for Disease Control and Prevention ( news - web sites ) , there were 19 reported cases of measles in the United States in 2002 .',
  'A tropical storm rapidly developed in the Gulf of Mexico Sunday and was expected to hit somewhere along the Texas or Louisiana coasts by Monday night .',
  "The company didn 't detail the costs of the replacement and repairs ."],
 'sentence2': ['Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .',
  'Domestic sales at both GM and No. 2 Ford Motor Co. declined more than predicted as a late summer sales frenzy prompted a larger-than-expec

We can get more information about the features (the columns) through the `features` attribute:

In [15]:
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

To preprocess all the elements of our dataset, we need to tokenize them

In [16]:
from transformers import AutoTokenizer

In [17]:
# Name of the pretrained checkpoint whose tokenizer is loaded below
checkpoint = "bert-base-cased"
# AutoTokenizer picks the tokenizer class that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [20]:
def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` fields of ``example`` as a pair.

    The two fields are passed together to the module-level ``tokenizer`` so
    that each pair is encoded as a single sequence. Every encoding is padded
    or truncated to exactly 128 tokens.

    Args:
        example: mapping with ``'sentence1'`` and ``'sentence2'`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the sentence pair(s).
    """
    # Fixed-length encoding: pad short pairs and cut long ones at 128 tokens.
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, **encode_options)

In [26]:
# Apply tokenize_function to every split; batched=True hands the function
# batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function,batched=True)

# The tokenizer outputs are added as new columns next to the original ones
print(tokenized_datasets.column_names)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map: 100%|██████████| 3668/3668 [00:00<00:00, 8913.93 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 8177.59 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 5864.64 examples/s]

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}





In [27]:
tokenized_datasets['train'][100]

{'sentence1': 'The Nasdaq composite index inched up 1.28 , or 0.1 percent , to 1,766.60 , following a weekly win of 3.7 percent .',
 'sentence2': 'The technology-laced Nasdaq Composite Index .IXIC was off 24.44 points , or 1.39 percent , at 1,739.87 .',
 'label': 0,
 'idx': 114,
 'input_ids': [101,
  1109,
  11896,
  1116,
  1810,
  4426,
  14752,
  7448,
  4305,
  1174,
  1146,
  122,
  119,
  1743,
  117,
  1137,
  121,
  119,
  122,
  3029,
  117,
  1106,
  122,
  117,
  5465,
  1545,
  119,
  2539,
  117,
  1378,
  170,
  5392,
  1782,
  1104,
  124,
  119,
  128,
  3029,
  119,
  102,
  1109,
  2815,
  118,
  19498,
  11896,
  1116,
  1810,
  4426,
  3291,
  24729,
  13068,
  10146,
  119,
  12607,
  9741,
  1108,
  1228,
  1572,
  119,
  3140,
  1827,
  117,
  1137,
  122,
  119,
  3614,
  3029,
  117,
  1120,
  122,
  117,
  5766,
  1580,
  119,
  5966,
  119,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [30]:
# Prepare the splits for the model in one chain:
#   1. drop the raw-text and index columns,
#   2. rename "label" to "labels",
#   3. switch the output format to TensorFlow.
# NOTE: this rebinds tokenized_datasets, so re-run the `map` cell before
# re-running this one (the dropped columns no longer exist afterwards).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['idx','sentence1','sentence2'])
    .rename_column("label","labels")
    .with_format('tensorflow')
)

2024-05-13 23:26:49.512592: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-13 23:26:49.581179: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


We drop the columns the model does not need, rename the `label` column to `labels` (since the models from Hugging Face Transformers expect that name), and set the output format to our desired backend (in this case TensorFlow).

In [31]:
tokenized_datasets['train']

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

If needed, we can also generate a small sample of a dataset using the `select` method.

In [32]:
# Keep only the rows at indices 0-99 of the training split
small_train_dataset = tokenized_datasets['train'].select(range(100))