<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/hf_datasets_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
'''
Adapted from the following tutorial: https://huggingface.co/docs/datasets/create_dataset

'''


from datasets import load_dataset_builder
from datasets import load_dataset
from datasets import get_dataset_split_names
from datasets import get_dataset_config_names
# Two main entry points: load_dataset(name) and load_dataset_builder(name).

# load_dataset_builder() loads a dataset builder so that a dataset's attributes
# can be inspected without committing to downloading it.
ds_builder = load_dataset_builder("yelp/yelp_review_full")

print(ds_builder.info.features)

# Load the actual dataset.
dataset = load_dataset("yelp/yelp_review_full")

# load_dataset() optionally takes a split, e.g. load_dataset(name, split="train"),
# in which case it returns a Dataset object.
# Without a split it returns a DatasetDict object instead of a Dataset object.

datasetDict = load_dataset("yelp/yelp_review_full")
print("datasetDict type: ", type(datasetDict))

datasetObj = load_dataset("yelp/yelp_review_full", split="train")
print("datasetObj type: ", type(datasetObj))


# Get the split names of the dataset. The call is not the last expression of the
# cell, so its return value has to be printed explicitly to show up in the output.
print(get_dataset_split_names("yelp/yelp_review_full"))

# Some datasets have several sub-datasets, also known as configurations.
configs = get_dataset_config_names("polyAI/minds14")
print(configs)

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars']), 'text': Value('string')}
datasetDict type:  <class 'datasets.dataset_dict.DatasetDict'>
datasetObj type:  <class 'datasets.arrow_dataset.Dataset'>


README.md: 0.00B [00:00, ?B/s]

['all', 'cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']


In [19]:
# Dataset & IterableDataset
# Dataset items can be accessed with []
train_split = dataset['train']
first_example = train_split[0]
print(first_example)

# Indexing by column:
print(first_example['text'])

# This also works: prints the data stored under 'text' as a column object
print(train_split['text'])

'''
Note that after doing dataset["train"] indexing order of [0]['text'] or ['text'][0] doesn't matter
'''

# Slicing is supported as well
print("\n\n===Slicing===\n\n")
print(train_split[:3])

{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}
dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sit

In [34]:
'''
Iterable Datasets:
Allows one to access and use dataset without waiting for it to download completely
'''

# Pass streaming=True to load the split as an iterable dataset
iterable_dataset = load_dataset("ethz/food101", split="train", streaming=True)
# Show only the first streamed example
for example in iterable_dataset.take(1):
  print(example)

# Create an iterable dataset from an existing dataset.
# Streaming is faster in this case because the dataset is streamed from local files.
iterable_dataset_existing = dataset['train'].to_iterable_dataset()

print(type(iterable_dataset_existing))

# Random access with [] is not available on an iterable dataset, since the data is
# only streamed when it is needed. Use next(iter(...)) instead.

# Column indexing is supported, however, and returns an iterable over the column values

image_column = iterable_dataset["image"]
first_image = next(iter(image_column))
print(first_image)

# IterableDataset.take() creates a new IterableDataset

new_iterable_dataset = iterable_dataset.take(10)
print(new_iterable_dataset)

{'image': <PIL.Image.Image image mode=RGB size=384x512 at 0x7C7EFBF308D0>, 'label': 6}
<class 'datasets.iterable_dataset.IterableDataset'>
<PIL.Image.Image image mode=RGB size=384x512 at 0x7C7EFBFC0950>
IterableDataset({
    features: ['image', 'label'],
    num_shards: 8
})


In [54]:
'''
Tokenizers

'''

from transformers import AutoTokenizer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# NOTE: this rebinds `dataset` to the train split only. The first cell bound the same
# name to the un-split load_dataset() result, which expressions such as
# dataset['train'] rely on, so re-run that cell before going back to them.
dataset = load_dataset("yelp/yelp_review_full", split="train")

# The tokenizer returns a dictionary with the following three items:
# input_ids, token_type_ids, attention_mask.
# A bare expression in the middle of a cell is never displayed, so keep the result
# and print its keys explicitly.
sample_encoding = tokenizer(dataset[0]["text"])
print(list(sample_encoding.keys()))

# use .map() function to tokenize entire dataset, can also be called on a datasetDict
# use .set_format() to set VIEW to pytorch tensors

def tokenization(example):
  """Apply the notebook-level `tokenizer` to `example['text']` and return its output.

  Intended to be passed to `.map()`; `example` only needs to support
  lookup by the 'text' key.
  """
  texts = example['text']
  return tokenizer(texts)

# Slicing with dataset[:10] returns a dict, not a Dataset, so it cannot be used for
# method calls. Use .select() to get a new Dataset holding the first 10 rows instead.
subset = dataset.select(range(10))

# Tokenize the selected rows in batches.
subset = subset.map(tokenization, batched=True)

# Switch the listed columns to PyTorch format.
subset.set_format(type="torch", columns=['label', 'input_ids', 'token_type_ids', 'attention_mask'])





Map:   0%|          | 0/10 [00:00<?, ? examples/s]