<a href="https://colab.research.google.com/github/amir-asari/Introduction_to_Huggingface/blob/main/3_FineTuningaPretrainedModel/2_Processing_the_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Huggingface standard pipeline

**Standard Pipeline**
```python
import torch
import transformers as huggingface_transformers

checkpoint  = "bert-base-uncased"
# STEP 1: TOKENIZER. (WORD TO NUMERIC UNIQUE ID)
tokenizer   = huggingface_transformers.AutoTokenizer                        .from_pretrained(checkpoint)
# STEP 2: PRE TRAINED MODEL (NUM_ID to MEANING_VECTOR OF EACH WORD -> FURTHER PROCESSING)
model       = huggingface_transformers.AutoModelForSequenceClassification   .from_pretrained(checkpoint)

# STEP 3: LIST OF SENTENCES
input_sequences   = [
                "I've been waiting for a HuggingFace course my whole life.",
                "This course is amazing!",
              ]

# STEP 4: SENTENCE OF WORDS -> NUMERIC_IDS -> MODEL
tokenized_batch_tensors = tokenizer(input_sequences, padding=True, truncation=True, return_tensors="pt")

y_predicted = model ( **tokenized_batch_tensors )
print(y_predicted)

```

In [None]:
import torch
import datasets     as huggingface_datasets
import transformers as huggingface_transformers

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-uncased"

# Tokenizer matching that checkpoint; reused by the tokenization cells further down.
tokenizer_pretrained = huggingface_transformers.AutoTokenizer.from_pretrained(
    checkpoint
)

## Downloading & exploring Dataset

**Important NLP datasets that everyone should know**

In [None]:
# Load the MRPC configuration of the GLUE benchmark.
raw_datasets = huggingface_datasets.load_dataset(path="glue", name="mrpc")

In [None]:
# A bare last expression lets the notebook display the DatasetDict summary
# (splits, column names, row counts) without wrapping it in print().
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [None]:
# Bind each split of the DatasetDict to its own variable for easier exploration.
train = raw_datasets["train"]
validation = raw_datasets["validation"]
test = raw_datasets["test"]

In [None]:
# Display the training split's summary: its column names and number of rows.
train

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
# Integer indexing returns one example as a dict mapping column name -> value.
train[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
# Slicing returns a dict of columns, each holding a list with one entry per selected row.
train[:3]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale ."],
 'label': [1, 0, 1],
 'idx': [0, 1, 2]}

In [None]:
# vars() returns the instance's attribute dict, including private fields such as `_info`.
# NOTE(review): the output is very large; for routine inspection prefer a public
# accessor such as `train.features`.
vars(train)

{'_info': DatasetInfo(description='', citation='', homepage='', license='', features={'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='glue', config_name='mrpc', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=944761, num_examples=3668, shard_lengths=None, dataset_name='glue'), 'validation': SplitInfo(name='validation', num_bytes=105981, num_examples=408, shard_lengths=None, dataset_name='glue'), 'test': SplitInfo(name='test', num_bytes=442842, num_examples=1725, shard_lengths=None, dataset_name='glue')}, download_checksums={'hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/mrpc/train-00000-of-00001.parquet': {'num_bytes': 649281, 'checksum': None}, 'hf://datasets/glue@bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/

In [None]:
# Column schema: the type of each column, including the class names behind the integer `label`.
train.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## Tokenization with fixed padding

In [None]:
%time

# Tokenize, One by One. Slower
tokenized_dataset = tokenizer_pretrained( raw_datasets["train"]["sentence1"], raw_datasets["train"]["sentence2"],
                                    padding=True,   truncation=True )

tokenized_dataset.keys()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 1.91 µs


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Tokenization with fixed padding, accelerated with `map`

In [None]:
%time
# Tokenize via GPU parallelization operation `map`

def tokenize_function(example):
    tokenized_example = tokenizer_pretrained(  example["sentence1"], example["sentence2"],
                                        padding=True , truncation=True )
    return tokenized_example

# ACCELERATED / PARALLEL PROCESSING
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Details of TOKENIZED DATASET
tokenized_dataset.keys()

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 2.15 µs


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Tokenization with dynamic padding at the batch level

In [None]:
%time
# Tokenize via GPU parallelization operation `map`

def tokenize_function(example):
    # Removing static padding per batch
    tokenized_example = tokenizer_pretrained(  example["sentence1"], example["sentence2"],
                                        truncation=True )
    return tokenized_example

# ACCELERATED / PARALLEL PROCESSING
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

tokenized_dataset.keys()

data_collator       = huggingface_transformers.DataCollatorWithPadding(tokenizer= tokenizer_pretrained   )
train_dataloader    = torch.utils.data.DataLoader(
                                                    collate_fn = data_collator,             # Data Collator for dynamic padding inside pytorch
                                                    dataset = tokenized_datasets['train'])

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 1.91 µs


Map:   0%|          | 0/408 [00:00<?, ? examples/s]