In [1]:
## Suppress logging
import warnings
warnings.filterwarnings('ignore')
import logging
import re
def set_global_logging_level(level=logging.ERROR, prefices=("",)):
    """
    Override logging levels of different modules based on their name as a prefix.

    Only loggers that already exist when this function runs are touched, so it
    needs to be invoked after the modules have been loaded and their loggers
    have been initialized.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: a single str prefix, or an iterable of str prefices, to match
          (e.g. ["transformers", "torch"]). Optional.
          Default is `("",)` to match all active loggers; an empty iterable is
          treated the same way.
          The match is a literal, case-sensitive `module_name.startswith(prefix)`;
          characters such as "." are NOT interpreted as regex syntax.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(prefices, str):
        prefices = (prefices,)
    # str.startswith accepts a tuple of candidates; fall back to "" (match all)
    # when no prefix was supplied.
    prefixes = tuple(prefices) or ("",)
    # Iterate over a snapshot of the registry rather than the live dict.
    for name in list(logging.root.manager.loggerDict):
        if name.startswith(prefixes):
            logging.getLogger(name).setLevel(level)


# NOTE: loggers belonging to libraries that are imported in later cells do not
# exist yet at this point, so this call cannot lower their level. Re-run it
# after those imports to silence them as well.
set_global_logging_level(logging.ERROR)

In [46]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer


## Pipeline 

In [3]:
# Zero-shot classification without naming a model: the library chooses one
# itself and reports its choice in the message printed below the cell.
classifier = pipeline("zero-shot-classification")
sequence = "This compound is sythesised using polymerization reactions"
topics = ["physics", "chemistry", "biology"]
classifier(sequence, candidate_labels=topics)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'sequence': 'This compound is sythesised using polymerization reactions',
 'labels': ['chemistry', 'physics', 'biology'],
 'scores': [0.9724298715591431, 0.018467111513018608, 0.009102925658226013]}

In [4]:
# Same zero-shot task, this time naming the checkpoint explicitly.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)
sequence = "This compound is sythesised using polymerization reactions"
topics = ["physics", "chemistry", "biology"]
classifier(sequence, candidate_labels=topics)

{'sequence': 'This compound is sythesised using polymerization reactions',
 'labels': ['chemistry', 'physics', 'biology'],
 'scores': [0.9724298715591431, 0.018467111513018608, 0.009102925658226013]}

In [14]:
# Pipeline under the hood: tokenize the text, run the model, turn logits into
# probabilities.

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# Request PyTorch tensors; the printed result shows the shorter sentence
# filled with 0 ids and a 0 attention mask over those positions.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# This is inference only, so skip autograd bookkeeping: the outputs carry no
# grad_fn and no computation graph is kept alive.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)
# Softmax over the last dimension gives one probability per class per input.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[4.0195e-02, 9.5981e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [12]:
# Show the label name that belongs to each class index of the model's output.
model.config.id2label


{0: 'NEGATIVE', 1: 'POSITIVE'}

## Fine-tuning

In [16]:
# Load data: the "mrpc" subset of the "glue" dataset.
# Displaying the result lists each split with its columns and row count.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Using the latest cached version of the module from /Users/anyangpeng/.cache/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Sat Jul  1 19:21:20 2023) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [21]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` entries of `example` as pairs, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Apply the tokenizer to every split, in batches, using two worker processes.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=2,
)
tokenized_datasets

Map (num_proc=2):   0%|          | 0/3668 [00:00<?, ? examples/s]

Map (num_proc=2): 100%|██████████| 3668/3668 [00:00<00:00, 5118.93 examples/s]
Map (num_proc=2): 100%|██████████| 408/408 [00:00<00:00, 2158.17 examples/s]
Map (num_proc=2): 100%|██████████| 1725/1725 [00:00<00:00, 4515.59 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [39]:
# Number of token ids in each of the first eight training examples.
list(map(len, tokenized_datasets["train"][:8]["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [43]:
# Collate the first eight training examples into one batch and inspect the
# resulting tensor shapes.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    # Drop the index and raw-text columns before collating.
    if column not in ("idx", "sentence1", "sentence2")
}
batch = data_collator(samples)
{column: tensor.shape for column, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [48]:
# Only the output directory is specified; every other training option keeps
# its library default.
training_args = TrainingArguments(output_dir="demo_training")

# Load the checkpoint again with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
trainer.train()


                                     
  0%|          | 0/3 [04:05<?, ?it/s]             

{'loss': 0.5341, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                     
  0%|          | 0/3 [07:51<?, ?it/s]              

{'loss': 0.2836, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                     
100%|██████████| 1377/1377 [10:22<00:00,  2.21it/s]

{'train_runtime': 622.5992, 'train_samples_per_second': 17.674, 'train_steps_per_second': 2.212, 'train_loss': 0.3370742465416071, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.3370742465416071, metrics={'train_runtime': 622.5992, 'train_samples_per_second': 17.674, 'train_steps_per_second': 2.212, 'train_loss': 0.3370742465416071, 'epoch': 3.0})