# Outline

In [47]:
import torch
import transformers

classifier = transformers.pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598050713539124},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

# Behind the Pipeline API

4 Stage Pipeline
1. dataset
2. language tokenizer
3. pretrained_model
4. model output
![](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/full_nlp_pipeline.svg)

## 1. initialize tokenizer & model

In [48]:
raw_inputs = [
    "I love you so much",   # 5 Words
    "I hate you",            # 2 Words
    "hi",
]

In [49]:
tokenizer        = transformers. AutoTokenizer. from_pretrained("bert-base-cased")
tokenized_output = tokenizer(raw_inputs , padding=True , return_tensors="pt")    # Numeric ids => as PYTORCH TENSORS

print(tokenized_output['input_ids'])

tensor([[  101,   146,  1567,  1128,  1177,  1277,   102],
        [  101,   146,  4819,  1128,   102,     0,     0],
        [  101, 20844,   102,     0,     0,     0,     0]])


In [50]:
print(f'tokenizer returns multiple things things => {tokenized_output.keys()}')

index = 0
while index < len(tokenized_output['input_ids']):
    print(f"Row Number => {index+1}, \n\t input_ids \t=> { tokenized_output['input_ids'][index]}, \n\t attention_mask => {tokenized_output['attention_mask'][index]}")
    index = index + 1

tokenizer returns multiple things things => dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Row Number => 1, 
	 input_ids 	=> tensor([ 101,  146, 1567, 1128, 1177, 1277,  102]), 
	 attention_mask => tensor([1, 1, 1, 1, 1, 1, 1])
Row Number => 2, 
	 input_ids 	=> tensor([ 101,  146, 4819, 1128,  102,    0,    0]), 
	 attention_mask => tensor([1, 1, 1, 1, 1, 0, 0])
Row Number => 3, 
	 input_ids 	=> tensor([  101, 20844,   102,     0,     0,     0,     0]), 
	 attention_mask => tensor([1, 1, 1, 0, 0, 0, 0])


## 2. model

In [51]:
model_general         = transformers. AutoModel.                         from_pretrained("bert-base-cased")
model_classification  = transformers. AutoModelForSequenceClassification.from_pretrained("bert-base-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
classification_output = model_classification( **tokenized_output )

print(f'Model OUTPUT: {classification_output} ' )

Model OUTPUT: SequenceClassifierOutput(loss=None, logits=tensor([[-0.2201, -0.0825],
        [-0.2946, -0.1323],
        [-0.2332, -0.0325]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None) 


## 3. model output

In [53]:
predictions = torch.nn.functional.softmax(classification_output.logits, dim=-1)
print(predictions)

tensor([[0.4656, 0.5344],
        [0.4595, 0.5405],
        [0.4500, 0.5500]], grad_fn=<SoftmaxBackward0>)


# Complete Pipeline

In [54]:
import torch
import transformers

checkpoint = "bert-base-cased"
tokenizer  = transformers.AutoTokenizer.from_pretrained(checkpoint)
model      = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[ 0.0203,  0.9130],
        [-0.0072,  0.9640]], grad_fn=<AddmmBackward0>)
tensor([[0.2905, 0.7095],
        [0.2747, 0.7253]], grad_fn=<SoftmaxBackward0>)
{0: 'LABEL_0', 1: 'LABEL_1'}


In [None]:
raw_inputs = [
    "I love you so much",   # 5 Words
    "screw you",            # 2 Words
]
numeric_ids = tokenizer(raw_inputs , padding=True , return_tensors="pt")    # Numeric ids => as PYTORCH TENSORS

outputs = model(**numeric_ids)

print(outputs.logits)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

print(model.config.id2label)