In [25]:
# Based on the full Hugging Face NLP course:
# https://huggingface.co/learn/nlp-course/chapter0/1


![alt text](images/tokenizer.jpg)

In [1]:
from transformers import pipeline
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoModel

2024-04-12 15:36:01.340200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Recall: build a ready-made sentiment-analysis pipeline, naming the
# checkpoint explicitly instead of relying on the task's default model.
sa = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [3]:
# Same usage as in the previous notebook: hand the pipeline a list of raw
# strings and it returns one {'label', 'score'} dict per sentence.
raw_inputs = ["I really love this class!", "I hate long, boring lectures."]
sa(raw_inputs)

[{'label': 'POSITIVE', 'score': 0.9998805522918701},
 {'label': 'NEGATIVE', 'score': 0.9988584518432617}]

In [4]:
# Look inside the Pipeline configuration.
# The pipeline exposes its underlying model as `sa.model`; the model's `config`
# shows the architecture, the label mapping (id2label / label2id), the padding
# token id and the vocabulary size.
sa.model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.31.0",
  "vocab_size": 30522
}

In [6]:
# Let's create a tokenizer.
# AutoTokenizer is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Use the same checkpoint the pipeline was built from, so the tokenizer's
# vocabulary matches the model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load only the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
# Tokenize both sentences in a single call.
#   padding=True        -> pad the shorter sentence so the batch is rectangular
#   truncation=True     -> truncate inputs longer than the model accepts
#   return_tensors="pt" -> return PyTorch tensors
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Token ids for each sentence (0 is the padding id)
print("Tokenized sentences:", inputs["input_ids"])
# The attention mask marks real tokens with 1 and padding with 0, telling the
# network which positions to pay attention to
print("Attention tensor:", inputs["attention_mask"])

Tokenized sentences: tensor([[  101,  1045,  2428,  2293,  2023,  2465,   999,   102,     0],
        [  101,  1045,  5223,  2146,  1010, 11771,  8921,  1012,   102]])
Attention tensor: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [21]:
# Create just the model (no pipeline wrapper).
# AutoModel and AutoModelForSequenceClassification are already imported in the
# imports cell at the top of the notebook, so nothing is re-imported here.
# output_hidden_states / output_attentions ask the model to also return its
# hidden states and attention weights alongside the logits.
# Alternative left disabled: load the checkpoint through AutoModel instead.
#model = AutoModel.from_pretrained(checkpoint, output_hidden_states=True,output_attentions=True)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    output_hidden_states=True,
    output_attentions=True,
)

In [22]:
# Run inference: send the tokenized sentences through the model.
# `inputs` is the dict-like object returned by the tokenizer; ** unpacks it so
# each entry (e.g. input_ids, attention_mask) is passed as a keyword argument.
outputs = model(**inputs) # Give the model all of our inputs (** just means unpack the dictionary)
print("What is the type of the outputs?", type(outputs))
# Last expression of the cell, so the notebook displays it: the hidden states
# come back as a tuple.
type(outputs.hidden_states)
# Uncomment if you want to see the raw output from the model
#outputs.hidden_states

What is the type of the outputs? <class 'transformers.modeling_outputs.SequenceClassifierOutput'>


tuple

![alt text](images/heads.jpg)

In [23]:
# Raw model output: one row per sentence, one column per class. These are
# unnormalized scores (logits), not probabilities yet.
print(outputs.logits.shape, outputs.logits, sep="\n")

torch.Size([2, 2])
tensor([[-4.3496,  4.6831],
        [ 3.6689, -3.1053]], grad_fn=<AddmmBackward0>)


In [24]:
# Post-process the logits: softmax over the last dimension turns each row of
# raw scores into probabilities that sum to 1.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
# Print results
print('In this model:', model.config.id2label)
print('Raw Probability Predictions in the tensors:\n', predictions)
print('\n', raw_inputs,'\n')
# Column 0 is NEGATIVE and column 1 is POSITIVE, per the id2label mapping
# printed above.
for negative_prob, positive_prob in predictions.tolist():
    print("Negative Probability:", negative_prob, "Positive Probability:", positive_prob)

In this model: {0: 'NEGATIVE', 1: 'POSITIVE'}
Raw Probability Predictions in the tensors:
 tensor([[1.1943e-04, 9.9988e-01],
        [9.9886e-01, 1.1415e-03]], grad_fn=<SoftmaxBackward0>)

 ['I really love this class!', 'I hate long, boring lectures.'] 

Negative Probability: 0.00011942967830691487 Positivie Probability: 0.9998805522918701
Negative Probability: 0.9988584518432617 Positivie Probability: 0.0011415336048230529


![alt text](images/tokenizer.jpg)