## AutoModel class for more flexibility

In [50]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [53]:
# ------------------------------
# Load the tokenizer and pre-trained model
# ------------------------------
model_name = "textattack/distilbert-base-uncased-SST-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# ------------------------------
# Tokenize inputs and pass them to the model for inference
# ------------------------------
inputs = tokenizer(text, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits = outputs.logits

predicted_classes = torch.argmax(logits, dim=1).tolist()

# print the predicted probabilities and class names
probs = torch.nn.functional.softmax(logits, dim=1)

for i, prob in enumerate(probs):
    print(f"Predicted probabilities for \"{text[i]}\": {prob}")
    print(f"Predicted class name for \"{text[i]}\": {model.config.id2label[predicted_classes[i]]}")



Predicted probabilities for "The best movie I've ever watched!": tensor([0.4189, 0.5811], grad_fn=<UnbindBackward0>)
Predicted class name for "The best movie I've ever watched!": LABEL_1
Predicted probabilities for "What an awful movie. I regret watching it.": tensor([0.8513, 0.1487], grad_fn=<UnbindBackward0>)
Predicted class name for "What an awful movie. I regret watching it.": LABEL_0


## What does the tokenizer class do

In [58]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

print(tokenizer(text, return_tensors="pt", padding=True)  ) # padding=True adds padding tokens to the input
# either max_length or padding can be used to specify the maximum length of the input, but not both


tokens = tokenizer.tokenize(text[0])
print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

decoded = tokenizer.decode(ids)
print(decoded)

# attention mask = 0 means ignore the token, 1 means consider the token
# padding mask = 0 means the token is a padding token, 1 means it is not

# decoding removes the capitalization and adds spaces between tokens
# notice two tokens are added at the beginning and end of the sentence
# [CLS] and [SEP] tokens are added to the beginning and end of the sentence


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[ 101, 1996, 2190, 3185,  102],
        [ 101, 2054, 2019, 9643,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}
['the', 'best', 'movie', 'i', "'", 've', 'ever', 'watched', '!']
[1996, 2190, 3185, 1045, 1005, 2310, 2412, 3427, 999]
the best movie i've ever watched!


## pipeline vs automodel
Sentiment analysis with and without pipeline

In [10]:
# --------------------
# Import pipeline
# --------------------
from transformers import pipeline

# --------------------
# Create the task pipeline
# --------------------
model = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model)
task_pipeline = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", tokenizer=tokenizer)

text = "this a non sentence. I am not sure what to do with it."

# --------------------
# Predict the sentiment
# --------------------
task_output = task_pipeline(text)

print(f"Sentiment from task_pipeline: {task_output[0]['label']}, Confidence from task_pipeline: {task_output[0]['score']}")

Sentiment from task_pipeline: NEGATIVE, Confidence from task_pipeline: 0.9996139407157898


In [26]:
# ----------------------------------------
# Now using the tokenizer to tokenize the text
# ----------------------------------------

X_train = ["I love this movie because it is meant for me!", 
"I hate this movie.  It did not have a sequence or a plot.", 
"I don't know how I feel about this movie."]

tokens = tokenizer.tokenize(X_train[2])
print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

decoded = tokenizer.decode(ids)
print(decoded)

['i', 'don', "'", 't', 'know', 'how', 'i', 'feel', 'about', 'this', 'movie', '.']
[1045, 2123, 1005, 1056, 2113, 2129, 1045, 2514, 2055, 2023, 3185, 1012]
i don't know how i feel about this movie.


In [34]:

X_train = ["I love this movie because it is meant for me!", 
"I hate this movie.  It did not have a sequence or a plot.", 
"I don't know how I feel about this movie."]

print(task_pipeline(X_train))

# play with the parameters of the tokenizer
batch = tokenizer(X_train, padding=True, truncation=True,max_length = 20, return_tensors="pt")
print(batch)

[{'label': 'POSITIVE', 'score': 0.9998345375061035}, {'label': 'NEGATIVE', 'score': 0.9997679591178894}, {'label': 'NEGATIVE', 'score': 0.9971460700035095}]
{'input_ids': tensor([[ 101, 1045, 2293, 2023, 3185, 2138, 2009, 2003, 3214, 2005, 2033,  999,
          102,    0,    0,    0,    0],
        [ 101, 1045, 5223, 2023, 3185, 1012, 2009, 2106, 2025, 2031, 1037, 5537,
         2030, 1037, 5436, 1012,  102],
        [ 101, 1045, 2123, 1005, 1056, 2113, 2129, 1045, 2514, 2055, 2023, 3185,
         1012,  102,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


In [33]:
# ------------------------------------------------------------------------------------------------------------------------
# Using AutoModelForSequenceClassification to load the model and pass the batch to the model for inference
# ------------------------------------------------------------------------------------------------------------------------


model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

batch = tokenizer(X_train, padding=True, truncation=True,max_length = 20, return_tensors="pt")
print(batch)

with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=1)
    print(probs)



{'input_ids': tensor([[ 101, 1045, 2293, 2023, 3185, 2138, 2009, 2003, 3214, 2005, 2033,  999,
          102,    0,    0,    0,    0],
        [ 101, 1045, 5223, 2023, 3185, 1012, 2009, 2106, 2025, 2031, 1037, 5537,
         2030, 1037, 5436, 1012,  102],
        [ 101, 1045, 2123, 1005, 1056, 2113, 2129, 1045, 2514, 2055, 2023, 3185,
         1012,  102,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-4.1754,  4.5310],
        [ 4.6287, -3.7397],
        [ 3.2484, -2.6078]]), hidden_states=None, attentions=None)
tensor([[1.6549e-04, 9.9983e-01],
        [9.9977e-01, 2.3204e-04],
        [9.9715e-01, 2.8539e-03]])


## Summary Comparison of AutoModels and Pipeline

In [46]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Using the pipeline class
def use_pipeline():
    # Create a sentiment analysis pipeline
    sentiment_pipeline = pipeline("sentiment-analysis")

    # Define some example sentences
    sentences = [
        "I love this movie! It's fantastic.",
        "The book was okay, but not great.",
        "I had a terrible experience at the restaurant."
    ]

    # Use the pipeline to analyze sentiment
    results = sentiment_pipeline(sentences)

    # Print the results
    for result in results:
        print(f"Sentence: {result['label']}, Sentiment: {result['score']:.2f}")

# Using the AutoModel class
def use_automodel():
    # Load the pre-trained model and tokenizer
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer1 = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Define some example sentences
    sentences = [
        "I love this movie! It's fantastic.",
        "The book was okay, but not great.",
        "I had a terrible experience at the restaurant."
    ]

    # Tokenize the sentences
    encoded_inputs = tokenizer1(sentences, padding=True, truncation=True, return_tensors="pt")

    # Make predictions
    outputs = model(**encoded_inputs)
    predicted_labels = outputs.logits.argmax(dim=1)

    # Print the results
    for sentence, label in zip(sentences, predicted_labels):
        sentiment = "Positive" if label == 1 else "Negative"
        print(f"Sentence: {sentence}, Sentiment: {sentiment}")

# Run the examples
print("Using the pipeline class:")
use_pipeline()

print("\nUsing the AutoModel class:")
use_automodel()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Using the pipeline class:
Sentence: POSITIVE, Sentiment: 1.00
Sentence: NEGATIVE, Sentiment: 1.00
Sentence: NEGATIVE, Sentiment: 1.00

Using the AutoModel class:
Sentence: I love this movie! It's fantastic., Sentiment: Positive
Sentence: The book was okay, but not great., Sentiment: Negative
Sentence: I had a terrible experience at the restaurant., Sentiment: Negative


In [49]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# dir(tokenizer)[-40:]
sentences = [
        "I love this movie! It's fantastic.",
        "The book was okay, but not great.",
        "I had a terrible experience at the restaurant."
    ]
# Tokenize the sentences
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt",   max_length = 15, )
encoded_inputs


{'input_ids': tensor([[  101,  1045,  2293,  2023,  3185,   999,  2009,  1005,  1055, 10392,
          1012,   102],
        [  101,  1996,  2338,  2001,  3100,  1010,  2021,  2025,  2307,  1012,
           102,     0],
        [  101,  1045,  2018,  1037,  6659,  3325,  2012,  1996,  4825,  1012,
           102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [45]:
from transformers import AutoTokenizer
help(AutoTokenizer)

Help on class AutoTokenizer in module transformers.models.auto.tokenization_auto:

class AutoTokenizer(builtins.object)
 |  This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
 |  created with the [`AutoTokenizer.from_pretrained`] class method.
 |  
 |  This class cannot be instantiated directly using `__init__()` (throws an error).
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False)
 |      Register a new tokenizer in this mapping.
 |      
 |      
 |      Args:
 |          config_class ([`PretrainedConfig`]):
 |              The configuration corresponding to the model to register.
 |          slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
 |              The slow tokenizer to register.
 |          fast_tokenizer_class ([`PretrainedTokeni