In [1]:
# Install dependencies

# %pip install transformers datasets tokenizers torch sacremoses

# Inputs: Tokenizing

The first step towards using NLP transformer models is to tokenize the inputs.

Each model is paired with a tokenizer model. They go hand-in-hand because the initial embeddings are tied to the vocabulary supported by the tokenizer.

Fortunately, we can fetch the appropriate tokenizer using just the model name.

In [2]:
# The Auto* classes pick the right implementation from a checkpoint name
from transformers import AutoTokenizer

# Fetch the tokenizer paired with the BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Text to tokenize
input_sequence = "Hello, how are you today?"

# Text -> token ids, with the special tokens ([CLS] / [SEP]) included
token_ids = tokenizer.encode(input_sequence, add_special_tokens=True)

# Token ids -> token strings, and token ids -> text
tokens = tokenizer.convert_ids_to_tokens(token_ids)
decoded_input = tokenizer.decode(token_ids)

# Show the ids, the tokens they stand for, and the decoded text
print("Token ids:", token_ids)
print("Tokens:", tokens)
print("Decoded input:", decoded_input)


  from .autonotebook import tqdm as notebook_tqdm


Token ids: [101, 7592, 1010, 2129, 2024, 2017, 2651, 1029, 102]
Tokens: ['[CLS]', 'hello', ',', 'how', 'are', 'you', 'today', '?', '[SEP]']
Decoded input: [CLS] hello, how are you today? [SEP]


In [3]:
# A longer sentence, to see how less common words are split into sub-word pieces
input_sequence = "The One Ring, forged in the fires of Mount Doom, holds immense power and corrupts those who wield it, leading to the downfall of many kingdoms."

# Two explicit steps this time: split the text into tokens, then look up
# the vocabulary id of each token
tokens = tokenizer.tokenize(input_sequence)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Tokens:", tokens)
print("Token IDs:", input_ids)

Tokens: ['the', 'one', 'ring', ',', 'forged', 'in', 'the', 'fires', 'of', 'mount', 'doom', ',', 'holds', 'immense', 'power', 'and', 'corrupt', '##s', 'those', 'who', 'wi', '##eld', 'it', ',', 'leading', 'to', 'the', 'downfall', 'of', 'many', 'kingdoms', '.']
Token IDs: [1996, 2028, 3614, 1010, 16158, 1999, 1996, 8769, 1997, 4057, 12677, 1010, 4324, 14269, 2373, 1998, 13593, 2015, 2216, 2040, 15536, 14273, 2009, 1010, 2877, 2000, 1996, 22252, 1997, 2116, 12028, 1012]


Transformers usually require more than just the token identifiers. The tokenizer will help you marshal the data appropriately.

In [4]:
# Prepare the input for the transformer
from pprint import pprint

# Calling the tokenizer directly returns everything the model consumes; for
# this BERT tokenizer that is input_ids, token_type_ids and attention_mask.
# The result is bound to `encoded_input` rather than `input` so that the
# builtin `input()` is not shadowed.
encoded_input = tokenizer("I'm so excited about my new job!")
pprint(encoded_input)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               1045,
               1005,
               1049,
               2061,
               7568,
               2055,
               2026,
               2047,
               3105,
               999,
               102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


# Encoding the input with a model

The next step is actually running a model with the input produced by a tokenizer.

Let's try a sentiment classification model. It will predict whether a phrase's sentiment is `positive`, `neutral`, or `negative`.

In [5]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Checkpoint identifier on the HuggingFace Hub
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Everything is resolved from the same checkpoint name: the
# sequence-classification model, its tokenizer, and its configuration
# (the configuration's id2label mapping is used further down)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# Sentence to classify; swap in one of the alternatives to compare results
# text = "I'm so excited about my new job!"
# text = "I'm feeling really down today."
text = "I'm going to the store."

# return_tensors="pt" asks for PyTorch tensors instead of plain lists.
# NOTE: `input` shadows the builtin; the name is kept because the cell that
# runs the model reads it.
input = tokenizer(text, return_tensors="pt")

# Unlike the BERT tokenizer, this one does not emit token_type_ids
pprint(input)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[   0,  100,  437,  164,    7,    5, 1400,    4,    2]])}


Run the transformer

In [6]:
import numpy as np
import torch
from scipy.special import softmax

# Run the transformer. This is inference only, so gradient tracking is
# switched off: no autograd graph is built and the logits can be converted
# to NumPy without an explicit detach().
with torch.no_grad():
    output = model(**input)
pprint(output)

# Read the logits by name rather than by position, take the row that belongs
# to our single sentence, and normalize it so the scores represent
# probabilities
scores = output.logits[0].numpy()
scores = softmax(scores)

print("Probabilities:", scores)

SequenceClassifierOutput(loss=None,
                         logits=tensor([[-1.9199,  1.5211,  0.1629]], grad_fn=<AddmmBackward0>),
                         hidden_states=None,
                         attentions=None)
Probabilities: [0.02484636 0.7757175  0.19943619]


How to go from probabilities to choosing a predicted label

In [7]:
# Make sense of the output
# argsort is ascending, so flip it to get the class indices ordered from the
# most to the least likely
ranking = np.argsort(scores)[::-1]
# Print each class' label and score, best first
for i, class_id in enumerate(ranking):
    l = config.id2label[class_id]
    s = scores[class_id]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) neutral 0.7757
2) positive 0.1994
3) negative 0.0248


# Streamline using pipelines

Using transformers can be a bit involved. Pipelines allow us to streamline the interface by abstracting away the details of each "task"

In [8]:
import itertools as it

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Same sentiment checkpoint as before, loaded again for use in a pipeline
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model and its tokenizer in a text-classification pipeline
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Example tweets, grouped by the sentiment we expect the model to predict
positive_tweets = [
    "I'm so excited about my new job!",
    "This is the best pizza I've ever had!",
    "I love spending time with my family.",
    "The weather is beautiful today.",
    "I just got a promotion!"
]

negative_tweets = [
    "I'm so tired and stressed.",
    "I hate traffic jams.",
    "My computer is broken.",
    "I lost my keys.",
    "I'm feeling really down today."
]

neutral_tweets = [
    "It's raining outside.",
    "I'm watching a movie.",
    "I just ate lunch.",
    "I'm going to the store.",
    "I'm listening to music."
]

# Classify the tweets one at a time; the pipeline returns a list with one
# {'label', 'score'} dict for the text it was given
all_tweets = it.chain(positive_tweets, negative_tweets, neutral_tweets)
for text in all_tweets:
    result = classifier(text)
    print(f"Text: {text}")
    print(f"Label: {result[0]['label']}, Score: {result[0]['score']}\n")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


Text: I'm so excited about my new job!
Label: positive, Score: 0.9876318573951721

Text: This is the best pizza I've ever had!
Label: positive, Score: 0.9853105545043945

Text: I love spending time with my family.
Label: positive, Score: 0.9865761995315552

Text: The weather is beautiful today.
Label: positive, Score: 0.9844391942024231

Text: I just got a promotion!
Label: positive, Score: 0.9725865721702576

Text: I'm so tired and stressed.
Label: negative, Score: 0.8532361388206482

Text: I hate traffic jams.
Label: negative, Score: 0.9240683913230896

Text: My computer is broken.
Label: negative, Score: 0.8859074711799622

Text: I lost my keys.
Label: negative, Score: 0.6730406880378723

Text: I'm feeling really down today.
Label: negative, Score: 0.9055712223052979

Text: It's raining outside.
Label: neutral, Score: 0.6317098140716553

Text: I'm watching a movie.
Label: neutral, Score: 0.749224841594696

Text: I just ate lunch.
Label: neutral, Score: 0.5950841903686523

Text: I'm 

# Batching by using a GPU

Leverage GPU computing throughput through batching. Supported by pipelines.

In [10]:
import random

import torch

# Collect every example tweet into one list and shuffle it. A dedicated,
# seeded generator keeps the order identical on every run.
batch = positive_tweets + negative_tweets + neutral_tweets
random.Random(42).shuffle(batch)

# Pick the accelerator that is actually available. The previous device=-1 is
# the pipeline's CPU sentinel, which contradicts the goal of this section.
if torch.cuda.is_available():
  device = "cuda"
elif torch.backends.mps.is_available():
  device = "mps"
else:
  device = "cpu"

# Create a pipeline for text classification, now on the selected device
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

# Pipelines do not batch by default, even when handed a list; batch_size makes
# the model process several texts per forward pass.
results = classifier(batch, batch_size=8)

# One result dict per input text, in the same order as `batch`
for tweet, prediction in zip(batch, results):
  print("Input:", tweet)
  print("Output:", prediction)
  print()

Device set to use mps:0


Input: I'm going to the store.
Output: {'label': 'neutral', 'score': 0.7757179141044617}

Input: I'm so excited about my new job!
Output: {'label': 'positive', 'score': 0.9876318573951721}

Input: I hate traffic jams.
Output: {'label': 'negative', 'score': 0.9240683913230896}

Input: This is the best pizza I've ever had!
Output: {'label': 'positive', 'score': 0.9853105545043945}

Input: I lost my keys.
Output: {'label': 'negative', 'score': 0.6730406880378723}

Input: It's raining outside.
Output: {'label': 'neutral', 'score': 0.6317098140716553}

Input: My computer is broken.
Output: {'label': 'negative', 'score': 0.8859074711799622}

Input: The weather is beautiful today.
Output: {'label': 'positive', 'score': 0.9844391942024231}

Input: I just ate lunch.
Output: {'label': 'neutral', 'score': 0.5950841903686523}

Input: I love spending time with my family.
Output: {'label': 'positive', 'score': 0.9865761995315552}

Input: I'm feeling really down today.
Output: {'label': 'negative', '

# Other tasks

There are many different tasks available through transformers. Here are some examples.

## Named Entity Recognition

Word-level classification

In [14]:
model_name = "elastic/distilbert-base-uncased-finetuned-conll03-english"

# aggregation_strategy="simple" makes the pipeline merge the B-/I- tagged
# tokens of one entity into a single result carrying `entity_group` and
# `word`. This replaces the hand-written grouping, which printed a spurious
# "None" line before the first entity and joined words inconsistently.
# No device is hard-coded, so the cell also runs on machines without MPS.
classifier = pipeline("ner", model=model_name, aggregation_strategy="simple")
outputs = classifier([
    "The Data Science Institute is holding AI-related workshops for the university community.",
    "Enrique and Carlos are hosting a workshop about HuggingFace on September"
    ])

# One list of entities per input sentence; print "<type> <text>" per entity
# and a blank line between sentences
for sentence_entities in outputs:
  for entity in sentence_entities:
    print(entity["entity_group"], entity["word"])
  print()

KeyboardInterrupt: 

## Machine translation
Sequence to sequence generation

In [None]:
# Translation pipeline built on the opus-mt-es-en checkpoint. No device is
# hard-coded: device="cuda" fails on machines without CUDA, and the sentiment
# pipeline above picked its device without being told one.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

text1 = "Me gusta aprender sobre inteligencia artificial."
text2 = "Tengo ganas de comer una hamburguesa."

# Several sentences can be passed in a single call as a list
text = [text1, text2]
translated = translator(text)

pprint(translated)

## Summarization
Sequence to sequence generation

In [None]:
from transformers import pipeline

# Summarization pipeline. No device is hard-coded: device="cuda" fails on
# machines without CUDA, and the sentiment pipeline above picked its device
# without being told one.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Sample article to summarize (also reused by the composed pipeline below)
ARTICLE = """ New York, NY – A fierce competition is brewing among tech giants as they vie to lead the burgeoning artificial intelligence (AI) market. Companies like Google, Microsoft, Meta, and Amazon are investing heavily in research and development, aiming to create AI systems that can revolutionize industries from healthcare to transportation.
AI has the potential to transform society in unprecedented ways. From developing new medical treatments to automating complex tasks, AI applications are becoming increasingly sophisticated. However, the rapid advancement of AI also raises concerns about ethical implications, job displacement, and the potential for misuse.
Google, a pioneer in AI research, has made significant strides in recent years. Its AI-powered search engine and Google Assistant have become ubiquitous in everyday life. Microsoft, not to be outdone, has invested heavily in AI through its acquisition of OpenAI, the company behind the popular language model GPT-3.
Meta, formerly known as Facebook, is also making significant investments in AI. The company's AI research focuses on areas such as natural language processing, computer vision, and recommendation systems. Amazon, while primarily known for its e-commerce platform, has been leveraging AI to improve its customer experience and optimize its logistics operations.
As the competition intensifies, the tech giants are facing increasing pressure to ensure that their AI systems are developed and deployed responsibly. Concerns about bias, privacy, and the potential for AI to be used for malicious purposes are becoming more prominent.
The future of AI is uncertain, but one thing is clear: the race to dominate this emerging market is far from over. The tech giants that are able to successfully navigate the challenges and opportunities presented by AI will likely reap significant rewards for years to come.
"""

# do_sample=False keeps generation deterministic; min_length / max_length
# bound the length of the generated summary
pprint(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))


## Extractive question answering
Span prediction

In [None]:
model_name = "deepset/roberta-base-squad2"

# Extractive QA pipeline. No device is hard-coded: device="cuda" fails on
# machines without CUDA, and the sentiment pipeline above picked its device
# without being told one.
oracle = pipeline('question-answering', model=model_name)

# The question and its context are passed as explicit keyword arguments
res = oracle(
    question="What is Neo's goal?",
    context="""The Matrix is a science fiction film set in a dystopian future where humanity is unknowingly trapped in a simulated reality created by intelligent machines.
    The machines use humans as a power source and keep them unaware of their true situation.The story follows Neo, a computer hacker who is chosen to free humanity from this simulated world, known as the Matrix.
    He is guided by a group of rebels who believe they can overthrow the machines and liberate humanity.""",
)

pprint(res)

# Composing pipelines

Given that pipelines are high level abstractions, we can compose them into more complex, multi-step pipelines fairly easily.

In [None]:
# Translation pipeline built on the opus-mt-en-es checkpoint; note that this
# rebinds `translator`. No device is hard-coded: device="cuda" fails on
# machines without CUDA, and the sentiment pipeline above picked its device
# without being told one.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")

def combined_pipeline(text):
  """Summarize ``text`` and return the translation of that summary.

  Relies on two notebook-level pipelines: ``summarizer`` (created in the
  summarization cell) and ``translator`` (created just above).

  Args:
    text: The document to summarize.

  Returns:
    The ``translation_text`` of the first translation result, obtained by
    translating the ``summary_text`` of the first summarization result.
  """
  summary = summarizer(text, max_length=60, min_length=30, do_sample=False)
  translation = translator(summary[0]['summary_text'])
  return translation[0]['translation_text']

combined_pipeline(ARTICLE)