# HuggingFace Transformers
This notebook will use the Hugging Face API to interact with different Transformer achitectures.



In [1]:
from transformers import pipeline

# Sentiment analysis

In [2]:
#Create instance of the sentiment analysis pipeline
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
input = ["You are such a nice person", "You are so lazy, you should really do something good for the world instead"]
classifier(input)

[{'label': 'POSITIVE', 'score': 0.9998427629470825},
 {'label': 'NEGATIVE', 'score': 0.9979380965232849}]

# Text Classification

In [4]:
#Create an instance of the classifier pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [5]:
input = "You should do one more repetition, you can do it!"
candidate_labels = ["politics", "training", "education", "cooking"]

classifier(input, candidate_labels=candidate_labels,)


{'sequence': 'You should do one more repetition, you can do it!',
 'labels': ['training', 'education', 'cooking', 'politics'],
 'scores': [0.8017725348472595,
  0.16816730797290802,
  0.02121736854314804,
  0.008842849172651768]}

In [6]:

oracle = pipeline(model="facebook/bart-large-mnli")

oracle(
    "I have a problem with my iphone that needs to be resolved asap!!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5036354064941406,
  0.4787999987602234,
  0.012600099667906761,
  0.0026557904202491045,
  0.0023087533190846443]}

# Text generation

In [7]:
generator = pipeline("text-generation", model='gpt2')

In [8]:
prompt = "Hello, I would like to know the meaning of life. The meaning of life is"
generator(prompt, max_length=142, num_return_sequences=1)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I would like to know the meaning of life. The meaning of life is always an endless life. That is my life. The meaning of life is that in an unlimited and endless life there exists life that can only be achieved through hard work. For every man who will come back at the age of ninety-nine, every man who will come to freedom within twelve months, and every man who would come to everlasting happiness, there will exist life that no man can live without. There is a life that is eternal. That is the life that is eternal. But some must be saved and there isn't one of you who has the ability to live, for there are many who can have an infinite"}]

# Named entity recognition

In [9]:
model = pipeline("ner", grouped_entities=True)
text = 'My dog is named Pluto, and he wants to play in the park Pildammarna in Malmö'

model(text)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.87922454,
  'word': 'Pluto',
  'start': 16,
  'end': 21},
 {'entity_group': 'LOC',
  'score': 0.9412043,
  'word': 'Pildammarna',
  'start': 56,
  'end': 67},
 {'entity_group': 'LOC',
  'score': 0.9957456,
  'word': 'Malmö',
  'start': 71,
  'end': 76}]

# Question and answering

In [10]:
model = pipeline("question-answering")
context = 'I have a dog who is ten years old. I got my kitten when my dog was two. They like to play together.'
question = 'How old is my cat?'
model(question=question, context=context)


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.4931827187538147,
 'start': 20,
 'end': 33,
 'answer': 'ten years old'}

# Translation

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [38]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
#text = "The house is wonderful."
text = "I love the summer, it smells so wonderful!"
input_ids = tokenizer(f'translate English to German: {text}', return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_new_tokens=142)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

Ich liebe den Sommer, es riecht so wunderbar!


In [45]:
text = "Ich liebe den Sommer, es riecht so wunderbar!!"
input_ids = tokenizer(f'Translate to English: {text}', return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_new_tokens=142)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

Ich liebe den Sommer, es riecht so wunderbar!!


# Behind the pipeline
The pipeline does a lot of stuff for us:
- Preprocessing inputs
- Running the model
- Postprocessing output

Let's try and do the same step by step

In [55]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
raw_inputs =  [
        "I just want to be happy.",
        "I hate sallad",
    ]

#Using the pipeline
classifier = pipeline("sentiment-analysis", model=model_name)
classifier(raw_inputs)   

[{'label': 'POSITIVE', 'score': 0.9998676776885986},
 {'label': 'NEGATIVE', 'score': 0.9972905516624451}]

In [61]:
# Using the model directly and performing the steps manually
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  2074,  2215,  2000,  2022,  3407,  1012,   102],
        [  101,  1045,  5223, 16183, 27266,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0]])}


In [91]:
#Look at the raw tokens
input_tokens = tokenizer.tokenize(raw_inputs)
print(input_tokens)

['i', 'just', 'want', 'to', 'be', 'happy', '.', 'i', 'hate', 'sal', '##lad']


In [64]:
#This is the base model
from transformers import AutoModel
model = AutoModel.from_pretrained(model_name)
outputs = model(**inputs)

In [65]:
print(outputs.last_hidden_state.shape)

torch.Size([2, 9, 768])


In [66]:
#This is the model for sequence classification
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name)

#Get the outputs as logits
outputs = model(**inputs)

In [69]:
print(outputs.logits.shape)
outputs.logits

torch.Size([2, 2])


tensor([[-4.2892,  4.6415],
        [ 3.1981, -2.7102]], grad_fn=<AddmmBackward0>)

In [72]:
#Normalize them to probabilities
import torch.nn.functional as F
import torch
probs = F.softmax(outputs.logits, dim=-1)
print(probs.shape)

#Should be the same as the pipeline
probs

torch.Size([2, 2])


tensor([[1.3226e-04, 9.9987e-01],
        [9.9729e-01, 2.7094e-03]], grad_fn=<SoftmaxBackward0>)

In [82]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [84]:
import numpy as np
ids = np.array(torch.argmax(probs, dim=-1) )
for i, id in enumerate(ids):
    print(f"Input: {raw_inputs[i]}")
    print(f"Predicted label: {model.config.id2label[id]}")
    print(f"Predicted probability: {probs[i][id]}")
    print()

Input: I just want to be happy.
Predicted label: POSITIVE
Predicted probability: 0.9998676776885986

Input: I hate sallad
Predicted label: NEGATIVE
Predicted probability: 0.9972905516624451



# Create transformer models from a blue print
The config contain all that is needed to create a model like BERT, BART, GPT etc
We can initialize any model randomly using the config.
Then we can download a checkpoint to fill in the model weights, or train on our own like any PyTorch model

In [86]:
from transformers import BertConfig, BertModel

# Building the config
config = BertConfig()

# Building the model from the config (random initialization)
model = BertModel(config)

print(config)
print(type(model))

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.36.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

<class 'transformers.models.bert.modeling_bert.BertModel'>


In [89]:
model.save_pretrained("../models/bert-base-uncased_raw")