# Getting familiar with using T5
Text-to-text transfer transformer.
  * https://github.com/google-research/text-to-text-transfer-transformer
  * https://huggingface.co/docs/transformers/index
  


In [1]:
#@title Import libraries and load models.
!pip install transformers
!pip install sentencepiece

from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/7.2 MB[0m [31m19.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m5.4/7.2 MB[0m [31m70.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m74.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m74.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [24]:
#@title Try out text summarization.
def summarize(text):
  inputs = tokenizer.encode(
      "summarize: " + text,
      return_tensors='pt',
      max_length=512,
      truncation=True)
  summarization_ids = model.generate(inputs, max_length=200, min_length=100, length_penalty=2., num_beams=4)
  return tokenizer.decode(summarization_ids[0])

movie_review = ("""
Review of The Dark Knight Rises. Christopher Nolan brings yet another adrenaline-filled, comic-inspired movie to the big screen.
We see all sorts of familiar faces this time around, but the audience is introduced to a few new characters as well.
When crisis threatens Gotham City, Bruce Wayne jumps back into the Batmobile to fight crime. Batman is joined on his quest by
an eager orphaned cop (Joseph Gordon-Levitt), a seductive cat burglar (Anne Hathaway), and a violent masked villain (Tom Hardy).
This film served as great entertainment with its colorful cast and numerous plot twists. Nolan used actors that had either
appeared in previous Batman films or in his blockbuster hit Inception, and all of them shone in their respective roles:
Tom Hardy was almost unrecognizable in his Bane costume, while Joseph Gordon-Levitt and Marion Cotillard were both
excellent—and obviously comfortable with Nolan’s directing style and the film’s dramatic tone.
The one actor that gave this reviewer pause was Anne Hathaway as Selina Kyle. She has historically been typecast as the
girl next door, so it was a shock to watch her steal and fight her way through the City of Gotham. After a few scenes,
however, we were convinced that the casting decisions was a good one, as Hathaway portrayed the darker Catwoman role brilliantly.
True to Nolan’s style, at 164 minutes, this film is fairly long. There were a few times when the movie felt a bit drawn out,
but the gorgeous action scenes and impressive dialogue really held the audience’s attention and kept them on the edge of
their seats. However, the timeline was a bit unclear at times. For a number of scenes, it was hard to tell whether it had been
days or months or years that had passed since the last time a given character had been on screen.
""")
summarize(movie_review)

'<pad> Christopher Nolan brings another adrenaline-filled, comic-inspired movie to the big screen. the film served as great entertainment with its colorful cast and numerous plot twists. nolan used actors that had either appeared in previous Batman films or in his blockbuster hit Inception. the one actor that gave this reviewer pause was Anne Hathaway as Selina Kyle, who has historically been typecast as the girl next door, so it was a shock to watch her steal and fight her way through the City of Gotham </s>'

In [7]:
#@title Language translation.
language_sequence = ("You should definitely watch 'One Piece', it is so good, you will love the comic book")
input_ids = tokenizer("translate English to French: "+language_sequence, return_tensors="pt").input_ids
language_ids = model.generate(input_ids)
language_translation = tokenizer.decode(language_ids[0], skip_special_tokens=True)
language_translation



"Vous devriez regarder 'One Piece', c'est si bon"

In [13]:
#@title Other processors: entailment (MNLI)

# https://huggingface.co/docs/transformers/main_classes/processors

def mnli(entailment_premise, entailment_hypothesis):
  input_ids = tokenizer(
    f"mnli premise: {entailment_premise} hypothesis: {entailment_hypothesis}",
    return_tensors="pt").input_ids
  entailment_ids = model.generate(input_ids)
  return tokenizer.decode(entailment_ids[0],skip_special_tokens=True)

mnli("I love One Piece.", "My feelings towards One Piece is filled with love")



'entailment'

In [14]:
mnli("I love One Piece.", "This is a random unrelated sentence.")

'contradiction'

In [17]:
#@title Corpus of linguistic acceptability (CoLA)

def cola(sentence):
  input_ids = tokenizer("cola: "+ sentence, return_tensors="pt").input_ids
  sentence_ids = model.generate(input_ids)
  return tokenizer.decode(sentence_ids[0],skip_special_tokens=True)

cola("Luffy is a great pirate.")


'acceptable'

In [22]:
#@title Sentence similarity processor (stsb).
def stsb(stsb_sentence_1, stsb_sentence_2):
  input_ids = tokenizer("stsb sentence 1: "+stsb_sentence_1+" sentence 2: "+stsb_sentence_2, return_tensors="pt").input_ids
  stsb_ids = model.generate(input_ids)
  return tokenizer.decode(stsb_ids[0],skip_special_tokens=True)

stsb("Luffy was fighting in the war.", "Luffy's fighting style is comical.")

'4.0'

In [26]:
#@title Sentiment (SST-2)

def sst2(text):
  input_ids = tokenizer(
      f"sst2: {text}",
      return_tensors="pt").input_ids
  sst2_ids = model.generate(input_ids)
  return tokenizer.decode(sst2_ids[0], skip_special_tokens=True)

sst2("This is a disappointing and boring book.")

'positive'