# Explore sentece denoising

The last cell is the one which has the most interesting results. It shows the original sentence and the other versions of the same sentence. The result it seems promissing.

In [6]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

input_sentence = "They were there to enjoy us and they were there to pray for us."

model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')



In [9]:
input_sentence = [
    "A purple elephant is flying in the sky, surrounded by pink clouds.",
    "The Eiffel Tower is standing tall in the middle of a dense forest.",
    "A group of penguins is sunbathing on a sandy beach.",
    "A rainbow-colored dinosaur is chasing a school bus on a city street.",
    "A mermaid is swimming in a fish tank filled with colorful tropical fish.",
    "A spaceship is landing on a snowy mountaintop, next to a cozy cabin.",
    "A giant cupcake is floating in the ocean, attracting seagulls.",
    "A giraffe is riding a bicycle in a crowded amusement park.",
    "A waterfall is flowing through a desert landscape with cacti and sand dunes."
]

batch = tokenizer(input_sentence, padding=True, truncation=True, return_tensors="pt").to(device)

import time
start = time.time()
translated = model.generate(**batch)
generated_sentences = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()

print("Time taken in seconds: ", end - start)

print(generated_sentences)



Time taken in seconds:  4.072089195251465
['A purple elephant is flying in the sky, surrounded by pink clouds.', 'The Eiffel tower is tall in the middle of a dense forest.', 'A group of penguins is sunbathing on a sandy beach.', 'A rainbow-colored dinosaur is chasing a school bus on a city street.', 'A mermaid is swimming in a fish tank filled with colorful tropical fish.', 'A spaceship has landed on a snowy mountaintop, next to a cozy cabin.', 'A giant cupcake is floating in the ocean, attracting seagulls.', 'A giraffe is riding a bicycle in a crowded amusement park.', 'A waterfall flows through a desert landscape with cacti and sand dunes.']


#### Text-Simplification

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("philippelaban/keep_it_simple")
kis_model = AutoModelForCausalLM.from_pretrained("philippelaban/keep_it_simple")



  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 243/243 [00:00<00:00, 377kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 2.53MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 61.4MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.22MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 108/108 [00:00<00:00, 375kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 893/893 [00:00<00:00, 1.14MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.44G/1.44G [00:35<00:00, 41.2MB/s]


In [4]:
paragraph = """A small capsule containing asteroid soil samples that was dropped from 136,700 miles in space by Japan's Hayabusa2 spacecraft landed as planned in the Australian Outback on December 6. The extremely high precision required to carry out the mission thrilled many in Japan, who said they took pride in its success."""

start_id = tokenizer.bos_token_id
tokenized_paragraph = [(tokenizer.encode(text=paragraph) + [start_id])]
input_ids = torch.LongTensor(tokenized_paragraph)

output_ids = kis_model.generate(input_ids, max_length=150, num_beams=4, do_sample=True, num_return_sequences=8)
output_ids = output_ids[:, input_ids.shape[1]:]
output = tokenizer.batch_decode(output_ids)
output = [o.replace(tokenizer.eos_token, "") for o in output]

for o in output:
    print("----")
    print(o)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


----
A small capsule containing soil samples that was dropped from 136,700 miles in space by Japan's Hayabusa2 probe was successfully brought back to earth by the Outback on December 6. The precise timing of the mission thrilled many in Japan, who said they took pride in its success.
----
A small capsule containing soil samples that was dropped from 136,700 miles, Japan's Hayabusa2 space probe, landed as planned on December 6. The mission was intended to test the limits of the country's space program, said many in Japan, who said they took pride in its success.
----
A small capsule containing samples of asteroid soil that was dropped from 136,700 miles over the space of a few days earlier this year by Japan's Hayabusa2 probe. The extremely high precision required to carry out the mission thrilled many in Japan, who said they took pride in its success.
----
A small capsule containing soil samples that Japan dropped from 136,700 miles over the past two years was successfully launched by 

In [16]:
from transformers import BertTokenizerFast, EncoderDecoderModel
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizerFast.from_pretrained('mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization')
model = EncoderDecoderModel.from_pretrained('mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization').to(device)


def generate_summary(text):
    # cut off at BERT max length 512
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output = model.generate(input_ids, attention_mask=attention_mask, length_penalty=100.0, num_beams=4)

    return tokenizer.decode(output[0], skip_special_tokens=True)
  


In [17]:
text = "A purple elephant is flying in the sky, surrounded by pink clouds."
generate_summary(text)

'a purple elephant is flying in the sky, surrounded by pink clouds. the elephant is surrounded by the pink clouds and is flying into the sky. it is also surrounded by yellow clouds, with pink clouds flying in sky. the elephants are flying through the sky for the first time in the world.'

In [10]:
from transformers import DistilBertTokenizer, DistilBertModel, GPT2Model
import torch


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
outputs = model(input_ids)
last_hidden_states = outputs[0]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='distilgpt2')
set_seed(42)
generator("could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.", max_length=30, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.'},
 {'generated_text': 'could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.\n\n\nBut you can never know the real name of'},
 {'generated_text': 'could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.\n\nYou must get in there and catch up (like'},
 {'generated_text': 'could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.\nAs one can imagine, you could not walk, a'},
 {'generated_text': 'could you rephrase A purple elephant is flying in the sky, surrounded by pink clouds.'}]

# With this model we can do rephrasing of sentences

The main idea is that we rephrase the sentences reducing the perplexity of the sentence. In this way we make easier to CLIP to understand which is the box which best match.

In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
checkpoint="unikei/t5-base-split-and-rephrase"


tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

complex_sentence = "A purple elephant surrounded by pink clouds is fly in the sky."
complex_tokenized = tokenizer(complex_sentence, 
                                 padding="max_length", 
                                 truncation=True,
                                 max_length=256, 
                                 return_tensors='pt')

simple_tokenized = model.generate(complex_tokenized['input_ids'], attention_mask = complex_tokenized['attention_mask'], max_length=256, num_beams=5, num_return_sequences=5)
simple_sentences = tokenizer.batch_decode(simple_tokenized, skip_special_tokens=True)
print(simple_sentences)


['A purple elephant is seen in the sky. Pink clouds surround the purple elephant.', 'A purple elephant is in the sky. Pink clouds surround the purple elephant.', 'A purple elephant is in the sky. Pink clouds surround a purple elephant.', 'A purple elephant is surrounded by pink clouds. A purple elephant is in the sky.', 'A purple elephant is seen in the sky. Pink clouds surround the elephant.']
