In [None]:
!pip install transformers nltk torch torchvision torchaudio conllu nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from typing import *
import re
from pprint import pprint

split_sentences_broad_regex = re.compile(r"(?:\n(?:[\r\t\f\v ]*)\n)+")
split_tokens_broad_regex = re.compile(r"\s+")
sentence_end_broad_regex = re.compile(r"((?:\?|!)+(?:[\"\']*)$)")

sentence_end_period_regex = re.compile(r"(\.(?:[\"\']*)$)")

abbreviation_regex = re.compile(r"(?:(?:[a-zA-Z]+\.){2,})|(?:[A-Z]\.)")

punctuation_split_regex = re.compile(r"([,\"(){}\[\]_;:*\/—]+)|(-(?:-+))")

url_regex = re.compile(
    r"(http[s]?:\/\/(www\.)?|ftp:\/\/(www\.)?|www\.){1}([0-9A-Za-z-\.@:%_\+~#=]+)+((\.[a-zA-Z]{2,3})+)(\/(.)*)?(\?(.)*)?")
hashtag_regex = re.compile(r"#(?:[a-zA-Z0-9]+)")
mention_regex = re.compile(r"@(?:[a-zA-Z0-9]+)")
num_regex = re.compile(r"([0-9]+,)*[0-9]+(\.([0-9]*))?(k|m|b|t)", re.IGNORECASE)

remove_regex = re.compile(r"(?:[*\"_{}\[\]]+)")
remove_with_dash_regex = re.compile(r"^[-*\"_{}\[\]]+$")

token_substitute = {
    url_regex: "<URL>",
    hashtag_regex: "<HASHTAG>",
    mention_regex: "<MENTION>"
}


def is_abbreviation(token: str) -> bool:
    if abbreviation_regex.search(token):
        return True

    if token.lower().replace("\"", "").replace("\'", "") in {"dr.", "mr.", "mrs."}:
        return True

    return False


def is_num(token: str) -> bool:
    return bool(num_regex.search(token))


def tokenize_english(text: str) -> List[List[str]]:
    sentences: List[str] = split_sentences_broad_regex.split(text)
    sentences = [sentence for sentence in sentences if sentence]
    sentences_tokens: List[List[str]] = [split_tokens_broad_regex.split(sentence) for sentence in sentences]
    parsed_sentence_tokens = []
    for sentence in sentences_tokens:
        tokens = []
        for token in sentence:
            for regex, sub in token_substitute.items():
                if regex.search(token):
                    token = regex.sub(token, sub)
                    break
            if punctuation_split_regex.search(token):
                tokens.extend(punctuation_split_regex.split(token))
                continue

            tokens.append(token)
        tokens = [token for token in tokens if token]
        if tokens:
            parsed_sentence_tokens.append(tokens)

    sentences_tokens = parsed_sentence_tokens
    parsed_sentence_tokens = []
    for sentence in sentences_tokens:
        tokens = []
        for token in sentence:

            if sentence_end_broad_regex.search(token):
                tokens.extend(sentence_end_broad_regex.split(token))
                tokens = [token for token in tokens if token]
                parsed_sentence_tokens.append(tokens)
                tokens = []
                continue

            if sentence_end_period_regex.search(token) and not is_abbreviation(token):
                tokens.extend(sentence_end_period_regex.split(token))
                tokens = [token for token in tokens if token]
                parsed_sentence_tokens.append(tokens)
                tokens = []
                continue

            tokens.append(token)
        tokens = [token for token in tokens if token]
        if tokens:
            parsed_sentence_tokens.append(tokens)
    parsed_sentence_tokens = [
        [remove_regex.sub(token, "") if remove_regex.search(token) else token for token in sentence] for sentence in
        parsed_sentence_tokens]

    parsed_sentence_tokens = [
        [remove_with_dash_regex.sub(token, "") if remove_with_dash_regex.search(token) else token for token in sentence]
        for sentence in
        parsed_sentence_tokens]

    parsed_sentence_tokens = [[token.lower() for token in sentence if token] for sentence in parsed_sentence_tokens]
    parsed_sentence_tokens = [sentence for sentence in parsed_sentence_tokens if sentence]
    accumulate = []
    for s in parsed_sentence_tokens:
      accumulate.extend(s)
    return accumulate

tokenize_english("greetings")

['greetings']

In [None]:
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
torch.manual_seed(42)

<torch._C.Generator at 0x7ff1ef0cea90>

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
print(len(tokenizer))
tokenizer.add_special_tokens({"sep_token": "<|sep|>", "bos_token":'<|startoftext|>',
                                           "pad_token":'<|pad|>'})
print(tokenizer.all_special_tokens)
model.resize_token_embeddings(len(tokenizer))

50257
['<|startoftext|>', '<|endoftext|>', '<|sep|>', '<|pad|>']


Embedding(50260, 1024)

In [None]:
from typing import Sequence, List
from random import shuffle, randint, choice
from conllu import parse
from nltk.corpus import wordnet

stop_words = {
    "the",
    "a",
    "an",
    "and",
    "for",
    "nor",
    "neither",
    "or",
    "yet",
    "so",
    "either",
    "as",
    "but",
    "if",
    "then",
    "well",
    "however",
    "thus",
    "would",
    "could",
    "should",
    "may",
    "oh",
    "ah",
    "also"
    "is",
    "by",
    "with"
}

def get_synonyms(word: str):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            if l.name() != word:
              synonyms.append(l.name())
    return synonyms

def swap_random(a: List, exclude: List):
    if len(a) <= 1:
        return
    for _ in range(5):

        i, j = randint(0, len(a) - 1), randint(0, len(a) - 1)

        if a[i] in exclude or a[j] in exclude:
            continue
        a[i], a[j] = a[j], a[i]
        break

def corrupt(tokens: Sequence[str]) -> Sequence[str]:
    all_tokens = tokens
    tokens = [token for token in tokens if token not in stop_words]
    if len(tokens) == 0:
        tokens = all_tokens # if all words are stop words restore
    i = randint(1, len(tokens)//3+1)
    for _ in range(i):
        swap_random(tokens, {"!", "?", "."})
    for i, token in enumerate(tokens):
        if len(wordnet.synsets(token)) < 2:
            synonyms = get_synonyms(token)
            tokens[i] = choice(synonyms + [token])
    return tokens



def load_data():
    with open("en_ewt-ud-train.conllu", encoding="utf-8") as f:
        text = f.read()
    sentences = [tokenize_english(sentence.metadata.get("text")) for sentence in parse(text)][::2]
    return sentences



In [None]:
from copy import deepcopy

IGNORE_INDEX = -100

class ReconstructionDataset:
  def __init__(self, max_length=100):
    sentences = load_data()
    self.input_ids = []
    self.attn_masks = []
    self.labels = []
    for sentence in sentences:
        corrupted = corrupt(sentence)
        print(sentence)
        print(corrupted)
        print()
        sentence_ignore_len = len(tokenizer('<|startoftext|>' + " ".join(corrupted) + "<|sep|>")["input_ids"])
        encodings_dict = tokenizer('<|startoftext|>' + " ".join(corrupted) + "<|sep|>" + " ".join(sentence) + '<|endoftext|>', truncation=True,
                                    max_length=max_length, padding="max_length", add_special_tokens=True)
        input_ids = torch.tensor(encodings_dict['input_ids'])
        self.input_ids.append(input_ids)
        label = deepcopy(input_ids)
        label[:sentence_ignore_len] = IGNORE_INDEX
        self.labels.append(label)

  def __len__(self):
      return len(self.input_ids)

  def __getitem__(self, idx):
      return self.input_ids[idx], self.labels[idx]

In [None]:
dataset = ReconstructionDataset()
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['700f', 'jobs', 'advertised', "aren't", '.']

['why', 'does', 'my', 'baby', 'king', 'snake', 'refuse', 'to', 'eat', '?']
['wherefore', 'king', 'snake', 'baby', 'does', 'my', 'refuse', 'to', 'eat', '?']

['i', 'have', 'tried', 'everything', 'i', 'can', 'to', 'get', 'him', 'to', 'eat', 'but', 'he', 'refuses', '.']
['i', 'have', 'tried', 'he', 'i', 'can', 'to', 'get', 'him', 'to', 'eat', 'everything', 'refuses', '.']

['as', 'far', 'as', 'i', 'know', ',', 'everything', 'is', 'as', 'good', 'as', 'it', 'can', 'be', 'where', 'his', 'aquarium', 'is', 'concerned', '.']
['far', 'i', 'know', ',', 'good', 'is', 'marine_museum', 'IT', 'where', 'be', 'is', 'his', 'everything', 'can', 'concerned', '.']

['he', 'has', 'been', 'going', 'to', 'the', 'bathroom', 'like', 'he', 'should', '.']
['to', 'he', 'been', 'going', 'he', 'has', 'bathroom', 'like', '.']

['he', 'has', 'been', 'drinking', 'water', '(', 'i', 'am', 'pretty', 'sure', 'he'

In [None]:
import gc
gc.collect()

8

In [None]:
torch.cuda.empty_cache()

In [None]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=20, per_device_eval_batch_size=10,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                'labels': torch.stack([f[1] for f in data])}).train()



Step,Training Loss
100,1.6698
200,0.3227
300,0.2778
400,0.229
500,0.2176
600,0.1961
700,0.165
800,0.173


In [None]:
def print_paraphrases(sentence):
    corrupted = " ".join(corrupt(tokenize_english(sentence)))
    corrupt_in = "<|startoftext|>" + corrupted + "<|sep|>"
    generated = tokenizer(corrupt_in, return_tensors="pt").input_ids.cuda()
    sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                    max_length=300, top_p=0.95, temperature=2., num_return_sequences=20)
    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, tokenizer.decode(sample_output).split("<|sep|>")[1].split("<|endoftext|>")[0]))

In [None]:
print_paraphrases("Can you paraphrase this sentence.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  this can paraphrasing sentence this sentence.
1:  phantom sentence : this can read as paraphrase.
2:  try this sentence or paraphrase thephrase.
3:  this then sentence can be translated as well.
4:  if this may you may as you paraphrase.
5:  this can you paraphrase and may you can paraphrase it for $50.
6:  this sentence will help you paraphrase if you could quote.
7:  this is what this can or probably i should paraphrase..
8:  if you can you can do the sentence paraphrase.
9:  so this sentences can would you summarise.
10:  i could sentence this as you can.
11:  oh yeah this can and can you paraphrase a well.
12:  pharmacy can you paraphrase these.
13:  do you realize what a sentence can possibly quote?
14:  this can also you remove any paribraphically, as follows and by writing your pariah sentence.
15:  in addition. well you could sentence and paraphrase this.
16:  if you can paraphrase sentence.
17:  this sentence can be paraphrased to make it look more simple.
18:  this senten

In [None]:
model.save_pretrained("paraphraser")

In [None]:
model_loaded = GPT2LMHeadModel.from_pretrained('paraphraser').cuda()

In [None]:
def print_paraphrases_loaded(sentence):
    corrupted = " ".join(corrupt(tokenize_english(sentence)))
    corrupt_in = "<|startoftext|>" + corrupted + "<|sep|>"
    generated = tokenizer(corrupt_in, return_tensors="pt").input_ids.cuda()
    sample_outputs = model_loaded.generate(generated, do_sample=True, top_k=50,
                                    max_length=300, top_p=0.95, temperature=2., num_return_sequences=20)
    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, tokenizer.decode(sample_output).split("<|sep|>")[1].split("<|endoftext|>")[0]))

In [None]:
print_paraphrases_loaded("hello i am a loaded model")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  came in hifi and is loaded i am am the best model the
1:  am hi im loaded as Model
2:  hi i am loaded am the model
3:  for the model it loaded hi am hiked hi
4:  how do i load hi model
5:  hi im loaded and he models awesome
6:  hi am load was loaded with a model
7:  hi i am preloaded Model hi
8:  hi ive added another model and you load me hi am
9:  hi my model loaded up
10:  hi i am loaded model
11:  i am loaded but its a hi am model
12:  i am loaded and humungous i am happy
13:  i am loading a model gun im hi
14:  me and i'm loading the model
15:  hi i am loading a model
16:  am load n pre i model ah
17:  loaded with the car i am a model
18:  i am set Hi Hi auto loader
19:  i am ho ho cocked with a model and am loaded with hi p


NotImplementedError: ignored

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!zip -r paraphraser.zip ./paraphraser

  adding: paraphraser/ (stored 0%)
  adding: paraphraser/generation_config.json (deflated 24%)
  adding: paraphraser/pytorch_model.bin (deflated 9%)
  adding: paraphraser/config.json (deflated 52%)
