In [1]:
from pathlib import Path
import os

ROOT = Path("/Users/Admin/Desktop/PARA-OS/01_Projects/Learning-Design-ML-Systems/Data-augmentation-DMLS")

if ROOT.is_dir():
    print(f"ROOT directory exists hence changing directory to '{ROOT.name}'")
    os.chdir(ROOT)
else:
    raise NotADirectoryError

ROOT directory exists hence changing directory to 'Data-augmentation-DMLS'


References - 
- [1] NLPAUG - https://pypi.org/project/nlpaug/
- [2] NLPAUG Cookbook - https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

In [2]:
import nltk # noqa
from nltk.corpus import wordnet # noqa

def replace_synonyms(sentence):
    words = nltk.word_tokenize(sentence)
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms and synonyms[0].lemmas():
            new_words.append(synonyms[0].lemmas()[0].name())
        else:
            new_words.append(word)
    return " ".join(new_words)

sentence = "The quick brown fox jumps over the lazy dog"
new_sentence = replace_synonyms(sentence)
print(new_sentence)

The quick brown fox jump over the lazy dog


In [21]:
wordnet.synsets("jump")

[Synset('jump.n.01'),
 Synset('leap.n.02'),
 Synset('jump.n.03'),
 Synset('startle.n.01'),
 Synset('jump.n.05'),
 Synset('jump.n.06'),
 Synset('jump.v.01'),
 Synset('startle.v.02'),
 Synset('jump.v.03'),
 Synset('jump.v.04'),
 Synset('leap_out.v.01'),
 Synset('jump.v.06'),
 Synset('rise.v.11'),
 Synset('jump.v.08'),
 Synset('derail.v.02'),
 Synset('chute.v.01'),
 Synset('jump.v.11'),
 Synset('jumpstart.v.01'),
 Synset('jump.v.13'),
 Synset('leap.v.02'),
 Synset('alternate.v.01')]

In [22]:
wordnet.synsets("jump")[0].lemmas()[0].name()

'jump'

In [3]:
import nlpaug.augmenter.word as naw # noqa

text = 'The quick brown fox jumps over the lazy dog .'
print(text)

The quick brown fox jumps over the lazy dog .


In [4]:
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brownness fox jump terminated the lazy dog.']


In [5]:
aug = naw.SynonymAug(aug_src='wordnet')
print("Original:")
print(text)
for idx in range(1,6):
    print()
    augmented_text = aug.augment(text)
    print(f'Augmented Text Sample {idx:02d}:-')
    print(augmented_text[0])
    print("----")

Original:
The quick brown fox jumps over the lazy dog .

Augmented Text Sample 01:-
The ready brown dodger jumps ended the lazy dog.
----

Augmented Text Sample 02:-
The speedy brown charles james fox jumps terminated the lazy dog.
----

Augmented Text Sample 03:-
The agile brown fox jumps ended the lazy frank.
----

Augmented Text Sample 04:-
The quick brown dodger leap over the lazy hot dog.
----

Augmented Text Sample 05:-
The straightaway brown fox skip over the work shy dog.
----


### by using transformers

In [6]:
from transformers import pipeline # noqa

# Initialize the fill-mask pipeline
fill_mask = pipeline(
    "fill-mask",
    model="distilroberta-base",
    tokenizer="distilroberta-base"
)

def replace_word_with_masked_token(sentence, word_to_replace):
    # Replace the word with the masked token
    masked_sentence = sentence.replace(word_to_replace, fill_mask.tokenizer.mask_token)
    
    # Use the fill-mask pipeline to predict the masked token
    predictions = fill_mask(masked_sentence)
    
    # Replace the masked token with the top prediction
    augmented_sentence = masked_sentence.replace(fill_mask.tokenizer.mask_token, predictions[0]["token_str"])
    
    return augmented_sentence

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
word_to_replace = "quick"
print(replace_word_with_masked_token(sentence, word_to_replace))


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The  fluffy brown fox jumps over the lazy dog


In [7]:
"quick brown fox jumps over the lazy dog".split()

['quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [8]:
print("Original:")
print(text)
words_to_replace = ['quick', 'brown', 'jumps', 'lazy']

for idx, word_to_replace in enumerate(words_to_replace):
    print()
    print(f'Augmented Text Sample by replacing: "{word_to_replace}":-')
    print(replace_word_with_masked_token(sentence, word_to_replace))

Original:
The quick brown fox jumps over the lazy dog .

Augmented Text Sample by replacing: "quick":-
The  fluffy brown fox jumps over the lazy dog

Augmented Text Sample by replacing: "brown":-
The quick  eyed fox jumps over the lazy dog

Augmented Text Sample by replacing: "jumps":-
The quick brown fox  wins over the lazy dog

Augmented Text Sample by replacing: "lazy":-
The quick brown fox jumps over the  barking dog
