# Notebook to test augmentation strategies

In [1]:
%load_ext autoreload
%autoreload 2

## Load resources

In [2]:
from data import load_data

train, val, test = load_data()

In [36]:
full = train.append(val)

## Download augmentation models

In [11]:
from nlpaug.util.file.download import DownloadUtil
aug_models_dir = './aug_model'
DownloadUtil.download_word2vec(dest_dir=aug_models_dir) # Download word2vec model
DownloadUtil.download_glove(model_name='glove.6B', dest_dir=aug_models_dir) # Download GloVe model
DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir=aug_models_dir) # Download fasttext model

## Try augmentation strategies

In [12]:
import nlpaug.augmenter.word as naw

In [13]:
import os
os.environ["MODEL_DIR"] = './aug_model'

In [14]:
text = train["description"][0]
text

'jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.'

Word insertion by embeddings

In [19]:
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='fasttext', model_path=aug_models_dir+'/wiki-news-300d-1M.vec',
    action="insert")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
jacket made of Orts a Unencrypted technical fabric Landrecies with texture. high collar and SafeSurf long sleeves. front pockets. Nelo ribbed trims. 1947-1960 zip - Responsa up front.


Word substitution by embeddings

In [20]:
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='fasttext', model_path=aug_models_dir+'/wiki-news-300d-1M.vec',
    action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
jacket conceded of a technical fabric with texture. deep collar and over-due sleeves. inside pockets. ornamented trims. Zipping - up end.


TF-IDF augmenter (TODO: requires training model)

In [40]:
import re
import nlpaug.model.word_stats as nmw

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

# Tokenize input
train_x_tokens = [_tokenizer(x) for x in full["description"]]

# Train TF-IDF model
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save(os.environ.get("MODEL_DIR"))

In [41]:
aug = naw.TfIdfAug(
    model_path=os.environ.get("MODEL_DIR"),
    action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
mop made strewn a technical fabric with imitation. high collar 12m long sleeves. escaping teething. knots trims. zip - up front.


Language model substitution

In [25]:
aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base', action="substitute")
augmented_text = aug.augment(text, n=10)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
['jacket cut to a technical mach<unk> texture. high collar. long sleeves. front plates. elbow trims. zip-up pockets.', 'jacket comprised of poly technical fabric jersey texture. high knee and tiny skirt. front collar. ribbed belt. zip-up front.', 'jacket made of soft technical fabric with texture. high neck and hip support. minimal zipper. ribbed trims. cut-up pouch.', 'jacket made of a technical fabric with buttons. shoulder boots x long trousers. hip pockets. ribbed shoulders. fold-up front.', 'jacket cut of a lightweight fabric for piping. high collar and long sleeves. tail pockets. front cape. lace-up front.', 'jacket made of a bright design with texture. high skirt with long neck. front pockets. soft sides. zip-up inside.', 'jacket made of nice medium leather with texture. high belt over long chain. belt tube. ribbed trims. zip-up front

Synonim augmenter

In [27]:
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text, n=3)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)


Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
['jacket arrive at of a technical material with grain. high collar and foresightful sleeves. front pockets. ribbed trim. zip - upward front.', 'jacket make of a technical material with grain. high collar and long arm. front pocket. ribbed trimming. nothing - up front.', 'jacket crown make water of a technical fabric with texture. high pitched catch and long sleeves. front pockets. ribbed clipping. zip - up presence.']


In [28]:
# TODO: download model http://paraphrase.org/#/download
aug = naw.SynonymAug(aug_src='ppdb', model_path=os.environ.get("MODEL_DIR") + 'ppdb-2.0-s-all')
augmented_text = aug.augment(text, n=3)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

FileNotFoundError: [Errno 2] No such file or directory: './aug_modelppdb-2.0-s-all'

Backtranslation

In [31]:
import nlpaug.augmenter.word as naw

back_translation_aug = naw.BackTranslationAug(
    from_model_name='transformer.wmt19.en-de', 
    to_model_name='transformer.wmt19.de-en'
)
augmented_text = back_translation_aug.augment(text, n=5)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Augmented Text:
['Jacket in a technical fabric with texture. high collar and long sleeves. front pockets. Ribbed rim. zip-up front.', 'Jacket in a technical fabric with texture. high collar and long sleeves. front pockets. Ribbed rim. zip-up front.', 'Jacket in a technical fabric with texture. high collar and long sleeves. front pockets. Ribbed rim. zip-up front.', 'Jacket in a technical fabric with texture. high collar and long sleeves. front pockets. Ribbed rim. zip-up front.', 'Jacket in a technical fabric with texture. high collar and long sleeves. front pockets. Ribbed rim. zip-up front.']


## Other ideas to test

* Swap sentence ordering