In [None]:
# Install SeaQuBe

In [None]:
pip install seaqube

In [None]:
## Import the augmentation classes of the package

In [None]:
 # Import all Augmentation methods
from seaqube.augmentation.word import Active2PassiveAugmentation, EDAAugmentation, TranslationAugmentation, EmbeddingAugmentation
from seaqube.augmentation.char import QwertyAugmentation
from seaqube.augmentation.corpus import UnigramAugmentation


In [None]:
# import some tools
from seaqube.tools.io import load_json
from os.path import join

In [None]:
# load example data
import json
import urllib.request

# Download the example corpus. The context manager closes the HTTP response
# as soon as the payload has been read instead of leaving it open.
with urllib.request.urlopen("https://raw.githubusercontent.com/bees4ever/SeaQuBe/master/examples/sick_full_corpus.json") as response:
    data = response.read()

# Parse the downloaded JSON payload into Python objects.
corpus = json.loads(data)
# Sample sentence used by the string-based augmentation examples.
text = 'The quick brown fox jumps over the lazy dog .'



## Set Up all augmentations

In [None]:
# An (experimental) active-to-passive voice transformer.
# It transforms one sentence / doc into one other.
a2p = Active2PassiveAugmentation()

In [None]:
# Implementation of the Easy Data Augmentation (EDA) method:
# random word swap, insertion, deletion and replacement with synonyms.
eda = EDAAugmentation(max_length=2)

In [None]:
# Translate text into another language and back (with Google Translate).
translate = TranslationAugmentation(max_length=2)

In [None]:
# Replace words with similar ones using another word embedding.
embed = EmbeddingAugmentation(max_length=2)

In [None]:
# Insert typos into the text based on a QWERTY keyboard.
qwerty = QwertyAugmentation(replace_rate=0.07, max_length=2)

In [None]:
# Based on the UDA algorithm; only the unigram method is implemented, which
# replaces words carrying little meaning with other words carrying little meaning.
# This method needs a corpus, because it has to detect those low-meaning words.
unigram = UnigramAugmentation(corpus=corpus, max_length=2)

### API - Usage
**Every augmentation object offers the same capabilities:**

In [None]:
# 1. Augmenting a string - same syntax as NLPAUG (https://github.com/makcedward/nlpaug)
print(qwerty.augment(text))
# or, with a different augmentation object:
print(translate.augment(text))


In [None]:
# 2. Augmenting a doc (token-based text), passed via the `doc` keyword
print(unigram.doc_augment(doc=corpus[0]))
# doc_augment can also handle plain text, passed via the `text` keyword:
print(embed.doc_augment(text=text))


In [None]:
# 3. Augmenting a whole corpus: the augmentation object is called directly,
#    here on the first 200 entries of the corpus
print(eda(corpus[0:200]))


In [None]:
# 4. Active2Passive is still experimental; the doc is passed as a list of tokens:
a2p.doc_augment(doc=['someone', 'is', 'not', 'reading', 'the', 'email'])
