In [None]:
# Install SeaQuBe

In [10]:
pip install seaqube

Collecting seaqube
  Downloading seaqube-0.0.9b0-py3-none-any.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 4.5 MB/s eta 0:00:01


Installing collected packages: seaqube
Successfully installed seaqube-0.0.9b0
Note: you may need to restart the kernel to use updated packages.


In [None]:
## Import Augmentation class of the package

In [1]:
 # Import all Augmentation methods
from seaqube.augmentation.word import Active2PassiveAugmentation, EDAAugmentation, TranslationAugmentation, EmbeddingAugmentation
from seaqube.augmentation.char import QwertyAugmentation
from seaqube.augmentation.corpus import UnigramAugmentation


In [2]:
# import some tools
from seaqube.tools.io import load_json
from os.path import join

*We need to know where this notebook is located.*

In [None]:
pwd

In [4]:
# Paste the output of `pwd` here. With the empty default, `join(main_path, ...)`
# yields a path relative to the current working directory.
main_path = ''

In [5]:
# Sample sentence used in the single-string augmentation examples below.
text = 'The quick brown fox jumps over the lazy dog .'
# Corpus loaded from a JSON file located under `main_path`.
# NOTE(review): the later `doc_augment(doc=corpus[0])` call suggests a list of token lists - confirm.
corpus = load_json(join(main_path, "sick_full_corpus.json"))

## Set Up all augmentations

In [6]:
# An (experimental) active-to-passive voice transformer; it turns one sentence / doc into another.
a2p = Active2PassiveAugmentation()

In [7]:
# Easy Data Augmentation (EDA) implementation: random word swap, insertion, deletion and synonym replacement.
eda = EDAAugmentation(max_length=2)

In [8]:
# Translates the text into another language and back again (using Google Translate).
translate = TranslationAugmentation(max_length=2)

In [9]:
# Replaces words with similar ones, using another word embedding.
embed = EmbeddingAugmentation(max_length=2)

In [10]:
# Inserts typos into the text, based on a QWERTY keyboard layout.
qwerty = QwertyAugmentation(replace_rate=0.07, max_length=2)

In [11]:
# Based on the UDA algorithm (only its unigram method): replaces low-meaning words with other low-meaning words.
# This method needs a corpus, because it has to detect which words carry little meaning.
unigram = UnigramAugmentation(corpus=corpus, max_length=2)

### API - Usage
**Every augmentation object offers the same methods**

In [12]:
# 1. Augmenting a string - same syntax as NLPAUG (https://github.com/makcedward/nlpaug)
print(qwerty.augment(text))
# or
print(translate.augment(text))


100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00


['the quic< brown fox jumps over the laDy dog.', 'the quick brown fox Uumps over the lazy dog.']


100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (1 of 1) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00


['the quick brown fox jumps over the lazy dog.', 'the quick brown fox jumps over the lazy dog.', 'the fast brown fox jumps over the lazy dog.', 'the quick brown fox jumps over the lazy dog.', 'a quick brown fox jumps over a lazy dog.', 'the quick brown fox jumps over the lazy dog.', 'the quick brown fox jumps over the lazy dog.', 'the fast brown fox jumps over the lazy dog.']


In [13]:
# 2. Augmenting a doc (tokenized text, i.e. a list of tokens)
print(unigram.doc_augment(doc=corpus[0]))
# doc_augment can also handle a plain string via `text=`:
print(embed.doc_augment(text=text))


[['a', 'child', 'is', 'splashing', 'in', 'a', 'pool', 'for', 'children', 'and', 'no', 'other', 'children', 'are', 'playing', 'in', 'the', 'background']]
[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'], ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'same']]


In [14]:
# 3. Augmenting a whole corpus: the augmentation object itself is called on a slice of the corpus
print(eda(corpus[0:200]))


 95% (190 of 200) |####################  | Elapsed Time: 0:00:00 ETA:   0:00:00

[['a', 'persons', 'is', 'kicking', 'at', 'a', 'monkey', 'glove'], ['a', 'monkey', 'is', 'kicking', 'at', 'a', 'persons', 'glove'], ['there', 'is', 'no', 'man', 'wiping', 'the', 'march', 'of', 'a', 'bowl'], ['there', 'is', 'no', 'man', 'nobelium', 'wiping', 'the', 'edge', 'of', 'a', 'bowl'], ['a', 'young', 'chaff', 'is', 'jumping', 'into', 'water'], ['a', 'kid', 'is', 'jumping', 'into', 'water'], ['a', 'cord', 'is', 'being', 'cost', 'climbed', 'by', 'a', 'man'], ['a', 'cord', 'is', 'climbed', 'by'], ['a', 'is', 'falling', 'horse', 'on', 'a', 'track', 'and', 'laid', 'in', 'the', 'wild'], ['a', 'is', 'falling', 'off', 'a', 'horse', 'on', 'track', 'and', 'is', 'laid', 'in', 'the', 'wild'], ['a', 'white', 'dog', 'is', 'approaching', 'golden', 'dog', 'beach'], ['a', 'white', 'dog', 'is', 'approaching', 'dog', 'iron', 'a', 'golden', 'dog', 'on', 'pebbly', 'beach'], ['people', 'are', 'take', 'the', 'air', 'through', 'a', 'crowd', 'in', 'the', 'street'], ['citizenry', 'are', 'walking', 'through

In [15]:
# 4. Active2Passive is still experimental; here it is given a tokenized doc:
a2p.doc_augment(doc=['someone', 'is', 'not', 'reading', 'the', 'email'])


[['The', 'email', 'is', 'not', 'being', 'read', 'by', 'someone']]

***We want to apply an augmentation method to a corpus, train a model, and measure its performance***

In [16]:
# Free memory: these augmentation objects are not used in the remaining cells.
del unigram, embed, translate

In [17]:
# The result is the training input for the (commented-out) `model.train_on_corpus` call further down.
corpus_augmented = eda(corpus[0:200]) # augment a small subset


 90% (180 of 200) |###################   | Elapsed Time: 0:00:00 ETA:   0:00:00

In [18]:
# To use an NLP model with our benchmark tool set, it must implement the 'BaseModelWrapper' interface.
# We set up a class that wraps the FastText model from the gensim package.
# This is only needed to get the benchmarks running.

from gensim.models import FastText
from seaqube.nlp.tools import gensim_we_model_to_custom_we_model
from seaqube.nlp.seaqube_model import BaseModelWrapper
from seaqube.nlp.seaqube_model import SeaQuBeCompressLoader

class BaseFTGensimModel(BaseModelWrapper):
    """Shared base for wrappers around a gensim FastText model.

    Reports the hyper-parameters of the wrapped model and converts a gensim
    model via ``gensim_we_model_to_custom_we_model``.
    """

    def get_config(self):
        """Return the wrapped model's hyper-parameters, plus ``class_name``, as a dict."""
        ft = self.model
        return {
            "sg": ft.sg,
            "cbow_mean": ft.cbow_mean,
            "size": ft.vector_size,
            "alpha": ft.alpha,
            "min_alpha": ft.min_alpha,
            "min_n": ft.wv.min_n,
            "max_n": ft.wv.max_n,
            "window": ft.window,
            "min_count": ft.vocabulary.min_count,
            "sample": ft.vocabulary.sample,
            "negative": ft.negative,
            "workers": ft.workers,
            "class_name": str(self),
        }

    def _wrap_nlp_model(self, model):
        """Convert ``model`` with ``gensim_we_model_to_custom_we_model`` and return the result."""
        wrapped = gensim_we_model_to_custom_we_model(model)
        return wrapped

    
class FTModelStd500V5(BaseFTGensimModel):
    """FastText configuration (``sg=1``, 300 dimensions) used for the benchmark example.

    NOTE(review): the class name mentions "500", but the model is built with
    ``size=300`` and trained for 100 epochs - confirm the naming is intended.
    """

    def define_epochs(self):
        """Return the number of training epochs."""
        return 100

    def define_model(self):
        """Build the untrained gensim FastText model.

        Requests one worker fewer than ``self.cpus``, but never fewer than one:
        ``self.cpus - 1`` evaluates to 0 on a single-CPU machine, which would
        leave gensim without any worker thread.
        """
        return FastText(sg=1, cbow_mean=1, size=300, alpha=0.025, min_alpha=0.0001, min_n=1, max_n=5,
                        window=5, min_count=1, sample=0.001, negative=5, workers=max(1, self.cpus - 1))

    def define_training(self):
        """Build the vocabulary from ``self.data`` and train for ``self.epochs`` epochs."""
        self.model.build_vocab(sentences=self.data, update=False)
        self.model.train(sentences=self.data, total_examples=len(self.data), epochs=self.epochs)

model = FTModelStd500V5()

# Train the model:
# model.train_on_corpus(corpus_augmented)

# Get a dumpable model to store it on disk - or to use it in another process:
# model.get()
# dill_dumper(model.get(), "example_model.dill")
# Or save a compressed model:
# SeaQuBeCompressLoader.save_model_compressed(model.get(), "example_model_compressed.dill")
# Training is commented out above; a previously saved compressed model is loaded from `main_path` instead.
nlp = SeaQuBeCompressLoader.load_compressed_model(join(main_path, "example_model_compressed.dill"), "example")

# The untrained wrapper is not needed any more.
del model

## Benchmark Model

In [19]:
## import tools
from seaqube.benchmark.wordanalogy import WordAnalogyBenchmark
from seaqube.benchmark.wordsimilarity import WordSimilarityBenchmark



In [40]:
# We need to install `vec4ir`; this can be done through "SeaQuBe".
# (The Corpus4IRBenchmark import happens in a later cell, after `vec4ir` has been imported.)
# from seaqube.benchmark.corpus4ir import Corpus4IRBenchmark
from seaqube import download;download('vec4ir')

In [20]:
import vec4ir

In [21]:
# load module
from seaqube.benchmark.corpus4ir import Corpus4IRBenchmark

In [22]:
# perform semantic tests
wsb = WordSimilarityBenchmark(test_set='simlex999')
print(wsb(nlp.model))  # the run recorded below printed score=0.09096051125765903

wab = WordAnalogyBenchmark('google-analogies')
print(wab(nlp.model))  # score=0.0

c4ir = Corpus4IRBenchmark(corpus[0:200])  # needs the original corpus for setting up IR
print(c4ir(nlp.model))

| | #                                               | 998 Elapsed Time: 0:00:00
- | #                                               | 920 Elapsed Time: 0:00:00

(score=0.09096051125765903, payload={'pearson': (0.09096051125765903, 0.6032966430268072), 'matched_words': 35})


| |                      #                        | 19543 Elapsed Time: 0:00:02


(score=0.0, payload={'matched_words': 20})


100% (200 of 200) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


Embedding shape: (400000, 50)


  0% (1 of 200) |                        | Elapsed Time: 0:00:00 ETA:   0:00:22

Embedding shape: (628, 300)


100% (200 of 200) |######################| Elapsed Time: 0:00:23 Time:  0:00:23


(score=0.7895434003428393, payload={'tp': 20266, 'fn': 718, 'fp': 10086})
