# Text Classification Example
Copyright (c) 2021 Robert Bosch GmbH

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

In [None]:
from flair.datasets import ClassificationCorpus
from flair.tokenization import SpaceTokenizer

# Check out the following page for more information on corpora in flair
# https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md

# Language code; the embedding cells below pass it to the embedding classes.
lang = 'en'

# Load the dataset. The train/dev/test files have to be stored in the same
# directory, which is given as the first argument.
corpus = ClassificationCorpus(
    'path/to/dataset/',
    train_file='name_of_train.bio',
    dev_file='name_of_dev.bio',    # optional: will be sampled from train if no file is provided
    test_file='name_of_test.bio',
    label_type='question_type',
    tokenizer=SpaceTokenizer(),    # optional: remove this line and a "real" tokenizer is used
    memory_mode='full',            # only if everything fits in memory; but required for FAME-features
)

# Build the label dictionary from the corpus and print it for inspection.
label_dict = corpus.make_label_dictionary()
print(label_dict)

In [None]:
from flair.embeddings import CharacterEmbeddings, WordEmbeddings, TransformerWordEmbeddings
from src.embeddings import AveragingBytePairEmbeddings

# Check out the following page for more information on embeddings in flair
# https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
# The three non-transformer embeddings; `word` and `bpemb` are selected via `lang`.
char = CharacterEmbeddings()
word = WordEmbeddings(lang)
bpemb = AveragingBytePairEmbeddings(language=lang)

# Any huggingface transformer should be usable with these embeddings.
transformer = TransformerWordEmbeddings(
    'xlm-roberta-large',
    fine_tune=False,  # there is also the option to finetune these models
    layers='-1',
)

In [None]:
from src.features import add_features

# Base directory of the pre-trained embedding files.
# NOTE: this is a placeholder -- adjust it to your setup. It is joined by plain
# string concatenation below, so it has to end with a path separator.
EMB_BASE_PATH = 'path/to/embeddings/'

# The frequency information is taken from the fastText vector file of the
# chosen language, i.e. '<EMB_BASE_PATH>fasttext/cc.<lang>.300.vec'.
features = add_features(
    corpus,
    use_frequencies_from_file=EMB_BASE_PATH + f'fasttext/cc.{lang}.300.vec',
)

# add_features returns four values; they are consumed by the FeatureEmbedding cell below.
feature_flags, features_dimensions, idx2shape, idx2word = features

In [None]:
from src.embeddings import MetaEmbeddings
from src.models import FeatureEmbedding

# Indicate which features you want to use, one letter per feature:
#   - (f)requency
#   - (s)hape embedding
#   - (l)ength
#   - (b)asic shape information like HasCapitalLetter (12 one-hot encoded features)
# Note: this replaces the `feature_flags` value unpacked from add_features above.
feature_flags = 'fslb'

# Embedding size of shape embeddings
shape_hidden_size = 25

feature_model = FeatureEmbedding(
    use_features=feature_flags,
    feature_dims=features_dimensions,
    idx2shape=idx2shape,
    shape_dim=shape_hidden_size,
)

# MetaEmbeddings is the drop-in replacement for flair.embeddings.StackedEmbeddings.
#   use_average   - use averages of embeddings instead of concatenation
#   use_attention - use attention to compute weighted averages
#   use_features  - also include word features in the attention function,
#                   as proposed in our EMNLP paper
meta_embeddings = MetaEmbeddings(
    [char, bpemb, word, transformer],
    use_average=True,
    use_attention=True,
    use_features=True,
    feature_model=feature_model,
)

In [None]:
from src.embeddings import DocumentRNNEmbeddings

# Build document representations on top of the meta-embeddings using a
# bidirectional LSTM.
# Other DocumentEmbeddings are available as well. Check them out if you want.
doc_embeds = DocumentRNNEmbeddings(
    meta_embeddings,
    rnn_type="LSTM",
    rnn_layers=1,
    hidden_size=256,
    bidirectional=True,
    reproject_words=False,
)

In [None]:
from flair.models import TextClassifier

# The actual text classification model. Every module created so far
# (MetaEmbeddings, DocumentEmbeddings, ...) becomes a part of it.
classifier = TextClassifier(
    doc_embeds,
    label_dictionary=label_dict,
)

In [None]:
from src.models import DomainClassifier

# Create the discriminator for adversarial training.
# The first two positional arguments are both the meta-embedding length,
# the third one is the number of embeddings combined in the meta-embedding.
domain_c = DomainClassifier(
    meta_embeddings.embedding_length,
    meta_embeddings.embedding_length,
    meta_embeddings.num_embeddings,
    lambd=1e-4,
    dropout=0.2,
)

In [None]:
from torch.optim import AdamW
from src.trainer import AdversarialModelTrainer

# Create our trainer, which trains the text classifier together with the
# adversarial discriminator `domain_c`, using AdamW as the optimizer.
trainer = AdversarialModelTrainer(
    classifier,
    corpus,
    D=domain_c,
    optimizer=AdamW,
)

# Start the training; the first argument is the output directory.
trainer.train(
    'taggers/name/',
    max_epochs=100,
    mini_batch_size=64,
    learning_rate=5e-6,
    adversarial_learning_k=10,
    train_with_dev=True,
)