In [None]:
# Environment switch: True when running on Google Colab with the project stored
# on Google Drive, False when running locally from inside the repository.
USE_DRIVE = True
if USE_DRIVE:
    import subprocess
    import sys

    from google.colab import drive

    # Relative path: it only resolves if the working directory is the one
    # that contains the "drive" mount point.
    project_path = "drive/MyDrive/EFREI_CAMP/"

    # Install the flair fork into the interpreter that runs this kernel.
    # `sys.executable -m pip` avoids picking up a `pip` that belongs to another
    # Python, and check_call raises CalledProcessError if the install fails
    # instead of silently continuing with a missing or stale flair.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', "git+https://github.com/amtam0/flair.git"])

    drive.mount('/content/drive', force_remount=True)
else:
    project_path = "../"

In [None]:
import datetime
import gc
import itertools
import os
import shutil
from typing import List

import flair
import torch
from flair.data import Corpus
from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, TokenEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import OneCycleLR
flair.set_seed(123) # fixed seed so that runs are reproducible

In [None]:
# Output side: models are saved under <project_path>/training/models/cbrt
models_folder = "training/models"
flair_trainer_path_cbrt = os.path.join(project_path, models_folder, "cbrt")

# Input side: the data resides in <project_path>/training/GT/<timestamp_folder>
timestamp_folder = ""  # to edit: sub-folder of training/GT to use ("" = training/GT itself)
data_folder = os.path.join(project_path, "training/GT", timestamp_folder)

In [None]:
# Column layout of the data files: column 0 is mapped to 'text', column 1 to 'ner'.
columns = {0: 'text', 1: 'ner'}

# Tag type we want to predict.
tag_type = 'ner'

# Build the corpus from the train / dev / test files found in data_folder.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

# Build the tag dictionary for that tag type from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

### Check whether the GPU is used; expected cell output: `CUDA`

In [None]:
# Point flair at the GPU when one is available, otherwise at the CPU.
flair.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Only the GPU case is announced; nothing is printed on CPU.
if flair.device.type == 'cuda':
    print("CUDA")

## Start Multi-Model Training
### - Look for the `#FINETUNE` tag to find the tunable parameters
### - Links with tips are attached next to some of the parameters
### - Check the `TransformerWordEmbeddings` and `trainer.train` docstrings for more details

In [None]:
# Grid training: one tagger is trained for every combination of the parameter
# lists below. Values marked #FINETUNE are the ones intended to be tuned.
#PAPER https://aclanthology.org/C18-1139.pdf
#PAPER https://aclanthology.org/N19-4010.pdf

params_use_context = [False] #FINETUNE https://arxiv.org/abs/1903.08855
params_subtoken_pooling = ["first"] #FINETUNE https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md
params_reproject_embeddings = [False] #FINETUNE
params_batch_size = [8] #FINETUNE
params_pretrained_models = ["roberta-base"] #FINETUNE https://huggingface.co/transformers/v2.3.0/pretrained_models.html

hidden_size = 256 #FINETUNE
EPOCHS = 10 #FINETUNE

# itertools.product visits the grid in the same order as five nested loops
# would (the last list varies fastest), without the five levels of indentation.
for (param_use_context, param_subtoken_pooling, param_reproject_embeddings,
     param_batch_size, params_pretrained_model) in itertools.product(
        params_use_context, params_subtoken_pooling, params_reproject_embeddings,
        params_batch_size, params_pretrained_models):

    # Run name, used both for the model output folder and the tensorboard log
    # folder. Built from: model name minus its last three characters, first two
    # characters of use_context, pooling mode, first character of the
    # reproject flag, batch size.
    # NOTE(review): a model name containing "/" would create nested folders - confirm if such models are used.
    Tensorboard_dir = "pr{}_uc{}_sp{}_re{}_bs{}".format(
        str(params_pretrained_model[:-3]),
        str(param_use_context)[:2],
        str(param_subtoken_pooling),
        str(param_reproject_embeddings)[0],
        str(param_batch_size))

    # 4. initialize embeddings
    #TIPS https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md
    embeddings_cbrt_base = TransformerWordEmbeddings(
        params_pretrained_model,
        subtoken_pooling=param_subtoken_pooling,
        layers='-1', #FINETUNE
        allow_long_sentences=True, #FINETUNE
        fine_tune=True,
        layer_mean=True, #FINETUNE
        use_context=param_use_context)

    # 5. initialize sequence tagger (no RNN and no CRF: use_rnn=False, use_crf=False)
    tagger_cbrt_base : SequenceTagger = SequenceTagger(
        hidden_size=hidden_size,
        embeddings=embeddings_cbrt_base,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=param_reproject_embeddings)

    # 6. train
    trainer : ModelTrainer = ModelTrainer(tagger_cbrt_base, corpus)

    trainer.train(os.path.join(flair_trainer_path_cbrt, Tensorboard_dir),
                  optimizer=AdamW, #FINETUNE
                  use_tensorboard=True,
                  tensorboard_log_dir=os.path.join(project_path, models_folder, "runs", Tensorboard_dir),
                  # metrics_for_tensorboard = [("macro avg", 'f1-score')] #default
                  learning_rate=7e-5, #FINETUNE
                  anneal_factor=0.5, #FINETUNE
                  patience=3, #FINETUNE
                  max_epochs=EPOCHS,
                  mini_batch_size=param_batch_size,
                  min_learning_rate=0.0000001, #FINETUNE
                  train_with_dev=False,
                  checkpoint=False,
                  save_final_model=False,
                  monitor_test=True,
                  embeddings_storage_mode='gpu')

    # to resume training
    # checkpoint = './FLAIR-ner/{}/cbrt/checkpoint.pt'.format(timestamp_folder)
    # trainerbis = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Free the memory of this run before the next one starts. Order matters:
    # empty_cache() can only hand back blocks that no live tensor occupies, so
    # the references are dropped (and reference cycles collected) first.
    del trainer, tagger_cbrt_base, embeddings_cbrt_base
    gc.collect()
    torch.cuda.empty_cache()