In [None]:
# Environment switch: True when the project lives on Google Drive and the notebook
# runs on Colab, False when running from a local checkout (project root is "../").
USE_DRIVE = True
if USE_DRIVE:
    project_path  = "drive/MyDrive/EFREI_CAMP/"
    # Colab-only import, kept inside this branch so the local path never needs it
    from google.colab import drive
    import subprocess
    import sys
    # Install the flair fork with the pip that belongs to the running interpreter.
    # check_call raises CalledProcessError when the install fails, instead of
    # letting the failure show up later as an unexplained `import flair` error.
    # NOTE(review): the git URL is not pinned to a commit/tag, so the installed
    # version can change between runs - consider pinning.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', "git+https://github.com/amtam0/flair.git"])
    drive.mount('/content/drive', force_remount=True)
else:
    project_path  = "../"

In [None]:
import datetime
import gc
import itertools
import os
import shutil
from typing import List

import pandas as pd
import torch
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import OneCycleLR

import flair
from flair.data import Corpus
from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, TokenEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

flair.set_seed(123) # fixed seed for reproducibility

In [None]:
# --- Project-relative locations of the data and of the trained models ---

# Folder (relative to project_path) under which models are stored
models_folder = 'training/models'

# Sub-folder of "training/GT" to read the data from; edit to select a dataset
timestamp_folder = '' #toedit

# Where the data resides
data_folder = os.path.join(
    project_path,
    'training/GT',
    timestamp_folder,
)

# Where the sequence-tagger models are saved
flair_trainer_path_seq = os.path.join(
    project_path,
    models_folder,
    'seq',
)

In [None]:
# Show the resolved model output directory as this cell's output
flair_trainer_path_seq

In [None]:
# Tag type the model is trained to predict
tag_type = 'ner'

# Column layout of the data files: column index -> annotation name
columns = {
    0: 'text',
    1: 'ner',
}

# Build the corpus from the train/dev/test files found in data_folder
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

# Make the tag dictionary from the corpus for the chosen tag type
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

### Check whether the GPU is used; expected cell output: `CUDA`

In [None]:
# Point flair at the GPU when PyTorch can see one, otherwise at the CPU.
# "CUDA" is printed only in the GPU case.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
if _device_name == 'cuda':
    print("CUDA")
flair.device = torch.device(_device_name)

## Start Multi-Model Training
### - Look for the `#FINETUNE` tag to find the tunable parameters
### - Links with further tips are attached next to some parameters
### - Make sure to read the `TransformerWordEmbeddings` and `trainer.train` docstrings for more details

In [None]:
#PAPER https://aclanthology.org/C18-1139.pdf
#PAPER https://aclanthology.org/N19-4010.pdf

# Hyper-parameter grid: one model is trained for every combination of the values
# listed below. Entries tagged #FINETUNE are the ones meant to be edited.
params_subtoken_pooling = ["first_last"] #FINETUNE https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md
params_use_context = [False] #FINETUNE https://arxiv.org/pdf/2011.06993.pdf see this paper to understand use_context
params_reproject_embeddings = [True] #FINETUNE
params_batch_size = [32] #FINETUNE
params_embs = ["all"] #FINETUNE ["Glove", "Fastext", "both", "Cbrt", "all"] https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
params_lr = [0.2] #FINETUNE factor to anneal learning rate ==> near 1 (no annealing)
params_use_flair_Embs = [False] #FINETUNE
params_pretrained_models = ["roberta-base"] #FINETUNE https://huggingface.co/transformers/v2.3.0/pretrained_models.html

hidden_size = 256
EPOCH = 50 #FINETUNE

# itertools.product visits the combinations in the same order as nested loops
# written in this argument order (the last list varies fastest).
param_grid = itertools.product(
    params_use_context,
    params_subtoken_pooling,
    params_reproject_embeddings,
    params_batch_size,
    params_embs,
    params_lr,
    params_use_flair_Embs,
    params_pretrained_models,
)

for (param_use_context, param_subtoken_pooling, param_reproject_embeddings,
     param_batch_size, param_embs, param_lr,
     params_use_flair_Emb, params_pretrained_model) in param_grid:

    # Name of the run: used both for the model folder and for the tensorboard log folder
    Tensorboard_dir = "lstm_uc{}_sp{}_re{}_bs{}_emb{}_fl{}_lr{}_layersAll".format(
        str(param_use_context)[:2], str(param_subtoken_pooling),
        str(param_reproject_embeddings)[0], str(param_batch_size),
        str(param_embs)[:], str(params_use_flair_Emb)[:], str(param_lr))
    # When several pretrained models are swept, add the model name so that runs
    # do not overwrite each other's folder. Single-model sweeps keep the
    # original folder name unchanged.
    if len(set(params_pretrained_models)) > 1:
        Tensorboard_dir += "_pm{}".format(str(params_pretrained_model).replace("/", "-"))

    # 4. initialize embeddings

    # Load the transformer only for the settings that actually stack it
    if param_embs in ("Cbrt", "all"):
        embeddings_cbrt_base = TransformerWordEmbeddings(params_pretrained_model,
                                                         subtoken_pooling=param_subtoken_pooling,
                                                         allow_long_sentences=True, #FINETUNE
                                                         layer_mean=True, #FINETUNE
                                                         use_context=param_use_context)
    else:
        embeddings_cbrt_base = None

    if params_use_flair_Emb:
        embedding_types = [
            FlairEmbeddings('fr-forward'),
            FlairEmbeddings('fr-backward'),
        ]
    else:
        embedding_types = []

    # Word-level / transformer embeddings are placed in front of the Flair ones
    if param_embs == "Glove":
        embedding_types.insert(0, WordEmbeddings('glove'))
    elif param_embs == "Fastext":
        embedding_types.insert(0, WordEmbeddings('fr'))
    elif param_embs == "both":
        embedding_types = [WordEmbeddings('glove'), WordEmbeddings('fr')] + embedding_types
    elif param_embs == "Cbrt":
        embedding_types.insert(0, embeddings_cbrt_base)
    elif param_embs == "all":
        embedding_types = [WordEmbeddings('glove'), WordEmbeddings('fr'), embeddings_cbrt_base] + embedding_types

    # Fail early with a clear message instead of building a model on an empty stack
    if not embedding_types:
        raise ValueError(
            "No embeddings selected: param_embs={!r} is not one of "
            "'Glove', 'Fastext', 'both', 'Cbrt', 'all' and Flair embeddings are disabled".format(param_embs))

    embeddings_seq: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger

    tagger_seq: SequenceTagger = SequenceTagger(hidden_size=hidden_size,
                                                embeddings=embeddings_seq,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True,
                                                reproject_embeddings=param_reproject_embeddings)

    # 6. train
    trainer: ModelTrainer = ModelTrainer(tagger_seq, corpus)
    trainer.train(os.path.join(flair_trainer_path_seq, Tensorboard_dir),
                  use_tensorboard=True,
                  tensorboard_log_dir=os.path.join(project_path, models_folder, "runs", Tensorboard_dir),
                  learning_rate=0.1, #FINETUNE
                  anneal_factor=param_lr,
                  patience=3, #FINETUNE
                  mini_batch_size=param_batch_size,
                  min_learning_rate=0.00001, #FINETUNE
                  max_epochs=EPOCH,
                  train_with_dev=False,#FINETUNE
                  checkpoint=False,
                  save_final_model=False,
                  monitor_test=True, #FINETUNE
                  embeddings_storage_mode ='gpu')

    # Kept for reference (disabled): reloading a trainer from a checkpoint file
    # checkpoint = os.path.join(flair_trainer_path_seq,"checkpoint.pt")
    # trainerbis = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Release this run's model before the next combination is built.
    # The references must be dropped (and collected) first: empty_cache() can only
    # hand back cached GPU memory that no live tensor still occupies.
    del trainer, tagger_seq, embeddings_seq, embedding_types, embeddings_cbrt_base
    gc.collect()
    torch.cuda.empty_cache()