In [1]:
# This notebook contains the script used to train the NER models on the AWS servers.

In [None]:
# !unzip Archive.zip

# download the dependencies
import sys
!{sys.executable} -m pip install allennlp==0.9.0 flair

# import the dependencies
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.embeddings import FlairEmbeddings, TransformerWordEmbeddings, StackedEmbeddings, ELMoEmbeddings
from flair.datasets import DataLoader
import random

# the class for training the models
class Trainer:
    """Build and train a flair NER sequence tagger on a random subset of a dataset.

    On construction the instance:

    1. samples ``train_size`` of the sentences found in
       ``<data_folder>/train.txt`` and writes them to ``train_<pct>.txt``
       in the same folder,
    2. loads a ``ColumnCorpus`` from that file plus ``test.txt`` / ``dev.txt``,
    3. stacks the requested embeddings,
    4. creates the ``SequenceTagger`` and its ``ModelTrainer``.

    Call :meth:`train` afterwards to run the actual training.
    """

    def __init__(self, dataset_i, sen_size, train_size=1.0, embedding_types=None, use_crf=True, hidden_size=256,
                 learning_rate=0.05, batch_size=32, max_epochs=150, output_folder="resources/models"):
        """Prepare corpus, embeddings and model.

        Args:
            dataset_i: identifier of the dataset; used as the sub-folder name.
            sen_size: sentence-size identifier; the data is read from
                ``./s_<sen_size>/<dataset_i>``.
            train_size: fraction of the training sentences to keep.
            embedding_types: names of the embeddings to stack; recognised
                names are ``'flair'``, ``'elmo'`` and ``'bert'``.
                Defaults to ``['flair']``.
            use_crf: forwarded to ``SequenceTagger``.
            hidden_size: forwarded to ``SequenceTagger``.
            learning_rate: forwarded to ``ModelTrainer.train``.
            batch_size: forwarded to ``ModelTrainer.train`` as ``mini_batch_size``.
            max_epochs: forwarded to ``ModelTrainer.train``.
            output_folder: folder handed to ``ModelTrainer.train`` as its base path.

        Raises:
            ValueError: if none of the requested embeddings is recognised.
        """
        # a None sentinel avoids sharing one mutable default list between instances
        if embedding_types is None:
            embedding_types = ['flair']
        elif isinstance(embedding_types, str):
            # a single name is accepted as shorthand for a one-element list
            embedding_types = [embedding_types]
        else:
            embedding_types = list(embedding_types)

        self.data_columns = {0: 'text', 1: 'ner'}
        self.tag_type = 'ner'
        # str() lets callers pass the sentence size as an int as well as a str
        self.data_folder = './s_' + str(sen_size) + '/' + str(dataset_i)
        self.embedding_types = embedding_types
        self.use_crf = use_crf
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.train_size = train_size
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.output_folder = output_folder

        print('Creating corpus')
        self.create_corpus()

        print(f'Initializing embeddings: {" ".join(map(str, self.embedding_types))}')
        self.set_embeddings()

        print('Initializing model')
        self.set_train_model()

        print('Initialization done')

    # read the sentences from the file
    def get_sentences(self, size):
        """Return a random sample of the sentences in ``train.txt``.

        Sentences are runs of non-blank lines separated by blank lines. Each
        sentence is returned as the list of its raw lines, every line ending
        with a newline.

        Args:
            size: fraction of all sentences to return.

        Returns:
            A list with ``round(n_sentences * size)`` sentences in random order.
        """
        with open(self.data_folder + '/train.txt', 'r') as f:
            lines = f.readlines()

        sentences = []
        sentence = []
        for line in lines:
            if line.isspace():
                # repeated blank lines must not produce empty sentences,
                # otherwise they would count towards the sample size
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                # the last line of a file may lack its newline; add it so the
                # token stays on its own line when written back by make_file
                if not line.endswith('\n'):
                    line += '\n'
                sentence.append(line)

        # keep the final sentence when the file does not end with a blank line
        if sentence:
            sentences.append(sentence)

        random.shuffle(sentences)

        return sentences[:round(len(sentences) * size)]

    # create a new training file with size s
    def make_file(self, sentences, size):
        """Write ``sentences`` to ``train_<pct>.txt`` inside the data folder.

        Args:
            sentences: list of sentences, each a list of newline-terminated lines.
            size: fraction used for sampling; only used to build the file name
                as ``int(size * 100)``.

        Returns:
            The file name (not the full path) of the file that was written.
        """
        f_name = 'train_' + str(int(size * 100)) + '.txt'
        path = self.data_folder + '/' + f_name

        with open(path, 'w') as f:
            for sentence in sentences:
                f.writelines(sentence)
                # blank line terminates the sentence
                f.write('\n')

        return f_name

    def make_train_file(self, size):
        """Sample ``size`` of the training sentences and write them to a new file.

        Returns:
            The name of the newly written training file.
        """
        tr_sentences = self.get_sentences(size)
        return self.make_file(tr_sentences, size)

    # create the corpus for the trainer
    def create_corpus(self):
        """Create the sub-sampled training file and load the corpus and tag dictionary."""
        new_train_file = self.make_train_file(self.train_size)

        self.corpus: Corpus = ColumnCorpus(self.data_folder, self.data_columns,
                                           train_file=new_train_file,
                                           test_file='test.txt',
                                           dev_file='dev.txt')

        self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)

    def get_tag_dictionary(self):
        """Return the tag dictionary built from the corpus."""
        return self.tag_dictionary

    def get_subset(self, subset):
        """Return one split of the corpus.

        Args:
            subset: ``'train'``, ``'test'`` or ``'dev'``.

        Returns:
            The requested split, or ``None`` for any other ``subset`` value.
        """
        if subset in ('train', 'test', 'dev'):
            return getattr(self.corpus, subset)
        return None

    # set the embeddings for the trainer
    def set_embeddings(self):
        """Stack the embeddings named in ``self.embedding_types``.

        Raises:
            ValueError: if no recognised embedding was requested; without this
                check the failure would only surface later as a missing
                ``self.embeddings`` attribute.
        """
        embeddings = []

        if 'flair' in self.embedding_types:
            embeddings.append(FlairEmbeddings('news-forward-fast'))
            embeddings.append(FlairEmbeddings('news-backward-fast'))

        if 'elmo' in self.embedding_types:
            embeddings.append(ELMoEmbeddings())

        if 'bert' in self.embedding_types:
            embeddings.append(TransformerWordEmbeddings('bert-base-cased'))

        if not embeddings:
            raise ValueError(
                "No embeddings selected; expected at least one of 'flair', 'elmo', 'bert', "
                f"got {self.embedding_types!r}"
            )

        self.embeddings = StackedEmbeddings(embeddings=embeddings)

    # set the trainer
    def set_train_model(self):
        """Create the sequence tagger and the trainer that will fit it on the corpus."""
        self.tagger: SequenceTagger = SequenceTagger(hidden_size=self.hidden_size,
                                                     embeddings=self.embeddings,
                                                     tag_dictionary=self.tag_dictionary,
                                                     tag_type=self.tag_type,
                                                     use_crf=self.use_crf)

        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)

    # train the NER model
    def train(self):
        """Train the tagger, writing the trainer's output to ``self.output_folder``.

        Returns:
            Whatever ``ModelTrainer.train`` returns (previously discarded).
        """
        return self.trainer.train(
            self.output_folder,
            learning_rate=self.learning_rate,
            mini_batch_size=self.batch_size,
            max_epochs=self.max_epochs,
        )
              
# parameters              
sentence_sizes = ['50']
train_sizes = [0.2, 0.4, 0.6, 0.8]
data_sets = ['2', '3', '4', '5']
e_types = ['elmo']

import os

# column layout of the data files and the tag that is evaluated
columns = {0: 'text', 1: 'ner'}
tag_type = 'ner'

for sentence_size in sentence_sizes:

    # one results folder per sentence size; also creates ./results if needed
    # and does nothing when the folder already exists
    results_dir = './results/' + sentence_size
    os.makedirs(results_dir, exist_ok=True)

    # do multiple runs
    for data_set in data_sets:
        for train_size in train_sizes:
            t = Trainer(data_set, sentence_size, train_size, e_types, use_crf=True)
            t.train()

            # must match the file name that Trainer.make_file generates
            train_pct = str(int(train_size * 100))
            data_folder = './s_' + sentence_size + '/' + data_set

            # reload the corpus from disk so evaluation runs on fresh sentences
            corpus: Corpus = ColumnCorpus(data_folder, columns,
                                          train_file='train_' + train_pct + '.txt',
                                          test_file='test.txt',
                                          dev_file='dev.txt')

            # every run writes to the same output folder, so the model has to
            # be evaluated before the next Trainer overwrites it
            model = SequenceTagger.load(os.path.join(t.output_folder, 'best-model.pt'))

            # the train size is part of the predictions file name so that runs
            # with different train sizes no longer overwrite each other
            run_id = data_set + '_' + train_pct
            result, score = model.evaluate(
                corpus.test,
                out_path=results_dir + '/predictions_' + run_id + '.txt',
            )

            with open(results_dir + '/results_' + run_id + '.txt', 'w') as f:
                f.write(result.detailed_results)

