In [None]:
# Install the flair NLP library that the rest of this notebook imports.
# NOTE(review): the version is not pinned. Later cells call
# `SequenceTagger.load_from_file` and `FlairEmbeddings(..., use_cache=True)`;
# confirm which flair release provides these and pin it here.
! pip install flair

# Load data

In [None]:
# Download the TIGER corpus archive (CoNLL-09 export, release 2.2) into the
# working directory and unpack it there.
! wget https://www.ims.uni-stuttgart.de/documents/ressourcen/korpora/tiger-corpus/download/tigercorpus-2.2.conll09.tar.gz
! tar -xvzf tigercorpus-2.2.conll09.tar.gz
# List the working directory after extraction.
! ls

In [None]:
# Download the two EmpiriST 2015 gold archives (CMC and web).
! wget https://sites.google.com/site/empirist2015/home/shared-task-data/empirist_gold_cmc.zip
! wget https://sites.google.com/site/empirist2015/home/shared-task-data/empirist_gold_web.zip

# Unpack both archives into the working directory.
! unzip empirist_gold_cmc.zip
! unzip empirist_gold_web.zip

In [None]:
# List the working directory contents (IPython automagic form of `!ls`).
ls

In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus


# Fix the seed of Python's `random` module so the data splits are repeatable.
import random
random.seed(0)

# Column layout of the corpus file: position -> name of the annotation layer.
column_names = ['id', 'text', 'lemma', 'placeholder', 'pos']
columns = dict(enumerate(column_names))

# Folder that holds the train, test and dev files.
data_folder = '.'

# Fraction handed to `corpus.downsample` and used in the output folder name.
split = 0.7

In [None]:
# retrieve corpus using column format, data folder and the names of the train, dev and test files
# Only a train file is passed; the test/dev arguments are left commented out below.
# NOTE(review): presumably flair carves dev/test portions out of the train file
# when none are given (corpus.test is used in the next cell) — confirm.
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file='tiger_release_aug07.corrected.16012013.conll09',)
#                                                            test_file='test.txt',
#                                                            dev_file='dev.txt')

# Downsample the train and test portions with the `split` factor; the dev
# portion is explicitly excluded from downsampling.
corpus: Corpus = corpus.downsample(split, downsample_train=True, downsample_dev=False, downsample_test=True)
# Last expression of the cell: the corpus statistics are shown as cell output.
corpus.obtain_statistics()

In [None]:
# Show the second test sentence with its POS tags, then the training set size.
example_sentence = corpus.test[1]
print('EXAMPLE SEQUENCE', example_sentence.to_tagged_string('pos'))

n_training_sequences = len(corpus.train)
print('# TRAINING SEQUENCE', n_training_sequences)

# Train Model

In [None]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from torch.optim.adam import Adam
from typing import List

from flair.models import SequenceTagger

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

from flair.visual.training_curves import Plotter

In [None]:
# 1. get the corpus
# (nothing to do here: `corpus` was already loaded and downsampled above)


# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Print the index -> tag mapping for inspection.
print(tag_dictionary.idx2item)

In [None]:
# 4. initialize embeddings
# Only the forward and backward German Flair embeddings are stacked; the
# classic word embeddings entry is left disabled.
embedding_types: List[TokenEmbeddings] = [
    # WordEmbeddings('de'),
    FlairEmbeddings('german-forward'),
    FlairEmbeddings('german-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
# Uses the tag dictionary and tag type defined in the previous cell, with a CRF enabled.
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# The output folder name embeds the downsampling factor, e.g.
# './taggers/pos_tiger0.7' for data_folder='.' and split=0.7.
save_to = '{}/taggers/pos_tiger{}'.format(data_folder, split)
# NOTE(review): embeddings_storage_mode='none' presumably stops embeddings from
# being kept between epochs — confirm against the flair trainer documentation.
trainer.train(save_to,
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=20,
              embeddings_storage_mode='none',
              # checkpoint=True
             )

# 8. plot training curves (optional)
# Both plots read files from the training output folder `save_to`.
plotter = Plotter()
plotter.plot_training_curves(save_to + '/loss.tsv')
plotter.plot_weights(save_to + '/weights.txt')


# Evaluation

### 1. prepare test data

In [None]:
from sklearn.metrics import classification_report
import csv
import pandas as pd
import numpy as np
import re

# Read the three evaluation sets into DataFrames.
#
# * The files are opened in `with` blocks so the handles are closed as soon as
#   pandas has consumed them (they were previously left open for the lifetime
#   of the kernel).
# * quoting=csv.QUOTE_NONE: the files hold raw tokens, so a `"` token must be
#   read as a literal character and not as the start of a quoted field that
#   would swallow the following lines.
# * keep_default_na=False: tokens such as "null", "NA" or "nan" must stay
#   strings; pandas would otherwise turn them into NaN, which breaks the later
#   `" ".join(...)` over the word lists.
token_read_options = dict(quoting=csv.QUOTE_NONE, keep_default_na=False)

# Space-separated file with three columns: token, gold tag, predicted tag.
with open(data_folder + "/test/test.tsv", encoding="utf-8") as test_file:
  test = pd.read_csv(test_file, sep=' ', names = ["word", "label", "pred"], **token_read_options)

# Tab-separated files with two columns: token, gold tag.
with open(data_folder + "/test/cmc_test.tsv", encoding="utf-8") as test_cmc_file:
  cmc_test = pd.read_csv(test_cmc_file, sep='\t', names = ["word", "label"], **token_read_options)

with open(data_folder + "/test/web_test.tsv", encoding="utf-8") as test_web_file:
  web_test = pd.read_csv(test_web_file, sep='\t', names = ["word", "label"], **token_read_options)

In [None]:
# Page through test.tsv in the output folder of the split-0.7 tagger.
# NOTE(review): the folder name is hard-coded and only matches `save_to` while
# split == 0.7; the file is presumably written by trainer.train — confirm.
! more taggers/pos_tiger0.7/test.tsv

In [None]:
# Preview the first five rows whose predicted tag differs from the gold tag.
mismatch_mask = test["label"].ne(test["pred"])
test.loc[mismatch_mask].head(5)

In [None]:
# Print the (rows, columns) shape of each evaluation frame, then preview the web set.
for frame in (test, cmc_test, web_test):
  print(frame.shape)

web_test.head(6)

In [None]:
# Pull the token and gold-label columns of each evaluation frame out as plain lists.
test_words, test_labels = test["word"].tolist(), test["label"].tolist()
cmc_words, cmc_labels = cmc_test["word"].tolist(), cmc_test["label"].tolist()
web_words, web_labels = web_test["word"].tolist(), web_test["label"].tolist()

# Number of tokens and number of distinct tags in the TEST set, plus a sample of the tags.
unique_test_labels = set(test_labels)
print(len(test_labels))
print(len(unique_test_labels))
labels = list(unique_test_labels)
print(labels[:10])

# Distinct tags of the CMC and WEB sets; only the CMC count is printed.
set_labels_cmc = list(set(cmc_labels))
set_labels_web = list(set(web_labels))
print(len(set_labels_cmc))

In [None]:
from sklearn import preprocessing

# Fit a sparse-output label binarizer on the distinct gold tags of the CMC set.
# NOTE(review): the binarizer only knows CMC tags, yet the next cell also uses
# it for the TEST and WEB labels — confirm that this is intended.
lb = preprocessing.LabelBinarizer(sparse_output=True)
lb.fit(set_labels_cmc)

In [None]:
# Binarize the gold labels of all three evaluation sets with the shared
# binarizer `lb` (fitted on the CMC label set in the previous cell).
y_test = lb.transform(test_labels)
y_cmc = lb.transform(cmc_labels)
y_web = lb.transform(web_labels)

In [None]:
def _as_text(tokens):
  """Join a list of tokens into one space-separated string."""
  return " ".join(tokens)


# One long space-separated string per evaluation set; used as tagger input.
X_test = _as_text(test_words)
X_cmc = _as_text(cmc_words)
X_web = _as_text(web_words)

### 2. evaluate the model

In [None]:
from flair.data import Sentence

# Load the best checkpoint of four taggers, one per downsampling factor
# (0.05, 0.1, 0.5 and 1.0), from the taggers/pos_tiger<split> folders.
# NOTE(review): the config cell sets split = 0.7, so these folders only exist
# if the training cell was run once per factor beforehand — confirm.
model_005 = SequenceTagger.load_from_file('{}/taggers/pos_tiger0.05/best-model.pt'.format(data_folder))
model_01 = SequenceTagger.load_from_file('{}/taggers/pos_tiger0.1/best-model.pt'.format(data_folder))
model_05 = SequenceTagger.load_from_file('{}/taggers/pos_tiger0.5/best-model.pt'.format(data_folder))
model_1 = SequenceTagger.load_from_file('{}/taggers/pos_tiger1.0/best-model.pt'.format(data_folder))

In [None]:
# Tagger trained on the 0.5 downsample, used for a quick sanity check.
model = SequenceTagger.load_from_file('{}/taggers/pos_tiger0.5/best-model.pt'.format(data_folder))

# Two hand-written German example sentences.
sentence1 = Sentence('Mit der Ablehnung des Scheidungsabkommens zwischen dem Vereinigten Königreich und der EU dauert die Ungewissheit an .')
sentence2 = Sentence('Die zentrale Weichenstellung für das Land dürfte nun weiter aufgeschoben werden .')
example_sentences = (sentence1, sentence2)

# Tag both sentences first, then print them with their predicted tags.
for example in example_sentences:
  model.predict(example)

for example in example_sentences:
  print(example.to_tagged_string())

In [None]:
def predict(test_set, model):
  """Tag one of the prepared evaluation texts and return it as a tagged string.

  Args:
    test_set: "CMC", "WEB" or "TEST"; selects the joined token string
      X_cmc, X_web or X_test respectively.
    model: tagger whose `predict` method is applied to the sentence.

  Returns:
    The result of `Sentence.to_tagged_string()` after prediction.

  Raises:
    ValueError: if `test_set` is not one of the three known names. Previously
      the function fell through and silently returned None.
  """
  # Only the selected global is looked up, so the other texts need not exist.
  if test_set == "CMC":
    text = X_cmc
  elif test_set == "WEB":
    text = X_web
  elif test_set == "TEST":
    text = X_test
  else:
    raise ValueError(
        'unknown test_set {!r}; expected "CMC", "WEB" or "TEST"'.format(test_set))

  # The whole evaluation set is wrapped in a single Sentence object.
  sentences = Sentence(text)
  # The return value of predict is not needed: the tags are read back from
  # the Sentence object itself.
  model.predict(sentences)
  return sentences.to_tagged_string()

In [None]:
# Tag the WEB evaluation text with the tagger loaded from pos_tiger1.0 and
# print the resulting tagged string.
print(predict(test_set="WEB", model=model_1))

In [None]:
def _parse_tagged_string(tagged_text):
  """Extract (word, tag) pairs from text shaped like `word <TAG> word <TAG> ...`.

  Every pair is matched as a whole, so the closing `>` of the final tag is not
  left attached to it (splitting on " <" / "> " did leave it attached), and a
  trailing newline in the file is ignored.
  """
  return re.findall(r"(\S+) <(\S+)>(?!\S)", tagged_text)


def evaluate(test_set, model):
  """Print a per-tag classification report for stored tagger predictions.

  Reads `<data_folder>/test/eval_results/preds<test_set>_tagger<model>.txt`,
  which is expected to contain a tagged string (`word <TAG> word <TAG> ...`),
  and compares the predicted tags with the gold labels of the chosen set.

  Args:
    test_set: "CMC", "WEB" or "TEST"; selects cmc_labels, web_labels or
      test_labels as the gold standard.
    model: tagger identifier used in the predictions file name, e.g. "0.1".

  Raises:
    ValueError: if `test_set` is not one of the three known names.
    AssertionError: if the number of predictions differs from the number of
      gold labels.
  """
  if test_set == "CMC":
    gold_labels = cmc_labels
  elif test_set == "WEB":
    gold_labels = web_labels
  elif test_set == "TEST":
    gold_labels = test_labels
  else:
    raise ValueError(
        'unknown test_set {!r}; expected "CMC", "WEB" or "TEST"'.format(test_set))

  preds_path = data_folder + "/test/eval_results/preds{}_tagger{}.txt".format(test_set, model)
  # Explicit UTF-8, like the other file reads in this notebook; the tokens are
  # German text and must not depend on the platform default encoding.
  with open(preds_path, "r", encoding="utf-8") as fin:
    preds_string = fin.read()

  preds_df = pd.DataFrame(_parse_tagged_string(preds_string), columns = ["word", "pred"])
  print(preds_df.head(3))
  preds_labels = preds_df.pred.tolist()

  print(len(preds_labels))
  print(len(gold_labels))
  assert len(preds_labels) == len(gold_labels), \
      "number of predictions does not match number of gold labels"

  # Score the string labels directly. The report is restricted to the tags
  # that occur in the gold data of this set, and sklearn derives the row names
  # from that same list, so names and scores cannot get out of step and no
  # globally fitted LabelBinarizer has to be refitted per test set.
  report_labels = sorted(set(gold_labels))
  print(classification_report(gold_labels, preds_labels, labels=report_labels))

In [None]:
"""before running the function it is important to reassure that the LabelBinarizer
is fitted on the "right" label set. The CMC and WEB test sets contain fewer
labels than the original TEST set, so the predictions and the gold labels must
be transformed with the LabelBinarizer fitted for the corresponding set."""

# Score the stored predictions of tagger "0.1" on the CMC set; this reads
# test/eval_results/predsCMC_tagger0.1.txt under data_folder.
evaluate(test_set="CMC", model="0.1")

# Tuning the model

In [None]:
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

from flair.hyperparameter.param_selection import SequenceTaggerParamSelector, OptimizationValue

In [None]:
# 1. define your search space
search_space = SearchSpace()
# Embeddings: only one option is active (forward + backward German Flair
# embeddings), so this dimension currently offers no real choice.
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    [
    #WordEmbeddings('de'),
    FlairEmbeddings('german-forward', use_cache=True),
    FlairEmbeddings('german-backward', use_cache=True)],
    #[FlairEmbeddings('german-forward', use_cache=True),
    #FlairEmbeddings('german-backward', use_cache=True)]
])
# Discrete choices for hidden size, learning rate and mini-batch size; the RNN
# layer count and dropout dimensions are left disabled.
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
#search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
#search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

# 2. create the parameter selector
# Results go to <data_folder>/optim_results. The selector reuses the `corpus`
# and `tag_type` defined in the training section and is configured to optimise
# the dev score with micro F1 as the evaluation metric.
save_optim_to = '{}/optim_results'.format(data_folder)
param_selector = SequenceTaggerParamSelector(
    corpus, 
    tag_type, 
    save_optim_to, 
    max_epochs=20,
    evaluation_metric = EvaluationMetric.MICRO_F1_SCORE,
    training_runs=3,
    optimization_value=OptimizationValue.DEV_SCORE
)

In [None]:
# 3. start the optimization
# NOTE(review): max_evals=100 combined with training_runs=3 and max_epochs=20
# on the selector looks very expensive — confirm the budget before running.
param_selector.optimize(search_space, max_evals=100)