# Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import collections, pandas as pd, numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from libact.query_strategies import UncertaintySampling, RandomSampling
# from active_learning_seq import RandomSamplingWithRetraining
import torch

from actleto import ActiveLearner, ActiveLearnerUiWidget, make_libact_strategy_ctor
from actleto.annotator.visualizers.seq_annotation import SeqAnnotationVisualizer

# Load dataset

In [3]:
from flair.datasets import ColumnCorpus
from model_wrappers import find_in_between, convert_y_to_bio_format
from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus
from libact_bert_creator import prepare_corpus
from utils_data import create_helper, convert_y_to_dict_format
import random

diagnosis = 'fib'

data_folder = '../../../data/' + diagnosis
corpus = ColumnCorpus(data_folder, {0 : 'text', 1 : 'ner'},
                                train_file='train.txt',
                                test_file='test.txt',
                                dev_file='dev.txt') # We do not need dev set

# Creating tag dictionaries
idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)
tags = list(set((tag.split('-')[1] for tag in idx2tag if len(tag.split('-')) > 1)))
print('Tags:', tags)


# Convert into the format suitable for training
X_train, y_train = prepare_corpus(corpus.train)
X_test, y_test = prepare_corpus(corpus.test)

# Shuffle X_train and y_train

def check_first_I_error(data, name):
    counter = 0
    for seq in data:
        if seq[0]=='I-'+diagnosis:
            seq[0] = 'B-'+diagnosis
            counter+=1
        for i in range(1, len(seq)):
            if seq[i-1]=='O' and seq[i]=='I-'+diagnosis:
                seq[i] = 'B-'+diagnosis
                counter+=1
    print("I've corrected", counter, "errors in", name, "dataset" )
    return data

y_train = check_first_I_error(y_train, 'train')
y_test = check_first_I_error(y_test, 'test')
    

# Convert into the format suitable for visualization
y_train_dict = convert_y_to_dict_format(X_train, y_train)
X_helper = create_helper(X_train)

2020-06-07 13:52:02,005 Reading data from ../../../data/fib
2020-06-07 13:52:02,006 Train: ../../../data/fib/train.txt
2020-06-07 13:52:02,006 Dev: ../../../data/fib/dev.txt
2020-06-07 13:52:02,007 Test: ../../../data/fib/test.txt
Tags: ['fib']
I've corrected 13 errors in train dataset
I've corrected 4 errors in test dataset


In [4]:
len(y_train)

8936

# Create seeding examples

In [5]:
from utils_data import sample_seed_elements_for_al


y_seed_dict = sample_seed_elements_for_al(y_train_dict, negative_size=100, 
                                          positive_size=25, random_seed=123)

print('Number of seed examples', len([e for e in y_seed_dict if e is not None]))

Number of seed examples 125


In [6]:

# [i for i in y_seed_dict if i is not None or not np.nan]


# Create model and active learner

In [7]:
BATCH_SIZE = 32
MAX_LEN = 100
PRED_BATCH_SIZE = 500
N_EPOCHS = 30

N_SAMPLES_PER_AL_ITER = 30
LEARNING_RATE = 5e-5
VALIDATION_RATIO = 0.1

PATIENCE = 1

BERT_MODEL_TYPE = '../../../data/Ru_bert_model/'
CACHE_DIR = '../../../cache_'+diagnosis+'/cache'

RANDOM_STATE = 2019
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7fe551888070>

# Loading saved seed

In [8]:
# comment this cell if you don't have saved seed
# import numpy as np

# y_seed_dict = np.load(CACHE_DIR+'.npy', allow_pickle=True).tolist()

# for i in range(len(y_seed_dict)):
#     if y_seed_dict[i] is not None:
#         if 'None' in y_seed_dict[i]:
#             y_seed_dict[i] = None

In [9]:
class RandomSamplingWithRetraining(RandomSampling):
    def __init__(self, *args, **kwargs):
        self.model = kwargs.pop('model', None)
        
        super().__init__(*args, **kwargs)
        
        self.model.train(self.dataset)
        
    def update(self, indexes, labels):
        self.model.train(self.dataset, indexes)

    def make_query(self):
        return super().make_query()

In [10]:
from libact_bert_creator import LibActBertCreator

bert_creator = LibActBertCreator(idx2tag=idx2tag,
                                 tag2idx=tag2idx,
                                 tokenizer_name=BERT_MODEL_TYPE, 
                                 bert_model_type=BERT_MODEL_TYPE,
                                 cache_dir=CACHE_DIR,
                                 n_epochs=N_EPOCHS,
                                 lr=LEARNING_RATE,
                                 bs=BATCH_SIZE,
                                 ebs=PRED_BATCH_SIZE,
                                 patience=PATIENCE)



# active_learn_alg_ctor = make_libact_strategy_ctor(lambda trn_ds: RandomSamplingWithRetraining(trn_ds, 
#                                                                                      model=bert_creator(
#                                                                                          valid_ratio=VALIDATION_RATIO,
#                                                                                          retrain_epochs=N_EPOCHS,
#                                                                                          autofill_similar_objects=True,
#                                                                                          n_upsample_positive=0.)
#                                                                                     ),
#                                                   max_samples_number=30)


active_learn_alg_ctor = make_libact_strategy_ctor(lambda trn_ds: UncertaintySampling(trn_ds, 
                                                                                     model=bert_creator(
                                                                                         valid_ratio=VALIDATION_RATIO,
                                                                                         retrain_epochs=N_EPOCHS,
                                                                                         autofill_similar_objects=True,
                                                                                         n_upsample_positive=0.)
                                                                                    ),
                                                  max_samples_number=30)

# Creating ActiveLearning object that implements AL logic.
active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                               X_full_dataset=X_helper.texts.tolist(),
                               y_full_dataset=y_seed_dict,
                               rnd_start_steps=0)

active_learner.start()

New indexes None
X shape (125,)
y shape (125,)
Number of all training examples:  112


Epoch:  20%|██        | 6/30 [00:06<00:25,  1.05s/it]


# Creating widget for annotation

In [11]:
preliminary_path = CACHE_DIR
# This try-catch block is needed to stop autosave thread in 
#case we invoke the cell multiple times.
try:
    if active_learn_ui:
        active_learn_ui.stop()
except NameError:
    pass

# Creaing the active learner widget itself and configure 
# it with active_learner, X_helper.
active_learn_ui = ActiveLearnerUiWidget(active_learner=active_learner,
                                        X_helper=X_helper,
                                        display_feature_table=False,
                                        drop_labels=[],
                                        y_labels=None,
                                        save_path=preliminary_path,
                                        save_time=120, 
                                        visualizer=SeqAnnotationVisualizer(tags=tags))

active_learn_ui

ActiveLearnerUiWidget(children=(HBox(children=(Button(description='Next iteration', style=ButtonStyle()), Labe…

# Evaluate

In [12]:
results = []
results_s = []
precisions = []
precisions_s = []
recalls = []
recalls_s = []

In [13]:
running = 6
epoch = 0

In [14]:
epoch

0

In [15]:
from bert_sequence_tagger.bert_utils import prepare_flair_corpus
from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level

def get_seq_tagger(active_learner):
    return active_learner._active_learn_algorithm._libact_query_alg.impl.model._model

def func(list):
    list_s = []
    for i in list:
        if 'B-'+diagnosis in i or 'I-'+diagnosis in i:
            list_s.append(1)
        else:
            list_s.append(0)
    return list_s

In [16]:
from seqeval.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import f1_score as f1_score_s
from sklearn.metrics import precision_score as precision_score_s
from sklearn.metrics import recall_score as recall_score_s

seq_tagger = get_seq_tagger(active_learner)
preds = seq_tagger.predict(X_test)[0]
f1 = f1_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)


y_test_s = func(y_test)
preds_s = func(preds)

f1_s = f1_score_s(y_test_s, preds_s)
prec_s = precision_score_s(y_test_s, preds_s)
rec_s = recall_score_s(y_test_s, preds_s)


print(f'F1_score: {f1} , precision: {prec} , recall: {rec}')
print('--------\nlike a classification problem')
print(f'F1_score: {f1_s} , precision: {prec_s} , recall: {rec_s}')



F1_score: 0.7394957983193275 , precision: 0.6285714285714286 , recall: 0.8979591836734694
--------
like a classification problem
F1_score: 0.8141592920353983 , precision: 0.7076923076923077 , recall: 0.9583333333333334


In [17]:
results.append(f1)
results_s.append(f1_s)
precisions.append(prec)
precisions_s.append(prec_s)
recalls.append(rec)
recalls_s.append(rec_s)

In [18]:
print('F1_score:', results)
print('precisions:', precisions)
print('recalls:', recalls)
print('-------------\nlike a classification problem')
print('F1_score:', results_s)
print('precisions:', precisions_s)
print('recalls:', recalls_s)

F1_score: [0.7394957983193275]
precisions: [0.6285714285714286]
recalls: [0.8979591836734694]
-------------
like a classification problem
F1_score: [0.8141592920353983]
precisions: [0.7076923076923077]
recalls: [0.9583333333333334]


In [19]:
import matplotlib.pyplot as plt

plt.plot(results, label='in sequence')
plt.plot(results_s, label='classification')
plt.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

In [20]:
array_of_y_test = []
array_of_preds = []
array_of_x = []
number_of_correct = 0
for i in range(len(y_test)):
    if y_test[i] == preds[i]:
        number_of_correct += 1
        
    else:
        array_of_preds.append(preds[i])
        array_of_y_test.append(y_test[i])
        array_of_x.append(X_test[i])

In [21]:
# epoch = 13
name_of_file = 'wrong_answers/'+ diagnosis + '/' + str(running) + '_' + str(epoch) + '_epoch_wrong_answers.txt'
sample = open(name_of_file, 'w') 
print("correct answers:", number_of_correct, file = sample)
print("wrong answers:", len(array_of_preds), file = sample)

print(f'F1_score: {f1} , precision: {prec} , recall: {rec}', file = sample)
print('--------\nlike a classification problem', file = sample)
print(f'F1_score: {f1_s} , precision: {prec_s} , recall: {rec_s}', file = sample)

print(file = sample)

for i_of_error in range(len (array_of_y_test)):
    a_set = set(array_of_y_test[i_of_error]) 
    b_set = set(array_of_preds[i_of_error]) 
    if ('B-'+diagnosis in a_set or 'I-'+diagnosis in a_set) and ('B-'+diagnosis in b_set or 'I-'+diagnosis in b_set):
        flag_of_class = '+++' 
    else: 
        flag_of_class = '---'
    print('Text   ||Actual||Prediction', flag_of_class, file = sample)
    for i in range(len (array_of_y_test[i_of_error])):
        if array_of_y_test[i_of_error][i] != array_of_preds[i_of_error][i]:
            print(array_of_x[i_of_error][i], "  ", array_of_y_test[i_of_error][i], "   ", array_of_preds[i_of_error][i], file = sample)

    print("actual:", array_of_y_test[i_of_error], file = sample)
    print("pred:  ", array_of_preds[i_of_error], file = sample)
    print(array_of_x[i_of_error], file = sample)
    print(file = sample)
print('F1_score:', results, file = sample)
print('precisions:', precisions, file = sample)
print('recalls:', recalls, file = sample)
print('-------------\nlike a classification problem', file = sample)
print('F1_score:', results_s, file = sample)
print('precisions:', precisions_s, file = sample)
print('recalls:', recalls_s, file = sample)
sample.close()
epoch += 1

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/opt/.pyenv/versions/3.7.4/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/.pyenv/versions/3.7.4/lib/python3.7/threading.py", line 1178, in run
    self.function(*self.args, **self.kwargs)
  File "/opt/.pyenv/versions/3.7.4/lib/python3.7/site-packages/actleto/annotator/ui_widget.py", line 116, in _save_on_timer
    self._save_answers(os.path.splitext(self._save_path)[0] + '_autosave')
  File "/opt/.pyenv/versions/3.7.4/lib/python3.7/site-packages/actleto/annotator/ui_widget.py", line 190, in _save_answers
    np.save(path, self._active_learner.get_annotation())
  File "<__array_function__ internals>", line 6, in save
  File "/opt/.pyenv/versions/3.7.4/lib/python3.7/site-packages/numpy/lib/npyio.py", line 530, in save
    fid = open(file, "wb")
FileNotFoundError: [Errno 2] No such file or directory: '../../../cache_fib/cache_autosave.npy'



# You can turn on and off some features of AL and models

In [18]:
active_learner._active_learn_algorithm._libact_query_alg.impl.model._self_training_samples = 0
active_learner._active_learn_algorithm._libact_query_alg.impl.model._n_upsample_positive = 0.4

In [26]:
active_learner._active_learn_algorithm._libact_query_alg.impl.model._n_upsample_positive = 0.