In [1]:
import os
from src.neural_baseline import *
from src.utils.conlleval import *

print(f'Current working directory: {os.getcwd()}')
parent_dir = os.path.dirname(os.getcwd())
print(f'Parent directory: {parent_dir}')
os.chdir(parent_dir)
print(f'Current working directory: {os.getcwd()}')
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload

# notebook will reload external python modules;
%autoreload 2 

Current working directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project\notebooks
Parent directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project
Current working directory: d:\azeem\Documents\UNIGE\MSc CS\Semester IV\METL\ner_seq2seq_project


#### Utility functions

In [2]:
def calculate_metrics(dataset, ner_model, mapping):
    """Run ``ner_model`` over ``dataset`` and score it with ``evaluate``.

    Args:
        dataset: Iterable of ``(x, y)`` batches. ``x`` is handed to
            ``ner_model.predict``; ``y`` holds the gold tag ids.
        ner_model: Object exposing ``predict(x, verbose=0)`` whose output
            carries one score per tag along the last axis.
        mapping: Lookup from tag id to tag string.

    Returns:
        Whatever ``evaluate(real_tags, predicted_tags, verbose=True)`` returns
        (the calling cell indexes it as precision, recall, f1).
    """
    gold_batches = []
    pred_batches = []

    for features, gold in dataset:
        # verbose=0 requests a silent prediction call.
        scores = ner_model.predict(features, verbose=0)

        # Highest-scoring tag id per position, flattened to one dimension.
        pred_flat = np.argmax(scores, axis=-1).reshape(-1)
        gold_flat = np.reshape(gold, [-1])

        # A position is kept only when BOTH the gold id and the predicted id
        # are non-zero (id 0 is presumably padding -- confirm against mapping).
        keep = np.logical_and(gold_flat > 0, pred_flat > 0)

        gold_batches.append(gold_flat[keep])
        pred_batches.append(pred_flat[keep])

    gold_ids = np.concatenate(gold_batches)
    pred_ids = np.concatenate(pred_batches)

    # Translate ids to tag strings; all batches are scored as one flat sequence.
    predicted_tags = [mapping[tag_id] for tag_id in pred_ids]
    real_tags = [mapping[tag_id] for tag_id in gold_ids]

    return evaluate(real_tags, predicted_tags, verbose=True)

## Baseline n°1: Neural Network Implementation with Keras

In [3]:
# --- experiment configuration ------------------------------------------------
vocab_size = 20000
batch_size = 32
epochs = 10
sample_text = "eu rejects german call to boycott british lamb"

# --- data, tag table and vocabulary -------------------------------------------
print(f"processing data and preparing vocabulary of size {vocab_size}...")
conll_data = load_and_prepare_data()
mapping = make_tag_lookup_table()
vocabulary = get_vocabulary(conll_data, vocab_size)

# --- datasets -------------------------------------------------------------------
print(f"preparing datasets...")
# The lookup layer is not used in this cell; a later cell passes it to predict_sample.
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)
train_dataset, val_dataset = prepare_datasets(vocabulary, batch_size)

# --- model ----------------------------------------------------------------------
num_tags = len(mapping)
print(f"creating model...\n")
ner_model = create_model(num_tags, vocab_size)

# --- training (only the training split is passed in) ----------------------------
print(f"training model...\n")
compile_and_fit(ner_model, train_dataset, epochs=epochs)

processing data and preparing vocabulary of size 20000...


Found cached dataset conll2003 (C:/Users/azeem/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

preparing datasets...
creating model...

training model...

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [4]:
# Tag the sample sentence with the trained model.
print(predict_sample(ner_model, sample_text, mapping, lookup_layer))

# Score the model on the validation split.
print(f"\ncalculating metrics...\n")
res = calculate_metrics(val_dataset, ner_model, mapping)

# res is indexed as (precision, recall, f1); the labels are left-padded so the
# colons line up in the printed summary.
print("\n")
for _metric_idx, _metric_label in enumerate(("precision", "   recall", "       f1")):
    print(f"{_metric_label}: \t{res[_metric_idx]:.2f}")

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O']

calculating metrics...

processed 51362 tokens with 5942 phrases; found: 5147 phrases; correct: 3845.
accuracy:  61.61%; (non-O)
accuracy:  93.29%; precision:  74.70%; recall:  64.71%; FB1:  69.35
              LOC: precision:  82.26%; recall:  81.00%; FB1:  81.62  1809
             MISC: precision:  74.94%; recall:  67.14%; FB1:  70.82  826
              ORG: precision:  69.29%; recall:  57.20%; FB1:  62.66  1107
              PER: precision:  69.11%; recall:  52.71%; FB1:  59.81  1405


precision: 	74.70
   recall: 	64.71
       f1: 	69.35


## Baseline n°2: CRF Implementation with sklearn-crfsuite

In [5]:
from src.crf_baseline import * 
from IPython.display import display

# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload

# notebook will reload external python modules;
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def read_raw_input(filename, encoding=None):
    """Read a train/test file and return its contents as a list of list of lists.

    The innermost list is one record: the whitespace-separated fields of a
    single line (4 items per word in the CoNLL-2003 files used here).
    The middle-level list contains all the records in one sentence.

    Sentence boundaries: a sentence is closed by a record whose first two
    fields are both ``'.'``. Blank lines and ``-DOCSTART-`` lines are skipped
    and do NOT end a sentence.
    NOTE(review): text that does not end with a period record is therefore
    merged into the following sentence -- confirm this is intended.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None`` keeps
            the platform default, as before; pass e.g. ``'utf-8'`` to make the
            result independent of the machine's locale.

    Returns:
        A list of sentences. Empty sentences are never emitted, so a file that
        ends with a period record no longer yields a trailing ``[]`` and an
        empty file yields ``[]``.
    """
    sentences = []
    current = []

    with open(filename, encoding=encoding) as fh:
        for line in fh:
            fields = line.split()
            if not fields or fields[0] == '-DOCSTART-':
                continue
            current.append(fields)
            # The length check protects against a line holding a lone '.'
            # which previously raised IndexError on fields[1].
            if len(fields) > 1 and fields[0] == '.' and fields[1] == '.':
                sentences.append(current)
                current = []

    # Keep a final sentence that was not terminated by a period record.
    if current:
        sentences.append(current)

    return sentences

# Parse both splits with the same reader (train first, then test).
train_sents, test_sents = (
    read_raw_input(split_path)
    for split_path in ('./data/CoNLL-2003_train.txt', './data/CoNLL-2003_test.txt')
)

# Show the first parsed training sentence as a sanity check.
display(train_sents[0])

[['EU', 'NNP', 'B-NP', 'B-ORG'],
 ['rejects', 'VBZ', 'B-VP', 'O'],
 ['German', 'JJ', 'B-NP', 'B-MISC'],
 ['call', 'NN', 'I-NP', 'O'],
 ['to', 'TO', 'B-VP', 'O'],
 ['boycott', 'VB', 'I-VP', 'O'],
 ['British', 'JJ', 'B-NP', 'B-MISC'],
 ['lamb', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

In [7]:
# Convert the raw record lists with all_sentences. The names are rebound in
# place, so re-running this cell feeds the already converted data back in.
train_sents, test_sents = map(all_sentences, (train_sents, test_sents))

# Display the first converted training sentence.
train_sents[0]

Unnamed: 0_level_0,word,pos,parse,ner
word_seq_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
5,boycott,VB,I-VP,O
6,British,JJ,B-NP,B-MISC
7,lamb,NN,I-NP,O
8,.,.,O,O


In [8]:
# Features for both splits first (train, then test), labels afterwards.
X_train, X_test = (get_feature_values(split) for split in (train_sents, test_sents))
y_train, y_test = (get_labels(split) for split in (train_sents, test_sents))

100%|██████████| 7375/7375 [02:50<00:00, 43.17it/s]
100%|██████████| 1627/1627 [00:38<00:00, 41.98it/s]


In [9]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    verbose=False,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
display(labels)

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

CPU times: total: 37.3 s
Wall time: 43.2 s


In [10]:
# Predict the test split and score it on the labels selected above.
y_pred = crf.predict(X_test)
f1_score = metrics.flat_f1_score(
    y_test,
    y_pred,
    average='weighted',
    labels=labels,
)
print(f" flat f1 score: {f1_score:.2f}")

# Per-label breakdown.
report = calculate_metrics_crf(y_test, y_pred, labels)
print(report)

 flat f1 score: 0.81
              precision    recall  f1-score   support

       B-LOC      0.870     0.839     0.854      1668
       I-LOC      0.801     0.720     0.758       257
      B-MISC      0.800     0.748     0.773       702
      I-MISC      0.628     0.657     0.643       216
       B-ORG      0.802     0.723     0.761      1661
       I-ORG      0.655     0.734     0.692       835
       B-PER      0.829     0.853     0.841      1617
       I-PER      0.867     0.947     0.905      1156

   micro avg      0.809     0.806     0.808      8112
   macro avg      0.782     0.778     0.778      8112
weighted avg      0.811     0.806     0.807      8112

