In [1]:
import pandas as pd
import numpy as np
import os
import json
from seqeval.metrics import f1_score
from sagemaker.tensorflow import TensorFlowPredictor

### Load test set and objects

In [2]:
# Load the tag -> integer-index vocabulary from disk.
# A context manager closes the file handle deterministically instead of
# leaving it open until garbage collection; JSON text is UTF-8 by specification.
with open("helpers/objects/tag2idx.json", "r", encoding="utf-8") as tag_file:
    tag2idx = json.load(tag_file)
display(tag2idx)

{'I-geo': 0,
 'O': 1,
 'I-tim': 2,
 'I-gpe': 3,
 'I-eve': 4,
 'B-tim': 5,
 'I-nat': 6,
 'B-geo': 7,
 'B-eve': 8,
 'B-art': 9,
 'I-org': 10,
 'B-nat': 11,
 'I-art': 12,
 'I-per': 13,
 'B-org': 14,
 'B-per': 15,
 'B-gpe': 16}

In [3]:
# Read the header-less test CSV into a frame with default integer column labels.
test = pd.read_csv(
    "bilstm_test.csv",
    header=None,
    names=None,
    encoding='latin',
)

# Features: every column from position 50 onwards.
test_X = test.iloc[:, 50:].values
# Labels: every column except the last 50.
# NOTE(review): the two slices only partition the frame cleanly when the file
# has exactly 100 columns - confirm against the file layout.
test_y = test.iloc[:, :-50].values

In [4]:
# Inspect the label array (integer values, one row per CSV row).
test_y

array([[ 1,  1,  1, ...,  1,  1,  1],
       [ 1,  1,  7, ...,  1,  1,  1],
       [16,  1,  1, ...,  1,  1,  1],
       ...,
       [14, 10, 10, ...,  1,  1,  1],
       [ 1,  1,  1, ...,  1,  1,  1],
       [ 1,  1,  1, ...,  1,  1,  1]])

### Create predictor from endpoint

In [5]:
# Name of the SageMaker endpoint this notebook sends its requests to.
endpoint_name = 'tensorflow-training-2020-05-28-12-31-07-478'
predictor = TensorFlowPredictor(endpoint_name)

In [6]:
# Send the whole feature matrix to the endpoint in a single request.
predictions = predictor.predict(test_X)

# The response is indexed by the 'predictions' key; turn that payload into an array.
raw_scores = predictions['predictions']
test_pred = np.array(raw_scores)

### Decode tags

In [7]:
# Invert the vocabulary so an integer index maps back to its tag string.
idx2tag = {index: tag for tag, index in tag2idx.items()}

def pred2label(pred, tag_lookup=None):
    """Convert per-token score vectors into tag strings.

    Args:
        pred: Iterable of sentences, where each sentence is an iterable of
            per-token score vectors. The position of the largest value in
            each vector (``np.argmax``) is used as the tag index.
        tag_lookup: Optional mapping from tag index to tag string. When
            omitted, the notebook-level ``idx2tag`` is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        A list containing one list of tag strings per sentence. Any "PAD"
        text inside a looked-up tag is replaced with "O".

    Raises:
        KeyError: If an argmax index is missing from the lookup mapping.
    """
    # Resolve the default lazily so the function has no hard dependency on the
    # global when a mapping is supplied explicitly (e.g. for reuse or testing).
    if tag_lookup is None:
        tag_lookup = idx2tag
    return [
        [tag_lookup[np.argmax(scores)].replace("PAD", "O") for scores in sentence]
        for sentence in pred
    ]
    
# Decode the endpoint output into tag strings, one list per sentence.
pred_labels = pred2label(test_pred)
# NOTE(review): presumably left disabled because pred2label applies np.argmax
# to each innermost item, whereas test_y holds integer tag indices that can be
# looked up in idx2tag directly - confirm before re-enabling.
# test_labels = pred2label(test_y)

In [12]:
# Inspect the inverted index -> tag mapping.
idx2tag

{0: 'I-geo',
 1: 'O',
 2: 'I-tim',
 3: 'I-gpe',
 4: 'I-eve',
 5: 'B-tim',
 6: 'I-nat',
 7: 'B-geo',
 8: 'B-eve',
 9: 'B-art',
 10: 'I-org',
 11: 'B-nat',
 12: 'I-art',
 13: 'I-per',
 14: 'B-org',
 15: 'B-per',
 16: 'B-gpe'}

In [8]:
# Decoded tags of the second sentence, followed by the number of decoded sentences.
print(pred_labels[1])
print(len(pred_labels))

['O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
4796


In [14]:
# Decode only the first three label rows, mapping each integer index through idx2tag.
decoded_test_y = [
    [idx2tag[tag_index] for tag_index in encoded_row]
    for encoded_row in test_y[:3]
]

In [17]:
# Show the decoded tags of the first label row.
display(decoded_test_y[0])

['O',
 'O',
 'O',
 'B-geo',
 'I-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org',
 'I-org',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [10]:
# Encoded labels of the second row, followed by the number of label rows.
print(test_y[1])
print(len(test_y))

[ 1  1  7  1  1  7  1  1  1  1  1  1  1  7  1  1  1 14  1  1  1  1  1  1
  1  1  1  1 14 10 10  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1]
4796


### Compute score

In [70]:
# Decode the integer-encoded labels into tag strings. `test_labels` is never
# assigned anywhere else in the notebook (its only assignment is commented
# out), so without this line the cell raises NameError on a fresh kernel.
test_labels = [[idx2tag[tag_index] for tag_index in encoded_row] for encoded_row in test_y]

# seqeval expects f1_score(y_true, y_pred): reference labels first, predictions second.
test_f1 = f1_score(test_labels, pred_labels)
print(f"Test F1-Score: {test_f1}")