# Visualization of F1 score as a function of span separation distance

In [28]:
import h5py
import numpy as np
import bokeh
from keras.models import load_model
from sklearn.metrics import f1_score
from bokeh.plotting import figure, output_file, show, output_notebook

In [2]:
# Load the pre-computed span-pair matrix for the test set.
# Column layout, as used by the slicing below: every column except the last two
# is a model input feature, column -2 is the distance between the two spans and
# column -1 is the label.
test_data_path = "/home/users/pkahardipraja/data/coref/data-bert-base-128/test_span_reps_joshi.128_1.h5"
with h5py.File(test_data_path, 'r') as f:
    # `Dataset.value` is deprecated and was removed in h5py 3.0; indexing with
    # `[()]` reads the whole dataset into an in-memory NumPy array and works on
    # both old and new h5py releases.
    test_data = f['span_representations'][()]

# The array is fully in memory at this point, so the file can already be closed.
x_test = test_data[:, :-2]
y_test = test_data[:, -1].astype(int)
x_dist = test_data[:, -2].astype(int)

In [3]:
# Count how many test examples occur at each distinct span distance.
unique_elements, count_elements = np.unique(x_dist, return_counts=True)

# Keep only the distances that are backed by at least 10 examples, so that the
# per-distance picture is not dominated by very rare distances.
has_enough_examples = count_elements >= 10
filtered_dist = unique_elements[has_enough_examples]

# The largest surviving distance on the test set is 476, so the evaluation
# below uses an upper limit of 500 split into 20 buckets of width 25
# (0-25, 26-50, 51-75, ...).
print(filtered_dist)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 104 105 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 136 137 139 140 141 142 144 145 146 147 148
 149 150 151 153 154 155 157 158 159 160 161 163 164 165 166 167 168 169
 171 173 174 175 177 178 179 181 182 184 185 186 187 188 189 190 191 192
 193 194 195 199 200 201 202 204 205 206 207 212 213 214 217 219 220 222
 223 224 226 227 228 229 230 231 233 234 236 241 242 246 247 249 250 252
 253 260 261 265 270 271 272 276 279 280 292 296 304 306 323 336 348 435
 476]


In [44]:
# Evaluate the trained model separately on each span-distance bucket and
# collect accuracy, F1 and the number of examples per bucket.
model_path = "/project/personal/bert_for_coreference_resolution/log/test_bert_base_joshi_128.h5"
model = load_model(model_path)

lower_dist = 0
upper_dist = 25
bucket_dist = 25
max_dist = 500  # inclusive upper bound of the last bucket -> 20 buckets in total

# Distance column, sliced and cast once instead of twice per loop iteration.
token_dist = test_data[:, -2].astype(int)

acc_list = []      # second value returned by model.evaluate, per bucket
f1_list = []       # F1 score per bucket
example_list = []  # number of examples per bucket

while upper_dist <= max_dist:
    # Rows whose distance falls inside the inclusive range [lower_dist, upper_dist].
    in_bucket = (token_dist >= lower_dist) & (token_dist <= upper_dist)
    idx = np.flatnonzero(in_bucket)

    if idx.size == 0:
        # Nothing to score in this bucket. Evaluating would request
        # batch_size=0, so record NaN instead and keep the three lists aligned
        # with the bucket index.
        acc_list.append(float("nan"))
        f1_list.append(float("nan"))
        example_list.append(0)
    else:
        filtered_data = test_data[idx]
        features = filtered_data[:, :-2]
        label = filtered_data[:, -1].astype(int)

        # The whole bucket is evaluated as a single batch.
        # NOTE(review): unpacking two values assumes the model was compiled
        # with exactly one metric (presumably accuracy) -- confirm.
        _, score = model.evaluate(features, label, batch_size=features.shape[0])

        # Round the model outputs to hard 0/1 predictions before computing F1.
        prediction = (np.asarray(model.predict(features))).round()
        f1 = f1_score(label, prediction)

        acc_list.append(score)
        f1_list.append(f1)
        example_list.append(len(idx))

    # Move on to the next bucket: (upper, upper + bucket_dist].
    lower_dist = upper_dist + 1
    upper_dist += bucket_dist

print(acc_list, len(acc_list), "\n")
print(f1_list, len(f1_list), "\n")
print(example_list)

[0.9363877177238464, 0.9287257194519043, 0.9120520949363708, 0.9068901538848877, 0.9287531971931458, 0.9083094596862793, 0.9085366129875183, 0.9189189076423645, 0.9015747904777527, 0.8951612710952759, 0.9405940771102905, 0.9166666865348816, 0.8875739574432373, 0.9426751732826233, 0.9051094651222229, 0.8999999761581421, 0.9379844665527344, 0.9173553586006165, 0.9120879173278809, 0.931034505367279] 20 

[0.961218836565097, 0.9423076923076923, 0.9093959731543624, 0.8883928571428572, 0.8985507246376812, 0.8778625954198473, 0.8611111111111112, 0.8588235294117647, 0.822695035460993, 0.8374999999999999, 0.8983050847457628, 0.8521739130434783, 0.759493670886076, 0.896551724137931, 0.7719298245614036, 0.7812500000000001, 0.8857142857142858, 0.8648648648648648, 0.7500000000000001, 0.8461538461538461] 20 

[2641, 926, 614, 537, 393, 349, 328, 296, 254, 248, 202, 204, 169, 157, 137, 140, 129, 121, 91, 116]


In [43]:
# Plot F1 per distance bucket. F1 is used instead of accuracy because it is
# more informative here: it reflects both type I and type II errors.
FONT_SIZE = "13pt"

# One x position per bucket; derived from the results rather than hard-coded so
# the plot stays in sync with the evaluation loop.
x = list(range(len(f1_list)))

output_notebook()
crs = []
p = figure(x_axis_label='Distance between mention pair (wordpiece tokens)', y_axis_label='F1 Score', y_range=[0.5, 1.05], width=900, height=400)
p.line(x, f1_list, color="green")
# NOTE(review): the `legend=` keyword was deprecated in Bokeh 1.4 in favour of
# `legend_label=`; switch once the installed Bokeh version is confirmed.
cr = p.scatter(x=x, y=f1_list, size=12, hover_fill_color="Gray", legend="BERT-base c2f", marker="diamond", color="green")
crs.append(cr)

# Hover shows the bucket index (x) and the F1 score rounded to two decimals.
tooltips = [("Bucket index", "@x"), ("F1 score", "@y{0.00}")]
p.add_tools(bokeh.models.HoverTool(tooltips=tooltips, renderers=crs))

# Tick every second bucket and relabel those ticks with the distance range the
# bucket covers (bucket 0 is 0-25, bucket i >= 1 is 25*i+1 .. 25*(i+1)).
p.xaxis.ticker = [i for i in x if i % 2 == 0]
p.xaxis.major_label_overrides = { 0: '0-25', 2: '51-75', 4: '101-125', 6:'151-175', 8:'201-225', 10:'251-275', 12:'301-325', 14:'351-375', 16:'401-425', 18:'451-475'}

p.xaxis.major_label_text_font_size = FONT_SIZE
p.yaxis.major_label_text_font_size = FONT_SIZE
p.xaxis.axis_label_text_font_size = FONT_SIZE
p.yaxis.axis_label_text_font_size = FONT_SIZE
show(p)