In [1]:
import os
import numpy as np
import pandas as pd
from scipy.special import digamma
from sklearn.preprocessing import LabelEncoder

In [2]:
# Entity-level tag vocabulary; 'O' stands for "no entity".
tag_le = LabelEncoder()
tag_le.fit(['LOC', 'MISC', 'ORG', 'PER', 'O'])

# Token-level label vocabulary: B-/I- prefixed tags plus 'O'.
label_le = LabelEncoder()
label_le.fit(['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'])

# Sizes of the two label spaces, used to shape the prior matrices further down.
num_tags = len(tag_le.classes_)
num_classes = len(label_le.classes_)

In [3]:
def get_entities(labels):
    """Extract entity spans from a sequence of B-/I-/O style labels.

    Parameters
    ----------
    labels : sequence of str
        One label per token, e.g. 'B-PER', 'I-PER' or 'O'. The tag of a label
        is whatever follows its last '-' ('O' is its own tag).

    Returns
    -------
    list of ((start, end), tag)
        One entry per entity in order of appearance. ``start`` is the index of
        the entity's first token and ``end`` is one past its last token.
    """
    entities = []
    pre_tag = 'O'
    pre_sep = 0
    # The appended 'O' sentinel flushes an entity that runs to the end of the input.
    for cur_idx, cur_label in enumerate(np.append(labels, 'O')):
        cur_tag = cur_label.split('-')[-1]

        # A boundary is a change of tag or an explicit 'B-' prefix (which also
        # splits two adjacent entities of the same tag).
        if cur_tag != pre_tag or cur_label.startswith('B-'):
            if pre_tag != 'O':
                entities.append(((pre_sep, cur_idx), pre_tag))
            pre_sep = cur_idx

        pre_tag = cur_tag
    return entities

def get_f1(s1, s2):
    """Return the F1 overlap between two sets, as a percentage in [0, 100].

    Computed as ``2 * |s1 & s2| / (|s1| + |s2|) * 100``. When both sets are
    empty the ratio is undefined; 0.0 is returned instead of raising
    ZeroDivisionError.
    """
    total = len(s1) + len(s2)
    if total == 0:
        return 0.0
    return 2*len(s1 & s2) / total * 100

In [4]:
def mv_infer(values, num_classes=None):
    """Count, for every item, how many workers voted for each class.

    Parameters
    ----------
    values : ndarray of int, shape (num_items, num_workers)
        ``values[i, j]`` is the class index worker ``j`` assigned to item ``i``.
    num_classes : int, optional
        Width of the returned table. Defaults to ``values.max() + 1``; pass it
        explicitly when the highest class index may be absent from ``values``
        but the output must still span the full label space.

    Returns
    -------
    ndarray of float, shape (num_items, num_classes)
        Vote counts; ``argmax`` along the last axis gives the majority vote.

    Raises
    ------
    ValueError
        If ``num_classes`` is too small to hold the largest value in ``values``.
    """
    observed_classes = int(values.max()) + 1
    if num_classes is None:
        num_classes = observed_classes
    elif num_classes < observed_classes:
        raise ValueError('num_classes=%d is smaller than the largest label + 1 (%d)'
                         % (num_classes, observed_classes))
    num_items, num_workers = values.shape

    all_items = np.arange(num_items)
    z_ik = np.zeros((num_items, num_classes))

    # One pass per worker: within a single column every row index is unique,
    # so the fancy-indexed increment counts each vote exactly once.
    for j in range(num_workers):
        z_ik[all_items, values[:, j]] += 1

    return z_ik

In [5]:
def bea_infer(values, alpha=1, beta_kl=1, prior=True, max_iter=500, tol=1e-3):
    """Iteratively refine per-item class scores from several workers' labels.

    Starting from the vote counts returned by ``mv_infer``, each iteration
    rebuilds soft per-worker count tables from the current scores and then
    recomputes the scores from digamma-based log terms, until the scores stop
    changing or ``max_iter`` iterations have run.

    Parameters
    ----------
    values : ndarray of int, shape (num_items, num_workers)
        ``values[i, j]`` is the class index worker ``j`` assigned to item ``i``.
    alpha : float
        Pseudo-count added to every class total before taking its digamma.
    beta_kl : scalar or ndarray
        Pseudo-counts added to every worker's count table (axis order:
        class ``k``, reported label ``l``). Anything that broadcasts against a
        (num_classes, num_classes) array is accepted. If it has a ``shape``,
        its last axis may widen the class space beyond ``values.max() + 1``,
        so a prior built for the full label set still works when the
        highest-index class never occurs in ``values``.
    prior : bool
        If True the class-total term is included in the score update;
        if False the update uses the worker terms only.
    max_iter : int
        Maximum number of iterations (at least 1).
    tol : float
        Absolute tolerance passed to ``np.allclose`` for the convergence check.

    Returns
    -------
    z_ik : ndarray, shape (num_items, num_classes)
        Per-item class scores; each row sums to 1.
    iteration : int
        Zero-based index of the last iteration that was executed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError('max_iter must be at least 1')

    num_items, num_workers = values.shape

    # Infer the class count from the data, but never make it narrower than the
    # supplied prior: otherwise the broadcast below fails whenever the largest
    # class index happens to be unobserved.
    num_classes = int(values.max()) + 1
    prior_shape = getattr(beta_kl, 'shape', ())
    if prior_shape:
        num_classes = max(num_classes, prior_shape[-1])

    beta_kl = beta_kl * np.ones((num_classes, num_classes))

    # Initialise with raw vote counts, zero-padded up to the full class width.
    votes = mv_infer(values)
    z_ik = np.zeros((num_items, num_classes))
    z_ik[:, :votes.shape[1]] = votes
    n_jkl = np.empty((num_workers, num_classes, num_classes))

    last_z_ik = z_ik.copy()
    for iteration in range(max_iter):
        # The second digamma is identical for every class, so it cancels in the
        # row normalisation below.
        Eq_log_pi_k = digamma(z_ik.sum(axis=0) + alpha) - digamma(num_items + num_classes*alpha)

        # n_jkl[j, k, l]: prior pseudo-count plus the sum of z_ik[:, k] over the
        # items that worker j labelled as l.
        n_jkl[:] = beta_kl
        for j in range(num_workers):
            for k in range(num_classes):
                n_jkl[j, k, :] += np.bincount(values[:, j], z_ik[:, k], minlength=num_classes)
        Eq_log_v_jkl = digamma(n_jkl) - digamma(n_jkl.sum(axis=-1, keepdims=True))

        if prior:
            z_ik[:] = Eq_log_pi_k
        else:
            z_ik.fill(0)

        # [j, :, values[:, j]] yields a (num_items, num_classes) slice whose entry
        # [i, k] is Eq_log_v_jkl[j, k, values[i, j]].
        for j in range(num_workers):
            z_ik += Eq_log_v_jkl[j, :, values[:, j]]

        # Subtract the row maximum before exponentiating to avoid overflow,
        # then normalise each row to sum to 1.
        z_ik -= z_ik.max(axis=-1, keepdims=True)
        z_ik = np.exp(z_ik)
        z_ik /= z_ik.sum(axis=-1, keepdims=True)

        if np.allclose(z_ik, last_z_ik, atol=tol):
            break

        last_z_ik[:] = z_ik
    return z_ik, iteration

In [6]:
# Pseudo-counts for the worker count tables: a_v on the diagonal, b_v elsewhere.
a_v = b_v = 1
beta_kl = b_v + (a_v - b_v) * np.eye(num_classes)    # token-level label space
beta_kl_tag = b_v + (a_v - b_v) * np.eye(num_tags)   # entity-level tag space

In [7]:
# Language codes; each one names a '<code>_test' dataset directory under data_path.
languages = ['de', 'en', 'es', 'nl']
# Root directory holding the per-dataset folders.
data_path = './data_conll/'

In [8]:
def _token_entities(z_ik):
    """Decode per-token class scores into a set of ((start, end), tag) entities."""
    return set(get_entities(label_le.inverse_transform(z_ik.argmax(axis=-1))))


def _range_entities(z_ik, ranges):
    """Decode per-range tag scores into a set of (range, tag) pairs, dropping 'O'."""
    tags = tag_le.inverse_transform(z_ik.argmax(axis=-1))
    return {(rng, tag) for rng, tag in zip(ranges, tags) if tag != 'O'}


records = []
for language in languages:
    dataset = language + '_test'
    df_label = pd.read_csv(os.path.join(data_path, dataset, 'label.csv'))
    df_truth = pd.read_csv(os.path.join(data_path, dataset, 'truth.csv'))

    true_entities = set(get_entities(df_truth.truth.values))

    # Extract every source's entities once; they feed both the per-source
    # scores and the entity-level aggregation table below.
    source_entities = {source: get_entities(label_le.inverse_transform(df_label[source].values))
                       for source in df_label.columns}

    # (method name, predicted entity set) pairs, scored together at the end.
    predictions = [(source, set(entities)) for source, entities in source_entities.items()]

    # Token-level aggregation: combine the sources' labels token by token.
    predictions.append(('MV-tok', _token_entities(mv_infer(df_label.values))))
    predictions.append(('BEA-tok', _token_entities(
        bea_infer(df_label.values, beta_kl=beta_kl, prior=True)[0])))

    # Entity-level aggregation: one row per span proposed by any source,
    # one column per source; a source that did not propose the span gets 'O'.
    df_range = pd.DataFrame({source: dict(entities)
                             for source, entities in source_entities.items()}).fillna('O')
    values_range = np.column_stack([tag_le.transform(df_range[source]) for source in df_range.columns])
    ranges = df_range.index.values

    predictions.append(('MV-ent', _range_entities(mv_infer(values_range), ranges)))
    predictions.append(('BEA-ent', _range_entities(
        bea_infer(values_range, beta_kl=beta_kl_tag, prior=True)[0], ranges)))

    records.extend((dataset, method, get_f1(true_entities, pred_entities))
                   for method, pred_entities in predictions)

In [9]:
# Long format: one row per (dataset, method) pair with its F1 score.
df_res = pd.DataFrame(records, columns=['dataset', 'method', 'f1'])
# Wide format for reporting: methods as rows, datasets as columns.
df_pivot = df_res.pivot(columns='dataset', index='method', values='f1')

In [10]:
# Show the F1 table rounded to one decimal place.
df_pivot.round(decimals=1)

dataset,de_test,en_test,es_test,nl_test
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BEA-ent,57.7,64.8,63.4,70.3
BEA-tok,58.2,61.2,64.7,70.1
MV-ent,57.7,64.6,69.0,70.3
MV-tok,57.4,62.1,66.4,71.0
de,,63.6,55.9,63.5
en,55.4,,65.2,68.6
es,46.2,50.3,,58.0
nl,58.4,60.7,65.2,
