In [1]:
import os
import numpy as np
import pandas as pd
from scipy.special import digamma
from sklearn.preprocessing import LabelEncoder

In [2]:
# Token-level encoder over the BIO label vocabulary.
label_le = LabelEncoder()
label_le.fit(['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'])

# Entity-level encoder over the bare tag vocabulary.
tag_le = LabelEncoder()
tag_le.fit(['LOC', 'ORG', 'PER', 'O'])

# Vocabulary sizes of the two encoders.
num_classes = label_le.classes_.size
num_tags = tag_le.classes_.size

In [3]:
def get_entities(labels):
    """Extract entity spans from a sequence of BIO label strings.

    A span is closed (and a new one opened) whenever the tag -- the text
    after the last '-' of a label -- differs from the previous tag, or the
    label starts with 'B-'. Spans whose tag is 'O' are not reported.

    Parameters
    ----------
    labels : iterable of str
        Labels such as 'B-PER', 'I-PER' or 'O'.

    Returns
    -------
    list of ((int, int), str)
        ``((begin, end), tag)`` pairs with ``end`` exclusive, in order of
        appearance.
    """
    entities = []
    pre_tag = 'O'
    pre_sep = 0
    # The trailing 'O' sentinel flushes an entity that runs to the last token.
    for cur_idx, cur_label in enumerate(list(labels) + ['O']):
        cur_tag = cur_label.split('-')[-1]

        if cur_tag != pre_tag or cur_label.startswith('B-'):
            if pre_tag != 'O':
                entities.append(((pre_sep, cur_idx), pre_tag))
            pre_sep = cur_idx

        pre_tag = cur_tag
    return entities

def get_f1(s1, s2):
    """Return the F1 score (as a percentage) between two sets.

    Computed as ``2 * |s1 & s2| / (|s1| + |s2|) * 100``.

    Parameters
    ----------
    s1, s2 : set
        The two sets to compare (e.g. true and predicted entities).

    Returns
    -------
    float
        Score in [0, 100]. When both sets are empty the ratio is undefined;
        0.0 is returned instead of raising ZeroDivisionError.
    """
    total = len(s1) + len(s2)
    if total == 0:
        return 0.0
    return 2*len(s1 & s2) / total * 100

In [4]:
def mv_infer(values, num_classes=None):
    """Count, for every item, how many workers voted for each class.

    Parameters
    ----------
    values : array-like of int, shape (num_items, num_workers)
        ``values[i, j]`` is the class id worker ``j`` assigned to item ``i``.
    num_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``values.max() + 1``, which under-counts if the highest class id was
        never voted for; pass it explicitly when the result must line up
        with a fixed vocabulary.

    Returns
    -------
    numpy.ndarray of float, shape (num_items, num_classes)
        Raw vote counts (rows are NOT normalised).
    """
    values = np.asarray(values)
    num_items, num_workers = values.shape
    if num_classes is None:
        # Backward-compatible default: infer from the largest label seen.
        num_classes = values.max() + 1

    all_items = np.arange(num_items)
    z_ik = np.zeros((num_items, num_classes))

    # Each row index appears once per worker, so the fancy-indexed += is safe.
    for j in range(num_workers):
        z_ik[all_items, values[:, j]] += 1

    return z_ik

In [5]:
def get_Eq_log_pi_k_and_Eq_log_v_jkl(values, z_ik, alpha_k=1, beta_kl=1):
    """Compute the expected-log class prior and per-worker confusion terms.

    Both results have the form ``digamma(count) - digamma(total)``.

    Parameters
    ----------
    values : ndarray of int, shape (num_items, num_workers)
        Class id assigned by each worker to each item.
    z_ik : ndarray, shape (num_items, num_classes)
        Per-item class weights used to weight the counts.
    alpha_k : scalar or array broadcastable to (num_classes,)
        Pseudo-counts added to the class totals.
    beta_kl : scalar or array broadcastable to (num_classes, num_classes)
        Pseudo-counts added to every worker's (class, label) count table.

    Returns
    -------
    Eq_log_pi_k : ndarray, shape (num_classes,)
    Eq_log_v_jkl : ndarray, shape (num_workers, num_classes, num_classes)
    """
    n_items, n_workers = values.shape
    n_classes = z_ik.shape[1]

    # Expand the hyper-parameters to their full shapes.
    prior_pi = alpha_k * np.ones(n_classes)
    prior_v = beta_kl * np.ones((n_classes, n_classes))

    class_mass = z_ik.sum(axis=0) + prior_pi
    Eq_log_pi_k = digamma(class_mass) - digamma(n_items + prior_pi.sum())

    # n_jkl[j, k, l]: pseudo-count plus the z_ik[:, k]-weighted number of
    # items on which worker j emitted label l.
    n_jkl = np.zeros((n_workers, n_classes, n_classes)) + prior_v
    for j in range(n_workers):
        worker_votes = values[:, j]
        for k in range(n_classes):
            n_jkl[j, k, :] += np.bincount(worker_votes, weights=z_ik[:, k], minlength=n_classes)

    row_totals = n_jkl.sum(axis=-1, keepdims=True)
    Eq_log_v_jkl = digamma(n_jkl) - digamma(row_totals)

    return Eq_log_pi_k, Eq_log_v_jkl

def get_z_ik(values, Eq_log_v_jkl, Eq_log_pi_k=None, prior=False):
    """Turn the expected-log terms into normalised per-item class weights.

    For every item the log-scores ``Eq_log_v_jkl[j, k, values[i, j]]`` are
    summed over workers ``j`` (plus ``Eq_log_pi_k`` when ``prior`` is true)
    and passed through a max-shifted softmax over ``k``.

    Parameters
    ----------
    values : ndarray of int, shape (num_items, num_workers)
    Eq_log_v_jkl : ndarray, shape (num_workers, num_classes, num_classes)
    Eq_log_pi_k : ndarray, shape (num_classes,), optional
        Only read when ``prior`` is true.
    prior : bool
        Whether to add ``Eq_log_pi_k`` to the log-scores.

    Returns
    -------
    ndarray, shape (num_items, num_classes)
        Rows sum to one.
    """
    n_items, n_workers = values.shape
    n_classes = Eq_log_v_jkl.shape[1]

    log_scores = np.zeros((n_items, n_classes))
    if prior:
        log_scores += Eq_log_pi_k

    for j in range(n_workers):
        # Columns picked by worker j's labels, transposed to (items, classes).
        log_scores += Eq_log_v_jkl[j][:, values[:, j]].T

    # Subtract the row maximum before exponentiating for numerical stability.
    shifted = log_scores - log_scores.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)

def bea_infer(values, alpha_k=1, beta_kl=1, prior=True, max_iter=500, atol=1e-3):
    """Iteratively refine per-item class weights from worker labels.

    Starts from the raw vote counts returned by ``mv_infer`` and alternates
    between ``get_Eq_log_pi_k_and_Eq_log_v_jkl`` and ``get_z_ik`` until two
    successive weight matrices are close or ``max_iter`` is reached.

    Parameters
    ----------
    values : ndarray of int, shape (num_items, num_workers)
        Class id assigned by each worker to each item.
    alpha_k, beta_kl : scalar or ndarray
        Pseudo-counts forwarded to ``get_Eq_log_pi_k_and_Eq_log_v_jkl``.
    prior : bool
        Forwarded to ``get_z_ik``; whether ``Eq_log_pi_k`` enters the update.
    max_iter : int
        Maximum number of update rounds (previously hard-coded to 500).
    atol : float
        Absolute tolerance of the ``np.allclose`` convergence check
        (previously hard-coded to 1e-3).

    Returns
    -------
    tuple
        ``(z_ik, Eq_log_v_jkl, Eq_log_pi_k, iteration)`` where ``iteration``
        is the zero-based index of the last round executed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1 (no round would run, leaving the
        returned quantities undefined).
    """
    if max_iter < 1:
        raise ValueError('max_iter must be at least 1')

    z_ik = mv_infer(values)
    for iteration in range(max_iter):
        Eq_log_pi_k, Eq_log_v_jkl = get_Eq_log_pi_k_and_Eq_log_v_jkl(values, z_ik, alpha_k, beta_kl)

        last_z_ik = z_ik
        z_ik = get_z_ik(values, Eq_log_v_jkl, Eq_log_pi_k, prior)

        # Stop once the weights have stabilised.
        if np.allclose(z_ik, last_z_ik, atol=atol):
            break
    return z_ik, Eq_log_v_jkl, Eq_log_pi_k, iteration

In [6]:
def get_entities_from_tok_results(z_ik):
    """Decode token-level class weights into a set of entity spans.

    Takes the arg-max class of every row of ``z_ik``, maps it back to its
    BIO label with ``label_le`` and extracts spans with ``get_entities``.
    """
    best_class = z_ik.argmax(axis=-1)
    best_labels = label_le.inverse_transform(best_class)
    return set(get_entities(best_labels))

def get_entities_from_ent_results(z_ik, df_range):
    """Decode span-level class weights into non-overlapping entities.

    Each candidate span (a row of ``df_range`` / ``z_ik``) keeps its
    highest-scoring tag. Spans are then visited in descending score order
    and accepted greedily when none of their positions is already covered
    by an accepted span; spans tagged 'O' are never emitted.

    Parameters
    ----------
    z_ik : ndarray, shape (num_spans, num_tags)
        Scores, columns ordered like ``tag_le.classes_``.
    df_range : DataFrame
        Only its two-level (begin, end) index is used.

    Returns
    -------
    set of ((beg, end), tag)
    """
    span_index = df_range.index.set_names(['beg', 'end'])
    tag_columns = pd.Series(tag_le.classes_, name='tag')
    scores = pd.DataFrame(z_ik, index=span_index, columns=tag_columns)

    # Long format, best score first; the first row per span is its best tag.
    ranked = scores.stack().rename('prob').reset_index()
    ranked = ranked.sort_values('prob', ascending=False)
    ranked = ranked.drop_duplicates(['beg', 'end'])

    # Sequence extent is measured before the 'O' spans are dropped.
    num_items = ranked.end.max()
    ranked = ranked[ranked['tag'] != 'O']

    pred_entities = set()
    occupied = np.zeros(num_items)
    for beg, end, tag, _ in ranked.values:
        if not occupied[beg:end].any():
            occupied[beg:end] = 1
            pred_entities.add(((beg, end), tag))
    return pred_entities

def mv_tok(df_label):
    """Token-level majority vote: entities decoded from per-token vote counts."""
    return get_entities_from_tok_results(mv_infer(df_label.values))

def bea_tok(df_label, **kwargs):
    """Token-level ``bea_infer`` followed by entity decoding.

    Keyword arguments are forwarded to ``bea_infer``. Returns
    ``(entities, Eq_log_v_jkl, Eq_log_pi_k, iteration)``.
    """
    z_ik, *fit_info = bea_infer(df_label.values, **kwargs)
    return (get_entities_from_tok_results(z_ik), *fit_info)

def mv_ent(df_range):
    """Entity-level majority vote over the span table ``df_range``."""
    return get_entities_from_ent_results(mv_infer(df_range.values), df_range)

def bea_ent(df_range, **kwargs):
    """Entity-level ``bea_infer`` followed by non-overlapping span decoding.

    Keyword arguments are forwarded to ``bea_infer``. Returns
    ``(entities, Eq_log_v_jkl, Eq_log_pi_k, iteration)``.
    """
    z_ik, *fit_info = bea_infer(df_range.values, **kwargs)
    return (get_entities_from_ent_results(z_ik, df_range), *fit_info)

In [7]:
def get_df_range(df_label):
    """Convert token-level labels into a span-by-source table of tag ids.

    Every column of ``df_label`` (encoded BIO labels of one source) is
    decoded and turned into a ``{(begin, end): tag}`` mapping. The mappings
    are combined into one DataFrame with a row per span proposed by at least
    one source; a source that did not propose a span gets 'O' there. Tags
    are finally encoded with ``tag_le``.
    """
    spans_by_source = {}
    for source in df_label.columns:
        bio_labels = label_le.inverse_transform(df_label[source].values)
        spans_by_source[source] = dict(get_entities(bio_labels))

    df_tags = pd.DataFrame(spans_by_source).fillna('O')
    return df_tags.apply(tag_le.transform)

In [8]:
def get_df_recall(Eq_log_v_jkl, sources):
    """Summarise each source by the diagonal of its normalised confusion table.

    ``exp(Eq_log_v_jkl)`` is renormalised along the last axis and the
    diagonal entry ``[j, k, k]`` is kept for every tag ``k``.

    Parameters
    ----------
    Eq_log_v_jkl : ndarray, shape (num_sources, num_tags, num_tags)
    sources : sequence
        Source names, one per first-axis entry of ``Eq_log_v_jkl``.

    Returns
    -------
    DataFrame
        One row per source with a column per tag in ``tag_le.classes_``,
        plus 'source' and 'Avg3' (mean of the LOC, ORG and PER columns).
    """
    v_jkl = np.exp(Eq_log_v_jkl)
    v_jkl = v_jkl / v_jkl.sum(axis=-1, keepdims=True)

    diag = np.arange(num_tags)
    df_recall = pd.DataFrame(v_jkl[:, diag, diag], columns=tag_le.classes_)
    df_recall['source'] = sources
    df_recall['Avg3'] = df_recall[['LOC', 'ORG', 'PER']].mean(axis=1)

    return df_recall

In [9]:
# Root folder; each dataset lives in '<language>_test/' or '<language>_dev/'
# below it and provides 'label.csv' and 'truth.csv'.
data_path = './data_wiki/'

# Languages to evaluate. Only 'af' is enabled; uncomment the full list below
# to run every language.
languages = ['af']
# languages = ['af', 'ar', 'bg', 'bn', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fa', 'fi', 'fr', 'he', 'hi',
#              'hr', 'hu', 'id', 'it', 'lt', 'lv', 'mk', 'ms', 'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sv',
#              'ta', 'tl', 'tr', 'uk', 'vi']

Supervised setting (the first 100 sentences of the dev set, i.e. sent_idx < 100, are used as gold data)

In [10]:
# Collects (dataset, method, f1) tuples; later cells append to this list too.
records = []
for language in languages:
    dataset = language + '_test'
    df_label = pd.read_csv(data_path + dataset + '/label.csv')
    df_truth = pd.read_csv(data_path + dataset + '/truth.csv')
    true_entities = set(get_entities(df_truth.truth.values))

    dataset_dev = language + '_dev'
    df_label_dev = pd.read_csv(data_path + dataset_dev + '/label.csv')
    df_truth_dev = pd.read_csv(data_path + dataset_dev + '/truth.csv')

    # Gold subset: dev rows whose sentence index is below 100.
    is_gold = df_truth_dev.sent_idx < 100
    df_label_gold = df_label_dev[is_gold].copy()
    df_truth_gold = df_truth_dev[is_gold]

    # Treat the truth as one more "source" so that its spans are put on the
    # same (begin, end) index as the spans proposed by the real sources.
    df_label_gold['#truth'] = label_le.transform(df_truth_gold.truth)
    df_range_gold = get_df_range(df_label_gold)

    truth_gold = df_range_gold['#truth']
    df_range_gold = df_range_gold.drop(columns='#truth')

    one_hot_truth_gold = np.zeros((df_range_gold.shape[0], num_tags))
    one_hot_truth_gold[np.arange(df_range_gold.shape[0]), truth_gold] = 1

    # Entity-level Eq_log_pi_k and Eq_log_v_jkl estimated from the gold spans.
    Eq_log_pi_k, Eq_log_v_jkl = get_Eq_log_pi_k_and_Eq_log_v_jkl(df_range_gold.values, one_hot_truth_gold)

    # Rank the sources by their average LOC/ORG/PER score, best first.
    df_recall = get_df_recall(Eq_log_v_jkl, df_range_gold.columns).sort_values('Avg3', ascending=False)

    for topK in [3, 10, 20]:
        top_sources = df_recall.source[:topK]
        df_label_top = df_label[top_sources]

        # MV-tok-sup
        pred_entities = mv_tok(df_label_top)
        records.append((dataset, 'MV-tok-sup-t%d'%topK, get_f1(true_entities, pred_entities)))

        # The span table is shared by both entity-level methods below.
        df_range = get_df_range(df_label_top)

        # MV-ent-sup
        pred_entities = mv_ent(df_range)
        records.append((dataset, 'MV-ent-sup-t%d'%topK, get_f1(true_entities, pred_entities)))

        # BEA-ent-sup: df_recall.index holds each source's position along the
        # first axis of Eq_log_v_jkl, so the rows follow the same ranking.
        z_ik = get_z_ik(df_range.values, Eq_log_v_jkl[df_recall.index[:topK]], Eq_log_pi_k, prior=True)

        pred_entities = get_entities_from_ent_results(z_ik, df_range)
        records.append((dataset, 'BEA-ent-sup-t%d'%topK, get_f1(true_entities, pred_entities)))

Unsupervised setting

In [11]:
# Pseudo-count tables passed as `beta_kl`: a_v on the diagonal and b_v
# everywhere else, one at token level and one at entity (tag) level.
a_v = b_v = 1
beta_kl = b_v + (a_v - b_v) * np.eye(num_classes)
beta_kl_tag = b_v + (a_v - b_v) * np.eye(num_tags)

In [12]:
# Results are appended to the `records` list created by the supervised cell,
# so `records` is deliberately not reset here.
for language in languages:
    dataset = language + '_test'
    df_label = pd.read_csv(data_path + dataset + '/label.csv')
    df_truth = pd.read_csv(data_path + dataset + '/truth.csv')
    true_entities = set(get_entities(df_truth.truth.values))

    # Baseline: every individual source scored on its own.
    for source in df_label.columns:
        pred_entities = set(get_entities(label_le.inverse_transform(df_label[source].values)))
        f1 = get_f1(true_entities, pred_entities)
        records.append((dataset, source, f1))

    # Token-level aggregation.
    pred_entities = mv_tok(df_label)
    records.append((dataset, 'MV-tok', get_f1(true_entities, pred_entities)))

    pred_entities = bea_tok(df_label, beta_kl=beta_kl, prior=True)[0]
    records.append((dataset, 'BEA-tok', get_f1(true_entities, pred_entities)))

    # Entity-level aggregation.
    df_range = get_df_range(df_label)

    pred_entities = mv_ent(df_range)
    records.append((dataset, 'MV-ent', get_f1(true_entities, pred_entities)))

    pred_entities, Eq_log_v_jkl = bea_ent(df_range, beta_kl=beta_kl_tag, prior=True)[:2]
    records.append((dataset, 'BEA-ent', get_f1(true_entities, pred_entities)))

    # Spammer removal: rank the sources with the latest fit, keep the best
    # ones and refit -- first the top 20, then the top 10 of those.
    for method, top_k in [('BEA-ent-x1-t20', 20), ('BEA-ent-x2-t10', 10)]:
        df_recall = get_df_recall(Eq_log_v_jkl, df_range.columns).sort_values('Avg3', ascending=False)

        df_range = get_df_range(df_label[df_recall.source[:top_k]])
        pred_entities, Eq_log_v_jkl = bea_ent(df_range, beta_kl=beta_kl_tag, prior=True)[:2]
        records.append((dataset, method, get_f1(true_entities, pred_entities)))

In [13]:
# Long-format results: one row per (dataset, method).
df_res = pd.DataFrame.from_records(records, columns=['dataset', 'method', 'f1'])
# The unsupervised cell appends to `records` without resetting it, so running
# it again adds the same (dataset, method) pairs a second time and pivot()
# would raise on the duplicates. Keep only the most recent row per pair.
df_res = df_res.drop_duplicates(['dataset', 'method'], keep='last')
# Wide format: datasets as rows, methods as columns.
df_pivot = df_res.pivot(index='dataset', columns='method', values='f1')

In [14]:
# Mean F1 of every method across datasets, best first.
df_pivot.mean().to_frame('mean').sort_values('mean', ascending=False)

Unnamed: 0,mean
BEA-ent-sup-t3,81.365005
MV-ent-sup-t3,81.20603
MV-ent-sup-t10,80.273973
BEA-ent-sup-t10,80.25224
MV-tok-sup-t3,79.803601
nl,79.78688
BEA-ent-x1-t20,79.609033
BEA-ent-sup-t20,79.494008
BEA-ent-x2-t10,79.439891
MV-ent-sup-t20,79.257183
