### Setup

In [1]:
%load_ext autoreload
%autoreload 2

import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
from wikipedia2vec import Wikipedia2Vec
from src.models.GBRT import GBRT
from src.utils.GBRT import *
from src.utils import aida, cos_sim

Loading AIDA dataset...
2021-12-16 01:26:08,111 Reading data from C:\Users\athar\.flair\datasets\nel_english_aida
2021-12-16 01:26:08,132 Train: C:\Users\athar\.flair\datasets\nel_english_aida\train
2021-12-16 01:26:08,134 Dev: C:\Users\athar\.flair\datasets\nel_english_aida\testa
2021-12-16 01:26:08,135 Test: C:\Users\athar\.flair\datasets\nel_english_aida\testb


In [2]:
# Directory that holds the pretrained embedding files. The value ends with a
# path separator, so a file name can be appended to it directly.
EMB_PATH = "C:\\Personal Files\\NED-using-KG\\embeddings\\"
# Load the Wikipedia2Vec model from the pickle stored in that directory.
wiki2vec = Wikipedia2Vec.load(f"{EMB_PATH}wiki2vec_w10_100d.pkl")

### Generate training data with base features

In [3]:
def get_num_candidates(df, mention, tag):
    """Count the rows of ``df`` that match both a mention and a tag.

    Args:
        df: DataFrame with 'mention' and 'tag' columns.
        mention: Value compared against the 'mention' column.
        tag: Value compared against the 'tag' column.

    Returns:
        The number of rows where both columns equal the given values.
    """
    same_mention = df['mention'] == mention
    same_tag = df['tag'] == tag
    return len(df[same_mention & same_tag])

In [4]:
def rank_values(df):
    """Add a 'rank' column that orders candidates within each (mention, tag) group.

    Each row is scored by summing two columns taken by position: the 4th and
    3rd columns from the end of the frame. Within a group the highest score
    gets rank 1; rows with equal scores keep their original relative order.
    If a candidate name occurs more than once in a group, all of its rows
    receive the rank of its last occurrence in the sorted order.

    Args:
        df: DataFrame whose first three columns are read by position as
            mention, candidate and tag, and which also has columns named
            'mention', 'tag' and 'candidate'.
            NOTE(review): the score columns are picked by position, so the
            result depends on the column layout at call time - confirm which
            two features are meant.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing every row of
        ``df`` plus the 'rank' column. Groups are concatenated in set
        iteration order, which is not a guaranteed order.
    """
    ranked_groups = []
    # Row tuples are (index, col0, col1, col2, ...): position 1 is the
    # mention and position 3 is the tag.
    group_keys = {(row[1], row[3]) for row in df.itertuples()}
    for mention, tag in group_keys:
        in_group = (df['mention'] == mention) & (df['tag'] == tag)
        group = df[in_group].copy()
        # Pair every candidate (position 2) with its positional score.
        scored = sorted(
            ((row[2], row[-4] + row[-3]) for row in group.itertuples()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        rank_of = {cand: position
                   for position, (cand, _) in enumerate(scored, start=1)}
        group['rank'] = group['candidate'].map(rank_of)
        ranked_groups.append(group)
    combined = pd.concat(ranked_groups)
    return combined.reset_index(drop=True)

In [5]:
# Ids of the documents whose candidate files
# (./data/aida/candidates/<id>.csv) are turned into training rows.
TRAIN_DOC_IDS = range(1, 1163)
# A candidate whose prior probability reaches this value is treated as an
# unambiguous context entity when computing the coherence feature.
UNAMBIGUOUS_PRIOR_PROB = 0.95

dfs = []
gbrt = GBRT(wiki2vec)
for doc_id in tqdm(TRAIN_DOC_IDS):
    data = pd.read_csv(f'./data/aida/candidates/{doc_id}.csv')
    data = data.dropna()
    mentions = data['mention'].unique()
    candidates = data['candidate'].unique()
    max_prob = get_max_prior_prob(mentions, candidates)

    # Base features
    # NOTE(review): get_prior_prob receives (candidate, mention) while
    # get_max_prior_prob takes mentions first - confirm the expected order.
    data['priorProb'] = [get_prior_prob(cand, ment)
                         for cand, ment in zip(data['candidate'], data['mention'])]
    data['entityPrior'] = data['candidate'].map(get_entity_prior)
    data['maxPriorProb'] = data['candidate'].map(max_prob)
    # Number of rows sharing this row's (mention, tag) pair, computed in one
    # grouped pass instead of filtering the whole frame once per row.
    data['numCands'] = (data.groupby(['mention', 'tag'], sort=False)['candidate']
                        .transform('size'))

    # String similarity features
    # Mentions are lower-cased; candidates are lower-cased and have their
    # underscores turned into spaces so both sides are comparable.
    ment_normalised = data['mention'].map(lambda x: x.lower())
    cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
    ment_cand = list(zip(ment_normalised, cand_normalised))
    data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
    data['mentionIsCand'] = [m == c for m, c in ment_cand]
    data['mentionInCand'] = [m in c for m, c in ment_cand]
    data['isStartorEnd'] = [c.startswith(m) or c.endswith(m) for m, c in ment_cand]

    # Context based features
    # Context similarity: candidate embedding vs. embedding of the document.
    context_emb = gbrt.encode_sentence(aida.get_document(doc_id))
    data['contextSim'] = data['candidate'].map(
        lambda x: cos_sim(gbrt.encode_entity(x), context_emb))
    # Coherence score: candidate embedding vs. the combined embedding of the
    # document's unambiguous candidates.
    unamb_entities = data[data['priorProb'] >= UNAMBIGUOUS_PRIOR_PROB]['candidate'].unique()
    context_ent_emb = gbrt.encode_context_entities(unamb_entities)
    data['coherence'] = data['candidate'].map(
        lambda x: cos_sim(gbrt.encode_entity(x), context_ent_emb))
    # Add rank (currently disabled)
    # data = rank_values(data)

    # Add ground truth: 1 when the candidate is the gold tag, else 0.
    data['y'] = (data['candidate'] == data['tag']).map(int)
    dfs.append(data)

# Rows containing the literal 'NIL' in any column are dropped together with
# any remaining missing values.
X = pd.concat(dfs).replace('NIL', np.nan).dropna()
X = X.reset_index(drop=True)
X

100%|██████████| 1162/1162 [06:26<00:00,  3.01it/s]


Unnamed: 0,mention,candidate,tag,priorProb,entityPrior,maxPriorProb,numCands,editDist,mentionIsCand,mentionInCand,isStartorEnd,contextSim,coherence,y
0,German,Germany,Germany,0.400293,4.869869e-05,0.841985,39,1,False,True,True,0.290902,0.301004,1
1,German,"German_Township,_Indiana",Germany,0.000000,3.093945e-07,0.000000,39,18,False,True,True,0.000000,0.000000,0
2,German,German_Party_(Yugoslavia),Germany,0.000000,4.331523e-07,0.000000,39,19,False,True,True,0.000000,0.000000,0
3,German,"German_Township,_Ohio",Germany,0.000000,3.093945e-07,0.000000,39,15,False,True,True,0.303123,0.123518,0
4,German,German-speaking_Community_of_Belgium,Germany,0.000047,5.135948e-06,0.000047,39,30,False,True,True,0.378137,0.466869,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194600,DHAKA,"Dhaka,_East_Champaran",Dhaka,0.000230,1.051941e-06,0.000230,11,16,False,True,True,0.000000,0.000000,0
194601,DHAKA,Dhaka_Division,Dhaka,0.018363,3.032066e-06,0.018363,11,9,False,True,True,0.460718,0.663502,0
194602,DHAKA,Old_Dhaka,Dhaka,0.000230,7.425467e-06,0.000230,11,4,False,True,True,0.439179,0.606731,0
194603,Dhaka Stock Exchange,Dhaka_Stock_Exchange,Dhaka_Stock_Exchange,1.000000,1.670730e-06,1.000000,1,0,True,True,True,0.662831,1.000000,1


### Train the GBRT Regressor

In [6]:
def save_model(model, fname):
    """Pickle ``model`` to the file ``fname``.

    Args:
        model: Any picklable object (here, the fitted regressor).
        fname: Destination path; the file is opened in binary write mode,
            so an existing file is overwritten.
    """
    with open(fname, 'wb') as out_file:
        pickle.Pickler(out_file).dump(model)

In [7]:
# The identifier columns and the label are not model inputs; every other
# column of X is used as a feature.
NON_FEATURE_COLUMNS = ['mention', 'candidate', 'tag', 'y']
X_train = X.drop(NON_FEATURE_COLUMNS, axis=1).to_numpy()
# Regression target: the 0/1 ground-truth column.
y_train = X.loc[:, 'y'].to_numpy()

In [8]:
# Hyperparameters of the gradient-boosted regressor. random_state is fixed
# so repeated fits give the same model; verbose prints the training progress.
gbrt_params = {
    'n_estimators': 20000,
    'learning_rate': 0.02,
    'max_depth': 4,
    'random_state': 0,
    'verbose': True,
}
model = GradientBoostingRegressor(**gbrt_params)
# fit() returns the estimator, so this cell displays the fitted model.
model.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.0750          333.60m
         2           0.0729          294.78m
         3           0.0708          291.80m
         4           0.0687          284.37m
         5           0.0668          284.34m
         6           0.0649          283.83m
         7           0.0631          280.69m
         8           0.0614          278.51m
         9           0.0598          280.71m
        10           0.0582          284.67m
        20           0.0452          276.59m
        30           0.0362          273.13m
        40           0.0300          248.11m
        50           0.0257          219.62m
        60           0.0226          200.46m
        70           0.0204          186.56m
        80           0.0189          176.13m
        90           0.0177          168.23m
       100           0.0168          161.73m
       200           0.0139          131.59m
       300           0.0131          122.58m
       40

GradientBoostingRegressor(learning_rate=0.02, max_depth=4, n_estimators=20000,
                          random_state=0, verbose=True)

In [9]:
# Pickle the fitted regressor to disk so the long training run above does not
# have to be repeated.
save_model(model, './data/GBRT/coherence20.pkl')

### Testing

In [10]:
# Re-create the linker, this time pointing it at a saved model file.
# NOTE(review): model_path is a bare file name while the model was saved
# under ./data/GBRT/ - presumably GBRT resolves it against that directory;
# confirm.
gbrt = GBRT(wiki2vec, model_path='coherence20.pkl')

In [18]:
# Collect the gold [mention, tag] pairs of every test document.
# A '-DOCSTART-' sentence opens the next document, so the counter starts one
# below the first test id and is bumped at each marker.
doc = 1162
mentions_tags = {doc_id: [] for doc_id in range(1163, 1394)}
for sentence in aida.aida_sets['test']:
    if sentence.to_plain_string() == '-DOCSTART-':
        doc += 1
    else:
        mentions_tags[doc].extend(
            [[span.text, span.tag] for span in sentence.get_spans()])

# Link every test document and compare the predictions with the gold tags.
all_preds = []
total = 0
correct = 0
for doc_id in tqdm(range(1163, 1394)):
    gold_pairs = mentions_tags[doc_id]
    mentions_cands = [[mention, aida.get_candidates(doc_id, mention, tag)]
                      for mention, tag in gold_pairs]
    predictions = gbrt.link(mentions_cands, aida.get_document(doc_id))
    for position, (mention, tag) in enumerate(gold_pairs):
        # Each prediction is unpacked as (mention, predicted entity, confidence).
        m, pred, conf = predictions[position]
        # Predictions must come back in the same order as the gold mentions.
        assert m == mention
        all_preds.append([m, tag, pred])
        total += 1
        if pred == tag:
            correct += 1

accuracy = round((correct / total) * 100, 3)
print("Accuracy:", accuracy)

# Keep the per-mention predictions for later inspection.
res = pd.DataFrame(all_preds, columns=['mention', 'tag', 'pred'])
res.to_csv('./results/gbrt_coherence.csv', index=False)

100%|██████████| 231/231 [11:43<00:00,  3.04s/it]

Accuracy: 88.192





### Results

|CoNLL (PPRforNED)|Accuracy|Expected|
|----|----|----|
|Base|81.921|85.4|
|+ String similarity|82.944|85.8|
|+ Textual context|86.035|90.9|
|+ Coherence|88.326, 88.526|91.4|
|Two-step|88.192|93.1|