# Setup

In [1]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from wikipedia2vec import Wikipedia2Vec

from src.models.GBRT import GBRT
from src.utils import aida

Loading AIDA dataset...
2021-12-14 14:26:07,402 Reading data from C:\Users\athar\.flair\datasets\nel_english_aida
2021-12-14 14:26:07,409 Train: C:\Users\athar\.flair\datasets\nel_english_aida\train
2021-12-14 14:26:07,410 Dev: C:\Users\athar\.flair\datasets\nel_english_aida\testa
2021-12-14 14:26:07,411 Test: C:\Users\athar\.flair\datasets\nel_english_aida\testb


In [2]:
# Directory holding the pretrained Wikipedia2Vec embeddings. Set the
# NED_EMB_PATH environment variable to point somewhere else; when it is unset
# the original hard-coded location is used, so existing setups keep working.
EMB_PATH = os.environ.get("NED_EMB_PATH",
                          "C:\\Personal Files\\NED-using-KG\\embeddings\\")
# os.path.join copes with an override that has no trailing separator, which
# plain string concatenation would silently turn into a wrong file name.
wiki2vec = Wikipedia2Vec.load(os.path.join(EMB_PATH, 'wiki2vec_w10_100d.pkl'))
# Ranker used by get_feature_data to produce the per-candidate feature rows.
gbrt = GBRT(wiki2vec)

In [3]:
# Feature-column groups. They are concatenated (together with 'contextSim')
# to name the columns of the feature frame and to select the model inputs.
BASE = [
    'entPrior',
    'priorProb',
    'maxPriorProb',
    'numCands',
]
STRING_SIM = [
    'editDist',
    'mentionIsTitle',
    'mentionInTitle',
    'mentionIsStartOrEnd',
]

In [4]:
def save_model(model, fname):
    """Pickle ``model`` into the file at ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(fname, mode='wb') as sink:
        pickle.dump(model, sink)


def load_model(fname):
    """Load and return the object pickled in the file at ``fname``.

    Args:
        fname: Path of a pickle file, e.g. one written by ``save_model``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(fname, 'rb') as f:
        return pickle.load(f)


def get_tag(x, tags):
    """Look up ``x`` in ``tags`` and return the value with underscores as spaces.

    Args:
        x: Key (or index) used to subscript ``tags``.
        tags: Subscriptable container whose values are expected to be strings.

    Returns:
        ``tags[x]`` with every ``'_'`` replaced by ``' '``, or the string
        ``'NIL'`` when the lookup or the replacement fails for any ordinary
        reason (missing key, ``tags`` not subscriptable, non-string value, ...).
    """
    try:
        return tags[x].replace('_', ' ')
    # Deliberately best-effort, but catch Exception rather than everything:
    # a bare ``except`` would also swallow KeyboardInterrupt and SystemExit,
    # making a long feature-extraction run impossible to interrupt cleanly.
    except Exception:
        return 'NIL'


def get_feature_data(doc):
    """Build the per-candidate feature frame for the document ``doc``.

    ``doc`` is passed unchanged to the ``aida`` helpers to fetch the
    document's mention/candidate data, its text and its mention tags. The
    rows produced by ``gbrt.rank`` are labelled with the columns
    ``mention``, ``cand``, the ``BASE`` and ``STRING_SIM`` features and
    ``contextSim``.

    Returns:
        A DataFrame with those columns plus ``isTag``, which is True for a
        row when ``get_tag`` of the row's mention equals the row's candidate.
    """
    column_names = ['mention', 'cand', *BASE, *STRING_SIM, 'contextSim']
    mention_candidates = aida.get_mentions_cands(doc)
    document_text = aida.get_document(doc)
    doc_tags = aida.get_mentions_tags(doc)
    ranked_rows = gbrt.rank(mention_candidates, document_text)
    X = pd.DataFrame(ranked_rows, columns=column_names)
    # Label each (mention, candidate) row by comparing against the tag lookup.
    X['isTag'] = [get_tag(mention, doc_tags) == cand
                  for mention, cand in zip(X['mention'], X['cand'])]
    return X

# Train

In [6]:
# Building the training features runs get_feature_data for every document id
# in range(1, 1163), so the result is cached on disk and only rebuilt when the
# cache file is missing. Delete the CSV to force a rebuild (e.g. after the
# feature set changes).
TRAIN_FEATURES_CSV = './data/GBRT/train_context_sim.csv'
if os.path.exists(TRAIN_FEATURES_CSV):
    X_train = pd.read_csv(TRAIN_FEATURES_CSV)
else:
    X_train = pd.concat([get_feature_data(i)
                         for i in range(1, 1163)]).reset_index(drop=True)
    X_train.to_csv(TRAIN_FEATURES_CSV, index=False)

In [5]:
# Reload the saved training features from disk and preview the first five rows.
X_train = pd.read_csv('./data/GBRT/train_context_sim.csv')
X_train.head(5)

Unnamed: 0,mention,cand,entPrior,priorProb,maxPriorProb,numCands,editDist,mentionIsTitle,mentionInTitle,mentionIsStartOrEnd,contextSim,isTag
0,EU,Euthanasia device,4e-06,0.0,0.0,18,16,False,False,False,0.402096,False
1,EU,European emission standards,4e-06,0.000241,0.000241,18,26,False,False,False,0.436891,False
2,EU,European Council,6e-06,0.000481,0.000481,18,14,False,False,False,0.490008,False
3,EU,"Eu, Seine-Maritime",3e-06,0.017324,0.017324,18,17,False,False,False,0.315598,False
4,EU,.eu,4e-06,0.000962,0.000962,18,1,False,False,True,0.374454,False


In [10]:
# Fit the gradient-boosted classifier on all feature columns; the target is
# the isTag label. random_state is pinned because scikit-learn permutes the
# features at every split, so equally good splits can otherwise be resolved
# differently from one run to the next.
model = GradientBoostingClassifier(n_estimators=10000, max_depth=4,
                                   learning_rate=0.02, verbose=True,
                                   random_state=42)
model.fit(X_train[BASE + STRING_SIM + ['contextSim']], X_train['isTag'])

      Iter       Train Loss   Remaining Time 
         1           0.5047           61.80m
         2           0.4839           59.98m
         3           0.4663           65.27m
         4           0.4510           62.22m
         5           0.4373           60.57m
         6           0.4251           61.19m
         7           0.4139           59.18m
         8           0.4038           59.03m
         9           0.3942           58.75m
        10           0.3854           58.47m
        20           0.3207           52.96m
        30           0.2756           51.15m
        40           0.2440           50.10m
        50           0.2212           49.47m
        60           0.2035           49.06m
        70           0.1899           48.69m
        80           0.1790           48.69m
        90           0.1696           48.85m
       100           0.1626           49.01m
       200           0.1339           48.94m
       300           0.1247           48.70m
       40

GradientBoostingClassifier(learning_rate=0.02, max_depth=4, n_estimators=10000,
                           verbose=True)

In [11]:
# Persist the fitted classifier so later sessions can reload it instead of refitting.
save_model(model=model, fname='./data/GBRT/context_sim.pkl')

In [6]:
# Restore the previously saved classifier from disk.
model = load_model(fname='./data/GBRT/context_sim.pkl')

# Test

In [7]:
# Build one feature frame per test document (ids in range(1163, 1394)) and
# stack the non-empty ones into a single frame with a fresh 0..n-1 index.
doc_frames = [get_feature_data(doc_id) for doc_id in range(1163, 1394)]
X_test = pd.concat(
    [frame for frame in doc_frames if len(frame) > 0]
).reset_index(drop=True)
# Keep only the rows whose candidate is the tag: these are the only
# instances checked for being classified correctly. Exact duplicate rows
# are dropped before re-indexing.
# NOTE(review): scoring only these rows measures how many gold pairs are
# recovered; false positives on other candidates are not evaluated.
X_test_true = (
    X_test[X_test['isTag']]
    .drop_duplicates()
    .reset_index(drop=True)
)
X_test_true.head()

Unnamed: 0,mention,cand,entPrior,priorProb,maxPriorProb,numCands,editDist,mentionIsTitle,mentionInTitle,mentionIsStartOrEnd,contextSim,isTag
0,JAPAN,Japan national football team,5.2e-05,0.014707,0.014707,28,24,False,False,False,0.621944,True
1,CHINA,China national football team,2.6e-05,0.010404,0.010404,31,24,False,False,False,0.59439,True
2,AL-AIN,Al Ain,1.2e-05,0.0059,0.0059,4,3,False,False,False,0.422713,True
3,United Arab Emirates,United Arab Emirates,4e-05,0.87286,0.87286,3,3,False,True,False,0.430725,True
4,Japan,Japan national football team,5.2e-05,0.014707,0.014707,28,24,False,True,False,0.621944,True


In [8]:
# Classify the gold-candidate rows and print the fraction labelled correctly.
feature_columns = BASE + STRING_SIM + ['contextSim']
y_true = X_test_true['isTag']
y_pred = model.predict(X_test_true[feature_columns])
print(accuracy_score(y_true, y_pred))

0.8481569157930334


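# Results

Accuracy below is computed only on the test-set (mention, candidate) rows whose candidate is the gold tag, i.e. it is the fraction of those rows that the classifier labels as a match; predictions on non-gold candidates are not scored.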

| Model | Accuracy |
| ----------- | ----------- |
| Base | 0.7533368926855313 |
| Base + String Similarity | 0.7618793379604912 |
| Base + String Similarity + Context Similarity | 0.8481569157930334 |

