# Setup

In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from src.GBRT import GBRT
from src.GBRT.utils import *
from src.utils import cos_sim, get_document
from tqdm import tqdm

# Location of the pickled embedding file handed to GBRT(...) below.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it
# before running the notebook on another machine.
EMB_PATH = r"C:\Personal Files\NED-using-KG\embeddings\wiki2vec_w10_100d.pkl"

Loading GBRT data files...
Done.


In [7]:
def save_model(model, fname):
    """Serialize `model` with pickle into the file `fname`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(fname, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)

# Generate training data

In [3]:
# Build one feature table per candidate file and stack them into X.
N_DOCS = 1162             # candidate files are named 1.csv .. 1162.csv
UNAMBIGUOUS_PRIOR = 0.95  # priorProb at/above this marks a candidate as "unambiguous"

dfs = []
gbrt = GBRT(EMB_PATH)
for doc_id in tqdm(range(1, N_DOCS + 1)):
    data = pd.read_csv(f'./data/aida/candidates/{doc_id}.csv')
    mentions = data['mention'].unique()
    candidates = data['candidate'].unique()
    # presumably a mapping/callable keyed by candidate, since it is passed to
    # Series.map below -- confirm against src.GBRT.utils
    max_prob = get_max_prior_prob(mentions, candidates)

    # Base features
    # Distinct names here: the original reused `i` for both the document loop
    # and the row tuples, which made get_document(i) further down easy to misread.
    data['priorProb'] = [get_prior_prob(cand, ment)
                         for cand, ment in zip(data['candidate'], data['mention'])]
    data['entityPrior'] = data['candidate'].map(get_entity_prior)
    data['maxPriorProb'] = data['candidate'].map(max_prob)

    # String similarity features
    # Mentions are lower-cased; candidates are lower-cased with '_' turned into ' '
    # so both sides are compared in the same surface form.
    ment_normalised = data['mention'].map(lambda x: x.lower())
    cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
    ment_cand = list(zip(ment_normalised, cand_normalised))
    data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
    data['mentionIsCand'] = [m == c for m, c in ment_cand]
    data['mentionInCand'] = [m in c for m, c in ment_cand]
    data['isStartorEnd'] = [c.startswith(m) or c.endswith(m) for m, c in ment_cand]

    # Context based features
    # Encode every distinct candidate once; both similarity features below
    # reuse these embeddings instead of re-encoding the candidate per row.
    entity_emb = {cand: gbrt.encode_entity(cand) for cand in candidates}

    # Context similarity
    context_emb = gbrt.encode_sentence(get_document(doc_id))
    data['contextSim'] = data['candidate'].map(
        lambda x: cos_sim(entity_emb[x], context_emb))
    # Coherence score
    unamb_entities = data[data['priorProb'] >= UNAMBIGUOUS_PRIOR]['candidate'].unique()
    context_ent_emb = gbrt.encode_context_entities(unamb_entities)
    data['coherence'] = data['candidate'].map(
        lambda x: cos_sim(entity_emb[x], context_ent_emb))
    # The 'rank' feature is not computed here; the following cell adds it.

    # Add ground truth
    # y is 1 where the candidate equals the gold tag, else 0.
    data['y'] = (data['candidate'] == data['tag']).map(int)
    dfs.append(data)

X = pd.concat(dfs).reset_index(drop=True)

100%|██████████| 1162/1162 [03:03<00:00,  6.34it/s]


In [4]:
#  add rank
# X is walked in consecutive groups: the 'numCands' value of a group's first
# row says how many rows belong to that group. Each group is ranked by
# score = contextSim + coherence (highest score gets rank 1).
dfs = []
start, total_rows = 0, X.shape[0]
while start < total_rows:
    n = int(X.iloc[start]['numCands'])
    if n <= 0:
        # A non-positive group size would never advance the cursor
        # (the original loop would spin forever on numCands == 0).
        raise ValueError(f'numCands must be positive, got {n} at row {start}')
    temp = X.iloc[start:start + n].copy()
    temp['score'] = temp.contextSim + temp.coherence
    temp = temp.sort_values(by=['score'], ascending=False).reset_index(drop=True)
    temp['rank'] = temp.index + 1
    dfs.append(temp)
    start += n

# Number of groups processed.
print(len(dfs))
X = pd.concat(dfs).reset_index(drop=True)
X.to_csv('./data/GBRT/train.csv', index=False)

15593


# Train the GBRT Regressor

In [2]:
# Reload the training table from disk so this section can run without
# regenerating the features above.
train_csv = './data/GBRT/train.csv'
X = pd.read_csv(train_csv)
X

Unnamed: 0,mention,tag,candidate,numCands,priorProb,entityPrior,maxPriorProb,editDist,mentionIsCand,mentionInCand,isStartorEnd,contextSim,coherence,y,score,rank
0,German,Germany,German_model,39,0.000016,1.670730e-06,0.000016,6.0,0.0,1.0,1.0,0.488999,0.564448,0,1.053448,1
1,German,Germany,German_Party_(1961),39,0.000000,2.475156e-07,0.000000,13.0,0.0,1.0,1.0,0.482056,0.497495,0,0.979551,2
2,German,Germany,German_Party_(1947),39,0.000000,2.970187e-06,0.000000,13.0,0.0,1.0,1.0,0.406279,0.480351,0,0.886631,3
3,German,Germany,Elections_in_Germany,39,0.000032,1.212826e-05,0.000776,14.0,0.0,1.0,0.0,0.402251,0.447347,0,0.849598,4
4,German,Germany,German-speaking_Community_of_Belgium,39,0.000047,5.135948e-06,0.000047,30.0,0.0,1.0,1.0,0.378138,0.466869,0,0.845007,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193250,DHAKA,Dhaka,Dhaka_(village),11,0.000000,6.187889e-08,0.000000,10.0,0.0,1.0,1.0,0.000000,0.000000,0,0.000000,9
193251,DHAKA,Dhaka,Dhaka_Regency,11,0.000000,0.000000e+00,0.000000,8.0,0.0,1.0,1.0,0.000000,0.000000,0,0.000000,10
193252,DHAKA,Dhaka,"Dhaka,_East_Champaran",11,0.000230,1.051941e-06,0.000230,16.0,0.0,1.0,1.0,0.000000,0.000000,0,0.000000,11
193253,Dhaka Stock Exchange,Dhaka_Stock_Exchange,Dhaka_Stock_Exchange,1,1.000000,1.670730e-06,1.000000,0.0,1.0,1.0,1.0,0.662831,1.000000,1,1.662831,1


In [4]:
# Separate the regression target from the feature table; the text columns
# (mention / candidate / tag) are dropped together with the label.
non_feature_cols = ['mention', 'candidate', 'tag', 'y']
X_train = X.drop(non_feature_cols, axis=1)
y_train = X.y.to_numpy()

# Cumulative feature groups: each list extends the previous one.
BASE = ['priorProb', 'entityPrior', 'maxPriorProb', 'numCands']
STRING_SIM = [*BASE, 'editDist', 'mentionIsCand', 'mentionInCand', 'isStartorEnd']
CONTEXT = [*STRING_SIM, 'contextSim']
ALL = [*CONTEXT, 'coherence', 'rank']

In [7]:
# Model 1: fit on the BASE feature columns only.
regressor_params = dict(n_estimators=10000, learning_rate=0.02,
                        max_depth=4, random_state=0, verbose=True)
base_features = X_train[BASE].to_numpy()
model = GradientBoostingRegressor(**regressor_params).fit(base_features, y_train)
save_model(model, './data/GBRT/base.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0722           32.34m
         2           0.0703           38.52m
         3           0.0685           38.56m
         4           0.0668           34.81m
         5           0.0651           34.32m
         6           0.0635           32.17m
         7           0.0619           31.02m
         8           0.0604           29.80m
         9           0.0590           29.27m
        10           0.0577           28.22m
        20           0.0466           25.14m
        30           0.0391           23.86m
        40           0.0341           23.46m
        50           0.0306           22.98m
        60           0.0283           22.71m
        70           0.0267           22.62m
        80           0.0255           22.56m
        90           0.0247           22.59m
       100           0.0241           22.55m
       200           0.0219           21.92m
       300           0.0212           21.32m
       40

In [8]:
# Model 2: fit on the STRING_SIM feature columns.
regressor_params = dict(n_estimators=10000, learning_rate=0.02,
                        max_depth=4, random_state=0, verbose=True)
string_sim_features = X_train[STRING_SIM].to_numpy()
model = GradientBoostingRegressor(**regressor_params).fit(string_sim_features, y_train)
save_model(model, './data/GBRT/string_sim.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0721           68.50m
         2           0.0702           66.63m
         3           0.0683           66.29m
         4           0.0665           58.52m
         5           0.0648           54.82m
         6           0.0632           51.67m
         7           0.0616           50.20m
         8           0.0600           51.28m
         9           0.0586           53.12m
        10           0.0572           53.54m
        20           0.0457           48.20m
        30           0.0380           49.75m
        40           0.0328           55.90m
        50           0.0292           58.30m
        60           0.0268           54.45m
        70           0.0251           51.37m
        80           0.0239           49.12m
        90           0.0230           47.34m
       100           0.0224           46.92m
       200           0.0200           50.82m
       300           0.0192           56.08m
       40

In [9]:
# Model 3: fit on the CONTEXT feature columns.
regressor_params = dict(n_estimators=10000, learning_rate=0.02,
                        max_depth=4, random_state=0, verbose=True)
context_features = X_train[CONTEXT].to_numpy()
model = GradientBoostingRegressor(**regressor_params).fit(context_features, y_train)
save_model(model, './data/GBRT/context.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0721           94.83m
         2           0.0700           81.23m
         3           0.0681           74.16m
         4           0.0662           69.78m
         5           0.0644           65.94m
         6           0.0627           63.90m
         7           0.0610           62.46m
         8           0.0594           61.69m
         9           0.0579           60.47m
        10           0.0564           62.53m
        20           0.0443           80.70m
        30           0.0362           75.52m
        40           0.0305           70.53m
        50           0.0266           67.50m
        60           0.0239           65.25m
        70           0.0219           64.24m
        80           0.0205           62.45m
        90           0.0194           61.09m
       100           0.0186           59.92m
       200           0.0159           55.21m
       300           0.0151           58.18m
       40

In [10]:
# Model 4: every feature in ALL except 'rank'.
# 'rank' is excluded by name rather than with ALL[:-1], so this cell keeps
# working if the order of ALL ever changes.
no_rank_features = [feature for feature in ALL if feature != 'rank']
model = GradientBoostingRegressor(n_estimators=10000, learning_rate=0.02,
                                  max_depth=4, random_state=0, verbose=True)
model.fit(X_train[no_rank_features].to_numpy(), y_train)
save_model(model, './data/GBRT/coherence_no_rank.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0720           75.00m
         2           0.0700           71.95m
         3           0.0680           72.45m
         4           0.0661           72.99m
         5           0.0642           72.82m
         6           0.0625           72.34m
         7           0.0608           79.44m
         8           0.0591           83.37m
         9           0.0576           89.38m
        10           0.0561           89.12m
        20           0.0437           79.82m
        30           0.0353           75.02m
        40           0.0294           73.22m
        50           0.0252           72.47m
        60           0.0223           71.00m
        70           0.0203           69.54m
        80           0.0188           69.17m
        90           0.0176           68.70m
       100           0.0168           68.25m
       200           0.0140           70.66m
       300           0.0131           71.92m
       40

In [6]:
# Model 5: fit on the full ALL feature list.
regressor_params = dict(n_estimators=10000, learning_rate=0.02,
                        max_depth=4, random_state=0, verbose=True)
all_features = X_train[ALL].to_numpy()
model = GradientBoostingRegressor(**regressor_params).fit(all_features, y_train)
save_model(model, './data/GBRT/coherence.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0719           79.34m
         2           0.0698           76.72m
         3           0.0677           74.20m
         4           0.0657           77.34m
         5           0.0638           75.44m
         6           0.0620           75.22m
         7           0.0603           74.88m
         8           0.0585           74.24m
         9           0.0569           73.34m
        10           0.0553           72.65m
        20           0.0425           72.89m
        30           0.0336           78.51m
        40           0.0276           76.61m
        50           0.0233           74.65m
        60           0.0203           73.22m
        70           0.0182           72.05m
        80           0.0167           71.05m
        90           0.0156           70.24m
       100           0.0149           69.63m
       200           0.0122           65.90m
       300           0.0115           64.23m
       40