In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from tqdm.notebook import tqdm

from src.gbrt import *
from src.utils import cos_sim, get_document, load_json

Loading GBRT data files...
Done.


In [2]:
# Directory that holds the pre-trained embedding files (trailing separator included,
# because the names below are appended by plain string concatenation).
# NOTE(review): absolute, machine-specific Windows path — adjust when running elsewhere.
EMB_PATH = "C:\\Personal Files\\NED-using-KG\\embeddings\\"

# File names of the word-embedding models, resolved relative to EMB_PATH.
EMB_TYPE = [
    'word2vec-google-news-300',
    'glove-wiki-gigaword-300',
    'fasttext-wiki-news-subwords-300',
    'en.wiki.bpe.vs200000.d300.w2v',
]

# Full path of the pickled wiki2vec embedding.
WIKI2VEC = f"{EMB_PATH}wiki2vec_w10_100d.pkl"

# Loaded once here; later attached to each GBRT instance as `cached_entity_desc`.
entity_desc = load_json(
    "C:\\Personal Files\\NED-using-KG\\data\\aida\\entities.json"
)

# Generate Training Data

In [3]:
def save_model(model, fname):
    """Serialise ``model`` with pickle and write it to the file ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(fname, 'wb') as sink:
        # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(model)


def generate_test_data(emb, is_wiki2vec=False, doc_ids=None,
                       candidates_dir='./data/aida/candidates'):
    """Build the per-candidate feature table for a set of AIDA documents.

    Despite the name, this notebook calls the function to produce the
    *training* table as well.

    For every document id, ``{candidates_dir}/{doc_id}.csv`` is read (it must
    provide at least the columns ``mention``, ``candidate``, ``tag`` and
    ``numCands``) and the following columns are added:

    * ``priorProb``, ``entityPrior``, ``maxPriorProb`` -- prior features,
    * ``editDist``, ``mentionIsCand``, ``mentionInCand``, ``isStartorEnd`` --
      string-similarity features on lower-cased mention / candidate names,
    * ``contextSim``, ``coherence`` -- cosine similarities of the candidate
      embedding against the document embedding and against the embedding of
      the document's high-prior (>= 0.95) candidates,
    * ``y`` -- 1 when ``candidate == tag`` else 0,
    * ``score`` (``contextSim + coherence``) and ``rank`` -- 1-based position
      of the row inside its candidate group when sorted by ``score``
      descending.

    Parameters
    ----------
    emb : str
        Embedding location handed to ``GBRT``.
    is_wiki2vec : bool, default False
        Forwarded to ``GBRT``.
    doc_ids : iterable of int, optional
        Document ids to process. Defaults to ``range(1, 1163)``, the range
        that was previously hard-coded.
    candidates_dir : str, default './data/aida/candidates'
        Directory containing one ``<doc_id>.csv`` per document.

    Returns
    -------
    pandas.DataFrame
        All candidate rows of all documents, re-indexed from 0.

    Raises
    ------
    ValueError
        If a candidate group starts with ``numCands`` < 1 (or NaN). The
        previous implementation looped forever on a zero count.
    """
    if doc_ids is None:
        doc_ids = range(1, 1163)

    gbrt = GBRT(emb, is_wiki2vec=is_wiki2vec)
    gbrt.cached_entity_desc = entity_desc

    per_doc = []
    for doc_id in tqdm(doc_ids):
        data = pd.read_csv(f'{candidates_dir}/{doc_id}.csv')
        mentions = data['mention'].unique()
        candidates = data['candidate'].unique()
        max_prob = get_max_prior_prob(mentions, candidates)

        # Base features
        # (explicit names instead of reusing `i`, which used to shadow the doc id)
        data['priorProb'] = [get_prior_prob(cand, ment)
                             for cand, ment in zip(data['candidate'], data['mention'])]
        data['entityPrior'] = data['candidate'].map(get_entity_prior)
        data['maxPriorProb'] = data['candidate'].map(max_prob)

        # String similarity features
        ment_normalised = data['mention'].map(lambda x: x.lower())
        cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
        ment_cand = list(zip(ment_normalised, cand_normalised))
        data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
        data['mentionIsCand'] = [m == c for m, c in ment_cand]
        data['mentionInCand'] = [m in c for m, c in ment_cand]
        data['isStartorEnd'] = [c.startswith(m) or c.endswith(m) for m, c in ment_cand]

        # Context based features
        # Encode every distinct candidate once; the same embedding feeds both
        # similarity features (previously it was re-encoded twice per row).
        # Assumes gbrt.encode_entity is deterministic for a given candidate.
        entity_emb = {cand: gbrt.encode_entity(cand) for cand in candidates}

        # Context similarity
        context_emb = gbrt.encode_sentence(get_document(doc_id))
        context_sim = {cand: cos_sim(vec, context_emb)
                       for cand, vec in entity_emb.items()}
        data['contextSim'] = data['candidate'].map(lambda c: context_sim[c])

        # Coherence score against the candidates with a high prior (>= 0.95)
        unamb_entities = data[data['priorProb'] >= 0.95]['candidate'].unique()
        context_ent_emb = gbrt.encode_context_entities(unamb_entities)
        coherence = {cand: cos_sim(vec, context_ent_emb)
                     for cand, vec in entity_emb.items()}
        data['coherence'] = data['candidate'].map(lambda c: coherence[c])

        # Add ground truth
        data['y'] = (data['candidate'] == data['tag']).map(int)
        per_doc.append(data)

    X = pd.concat(per_doc).reset_index(drop=True)

    # Add rank: rows are consumed in consecutive groups whose length is the
    # `numCands` value found on the group's first row.
    ranked = []
    num_cands = X['numCands']
    start, total = 0, X.shape[0]
    while start < total:
        n = int(num_cands.iloc[start])
        if n < 1:
            # A zero count would never advance `start` (infinite loop).
            raise ValueError(
                f"invalid numCands={n} at row {start}; expected a positive group size")
        group = X.iloc[start:start + n].copy()
        group['score'] = group.contextSim + group.coherence
        group = group.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        group['rank'] = group.index + 1
        ranked.append(group)
        start += n

    # Number of candidate groups that were ranked.
    print(len(ranked))
    return pd.concat(ranked).reset_index(drop=True)

In [None]:
# Build the feature table for every embedding and store it as a CSV under ./data/GBRT/.
for emb in EMB_TYPE:
    X_train = generate_test_data(EMB_PATH + emb)
    # Drop the identifier columns and the helper `score`; what remains is written out.
    feature_table = X_train.drop(columns=['mention', 'candidate', 'tag', 'score'])
    feature_table.to_csv(f"./data/GBRT/{emb}_train.csv", index=False)

# Train GBRT

In [12]:
# Fit one gradient-boosted regressor per embedding and pickle it next to its training CSV.
# NOTE(review): EMB_TYPE[1:] skips the first embedding ('word2vec-google-news-300');
# presumably that model was produced in an earlier run — confirm before a clean re-run.
for emb in EMB_TYPE[1:]:
    X = pd.read_csv(f"./data/GBRT/{emb}_train.csv")
    # Column 'y' is the regression target; every other column is used as a feature.
    y_train = X['y'].to_numpy()
    X_train = X.drop(columns=['y'])
    model = GradientBoostingRegressor(
        n_estimators=10000,
        learning_rate=0.02,
        max_depth=4,
        random_state=0,
        verbose=True,
    )
    model.fit(X_train.to_numpy(), y_train)
    save_model(model, f"./data/GBRT/{emb}_trained.pkl")

      Iter       Train Loss   Remaining Time 
         1           0.0720          115.17m
         2           0.0699          116.91m
         3           0.0679          120.21m
         4           0.0660          122.16m
         5           0.0642          121.70m
         6           0.0624          119.10m
         7           0.0607          114.08m
         8           0.0591          110.52m
         9           0.0575          107.35m
        10           0.0560          104.56m
        20           0.0437           91.31m
        30           0.0352           86.10m
        40           0.0294           83.57m
        50           0.0254           82.55m
        60           0.0226           82.47m
        70           0.0206           81.40m
        80           0.0193           80.56m
        90           0.0182           81.68m
       100           0.0175           80.90m
       200           0.0150           75.63m
       300           0.0143           73.10m
       40