# Setup

In [1]:
import gc
import os
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from src.gbrt import *
from src.utils import cos_sim, get_document, load_json
from tqdm.notebook import tqdm

Loading GBRT data files...
Done.


In [2]:
# Directory that holds the embedding files. Embedding file names are appended
# to this value by plain string concatenation (``EMB_PATH + emb``), so it must
# end with a path separator. Set the NED_EMB_PATH environment variable to run
# the notebook on another machine; the original location stays the default.
EMB_PATH = os.environ.get(
    "NED_EMB_PATH", "C:\\Personal Files\\NED-using-KG\\embeddings\\")

# Feature columns fed to the regressor. The ORDER matters: the staged training
# cells slice this list by position (features[:4], features[:8], features[:9]).
features = [
    # [:4] base features
    'priorProb', 'entityPrior', 'maxPriorProb', 'numCands',
    # [4:8] string-similarity features
    'editDist', 'mentionIsCand', 'mentionInCand', 'isStartorEnd',
    # [8:] context-based features
    'contextSim', 'coherence', 'rank',
]

In [3]:
def save_model(model, fname):
    """Serialise ``model`` with pickle into the file ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(fname, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)

# Generate training data

In [4]:
def generate_train_data(model, doc_ids=range(1, 1163),
                        candidates_dir='./data/aida/candidates'):
    """Build the GBRT training table from per-document candidate CSV files.

    For every document id, ``{candidates_dir}/{doc_id}.csv`` is read; the file
    must provide the columns ``mention``, ``candidate``, ``tag`` and
    ``numCands``. The following columns are added:

    * ``priorProb``, ``entityPrior``, ``maxPriorProb`` -- from the
      ``get_prior_prob`` / ``get_entity_prior`` / ``get_max_prior_prob`` helpers.
    * ``editDist``, ``mentionIsCand``, ``mentionInCand``, ``isStartorEnd`` --
      string comparisons between the lower-cased mention and the lower-cased
      candidate with underscores replaced by spaces.
    * ``contextSim`` -- ``cos_sim`` between the candidate embedding and the
      embedding of the document text.
    * ``coherence`` -- ``cos_sim`` between the candidate embedding and the
      embedding of the document's candidates with ``priorProb >= 0.95``.
    * ``y`` -- 1 when ``candidate`` equals ``tag``, else 0.
    * ``rank`` -- 1-based position of the row inside its block of candidates
      when sorted by ``contextSim + coherence`` in descending order.

    Parameters
    ----------
    model
        Embedding wrapper exposing ``encode_sentence``, ``encode_entity`` and
        ``encode_context_entities``.
    doc_ids : iterable, optional
        Document identifiers to process. Defaults to ``range(1, 1163)``.
    candidates_dir : str, optional
        Directory containing the ``<doc_id>.csv`` candidate files.

    Returns
    -------
    pandas.DataFrame
        All documents concatenated, rows reordered by rank inside each
        candidate block, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a candidate block starts with a non-positive ``numCands`` (the
        block walk could never advance), or if there is nothing to concatenate.
    """
    dfs = []
    for doc_id in tqdm(doc_ids):
        data = pd.read_csv(f'{candidates_dir}/{doc_id}.csv')
        mentions = data['mention'].unique()
        candidates = data['candidate'].unique()
        max_prob = get_max_prior_prob(mentions, candidates)

        # Base features
        data['priorProb'] = [get_prior_prob(cand, ment)
                             for cand, ment in zip(data['candidate'], data['mention'])]
        data['entityPrior'] = data['candidate'].map(get_entity_prior)
        data['maxPriorProb'] = data['candidate'].map(max_prob)

        # String similarity features
        ment_normalised = data['mention'].map(lambda x: x.lower())
        cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
        ment_cand = list(zip(ment_normalised, cand_normalised))
        data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
        data['mentionIsCand'] = [m == c for m, c in ment_cand]
        data['mentionInCand'] = [m in c for m, c in ment_cand]
        data['isStartorEnd'] = [c.startswith(m) or c.endswith(m) for m, c in ment_cand]

        # Context based features
        # Context similarity
        context_emb = model.encode_sentence(get_document(doc_id))
        data['contextSim'] = data['candidate'].map(
            lambda x: cos_sim(model.encode_entity(x), context_emb))
        # Coherence score: compare against candidates whose prior is high
        # enough (>= 0.95) to be treated as unambiguous.
        unamb_entities = data[data['priorProb'] >= 0.95]['candidate'].unique()
        context_ent_emb = model.encode_context_entities(unamb_entities)
        data['coherence'] = data['candidate'].map(
            lambda x: cos_sim(model.encode_entity(x), context_ent_emb))

        # Add ground truth
        data['y'] = (data['candidate'] == data['tag']).map(int)
        dfs.append(data)

    X = pd.concat(dfs).reset_index(drop=True)

    # Add rank. Rows are consumed in consecutive blocks: the first row of a
    # block states, via numCands, how many rows belong to it.
    ranked = []
    num_cands = X['numCands'].to_numpy()
    total_rows = X.shape[0]
    start = 0
    while start < total_rows:
        n = int(num_cands[start])
        if n <= 0:
            # A non-positive block size would never advance the cursor.
            raise ValueError(
                f"numCands must be positive, got {n} at row {start}")
        block = X.iloc[start:start + n].copy()
        block['score'] = block['contextSim'] + block['coherence']
        block = block.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        block['rank'] = block.index + 1
        ranked.append(block)
        start += n

    X = pd.concat(ranked).reset_index(drop=True)
    return X.drop(columns=['score'])

In [5]:
# Generate and persist one training table per wiki2vec embedding size.
embs = [f"wiki2vec_w10_{i}d.pkl" for i in (100, 300)]
for emb in embs:
    out_csv = f"./data/GBRT/{emb}_train.csv"
    model = GBRT(EMB_PATH + emb)
    train_df = generate_train_data(model)
    train_df.to_csv(out_csv, index=False)
    # Drop the reference to the embedding wrapper and collect garbage before
    # the next embedding is loaded.
    model = None
    gc.collect()

  0%|          | 0/1162 [00:00<?, ?it/s]

  0%|          | 0/1162 [00:00<?, ?it/s]

In [6]:
# Same pipeline for the GloVe embedding, using the GBRT2 wrapper. The entity
# description dictionary is attached to the wrapper before features are built.
emb = 'glove-wiki-gigaword-300'
entity_desc_path = "C:\\Personal Files\\NED-using-KG\\data\\aida\\entities.json"

model = GBRT2(EMB_PATH + emb)
model.entity_desc_dict = load_json(entity_desc_path)

train_df = generate_train_data(model)
train_df.to_csv(f"./data/GBRT/{emb}_train.csv", index=False)

  0%|          | 0/1162 [00:00<?, ?it/s]

# Train the GBRT (Original): one model per cumulative feature group

In [None]:
# Load the training table generated with the 300-d wiki2vec embedding and
# separate the target from the remaining columns.
X = pd.read_csv("./data/GBRT/wiki2vec_w10_300d.pkl_train.csv")
non_feature_cols = ['mention', 'candidate', 'tag', 'y']
y_train = X['y'].to_numpy()
X_train = X.drop(columns=non_feature_cols)

In [None]:
# Regressor configuration shared by the staged fits below.
gbrt_params = dict(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model = GradientBoostingRegressor(**gbrt_params)

In [None]:
# Stage 1: fit on the first four entries of `features` only.
base_cols = features[:4]
model.fit(X_train[base_cols].to_numpy(), y_train)
save_model(model, './data/GBRT/base.pkl')

In [None]:
# Stage 2: fit on the first eight entries of `features`.
string_sim_cols = features[:8]
model.fit(X_train[string_sim_cols].to_numpy(), y_train)
save_model(model, './data/GBRT/string_sim.pkl')

In [None]:
# Stage 3: fit on the first nine entries of `features`.
context_cols = features[:9]
model.fit(X_train[context_cols].to_numpy(), y_train)
save_model(model, './data/GBRT/context.pkl')

In [None]:
# Stage 4: fit on the complete `features` list.
all_feature_values = X_train[features].to_numpy()
model.fit(all_feature_values, y_train)
save_model(model, './data/GBRT/coherence.pkl')

# Train GBRT (all features, one model per embedding)

In [8]:
# One regressor configuration, fitted in turn on each embedding's training
# table; the fitted state is pickled after every fit.
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)

embs = [f"wiki2vec_w10_{i}d.pkl" for i in [100, 300]]
embs.append('glove-wiki-gigaword-300')
for emb in embs:
    X = pd.read_csv(f"./data/GBRT/{emb}_train.csv")
    y_train = X['y'].to_numpy()
    X_train = X[features].to_numpy()
    model.fit(X_train, y_train)
    save_model(model, f"./data/GBRT/{emb}_trained.pkl")

      Iter       Train Loss   Remaining Time 
         1           0.0719           80.88m
         2           0.0698           81.45m
         3           0.0677           81.86m
         4           0.0657           93.86m
         5           0.0638           96.06m
         6           0.0620           95.96m
         7           0.0603           93.36m
         8           0.0585           90.51m
         9           0.0569           87.68m
        10           0.0553           90.43m
        20           0.0425           83.46m
        30           0.0336           80.44m
        40           0.0276           77.08m
        50           0.0233           75.67m
        60           0.0203           76.29m
        70           0.0182           73.99m
        80           0.0167           72.14m
        90           0.0156           71.61m
       100           0.0149           71.00m
       200           0.0122           78.74m
       300           0.0115           76.20m
       40