# Setup

In [1]:
import os
import gc
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from tqdm.notebook import tqdm

from src.gbrt import GBRT
from src.transformer import GBRT_TRF
from src.gbrt import get_edit_dist, get_entity_prior, get_max_prior_prob, get_prior_prob
from src.utils import cosine_similarity, get_document, load_pickle

In [2]:
# Directory holding the pretrained embedding files, resolved against the
# current working directory.
EMB_PATH = os.path.join(os.getcwd(), 'embeddings')

# Feature columns, grouped by family.  The ORDER of `features` matters: the
# baseline training cells select prefixes of this list (features[:4],
# features[:8], features[:9]).
_prior_features = ['priorProb', 'entityPrior', 'maxPriorProb', 'numCands']
_string_sim_features = ['editDist', 'mentionIsCand', 'mentionInCand', 'isStartorEnd']
_context_features = ['contextSim', 'coherence', 'rank']
features = _prior_features + _string_sim_features + _context_features

# Embedding files / model names; each one is joined onto EMB_PATH when loaded.
embs = [
    "wiki2vec_w10_100d.pkl",
    "wiki2vec_w10_300d.pkl",
    "word2vec-google-news-300",
    "glove-wiki-gigaword-300",
    "fasttext-wiki-news-subwords-300",
]

# Entity description lookups; these are assigned to `model.entity_desc_dict`
# before training data is generated (filtered for the word-vector models,
# unfiltered for the transformer model).
entities = load_pickle('./data/aida/entities.pkl')
entities_filtered = load_pickle('./data/aida/entities_filtered.pkl')

# Generate training data

In [3]:
def _add_rank(X):
    """Add a 1-based ``rank`` column ordering each mention's candidates by score.

    The frame is consumed as consecutive runs of rows: the ``numCands`` value
    of the first row of a run gives the length of that run.  Within a run the
    rows are sorted by ``contextSim + coherence`` (descending) and numbered
    from 1.  The temporary ``score`` column is dropped before returning.

    Raises:
        ValueError: if a run starts with a non-positive ``numCands`` (the
            cursor could not advance, which previously meant an endless loop).
    """
    num_cands = X['numCands'].to_numpy()
    total_rows = X.shape[0]

    ranked = []
    start = 0
    # Walk a cursor over the frame instead of re-slicing the remainder of the
    # frame for every mention.
    while start < total_rows:
        n = int(num_cands[start])
        if n <= 0:
            raise ValueError(
                f"numCands must be positive, got {n} at row {start}")
        group = X.iloc[start:start + n].copy()
        group['score'] = group['contextSim'] + group['coherence']
        group = group.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        group['rank'] = group.index + 1
        ranked.append(group)
        start += n

    return pd.concat(ranked).reset_index(drop=True).drop(columns=['score'])


def generate_train_data(model, doc_ids=None, unamb_threshold=0.95):
    """Build the GBRT training frame from the per-document candidate CSVs.

    For every document id, ``./data/aida/candidates/{id}.csv`` is read and the
    feature columns ``priorProb``, ``entityPrior``, ``maxPriorProb``,
    ``editDist``, ``mentionIsCand``, ``mentionInCand``, ``isStartorEnd``,
    ``contextSim`` and ``coherence`` are added, plus the label ``y``
    (1 when ``candidate`` equals ``tag``).  After concatenating all documents,
    a ``rank`` column is added per mention (see ``_add_rank``).

    The CSVs must already provide the columns ``mention``, ``candidate``,
    ``tag`` and ``numCands``; ``numCands`` is read but not computed here.

    Args:
        model: object exposing ``encode_sentence``, ``encode_entity`` and
            ``encode_context_entities``.
        doc_ids: iterable of document ids to process.  Defaults to
            ``range(1, 1163)``.
        unamb_threshold: candidates whose ``priorProb`` is at least this value
            are used as the context entities for the coherence feature.

    Returns:
        pandas.DataFrame with one row per (mention, candidate) pair.
    """
    if doc_ids is None:
        doc_ids = range(1, 1163)

    dfs = []
    for doc_id in tqdm(doc_ids):
        data = pd.read_csv(f'./data/aida/candidates/{doc_id}.csv')
        mentions = data['mention'].unique()
        candidates = data['candidate'].unique()
        max_prob = get_max_prior_prob(mentions, candidates)

        # Base features
        data['priorProb'] = [get_prior_prob(cand, ment)
                             for cand, ment in zip(data['candidate'], data['mention'])]
        data['entityPrior'] = data['candidate'].map(get_entity_prior)
        data['maxPriorProb'] = data['candidate'].map(max_prob)

        # String similarity features (mention lower-cased; candidate
        # lower-cased with underscores turned into spaces)
        ment_normalised = data['mention'].map(lambda x: x.lower())
        cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
        ment_cand = list(zip(ment_normalised, cand_normalised))
        data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
        data['mentionIsCand'] = [int(m == c) for m, c in ment_cand]
        data['mentionInCand'] = [int(m in c) for m, c in ment_cand]
        data['isStartorEnd'] = [int(c.startswith(m) or c.endswith(m)) for m, c in ment_cand]

        # Context based features
        context_emb = model.encode_sentence(get_document(doc_id))
        # Encode each distinct candidate once and reuse the embedding for both
        # context features.  NOTE(review): assumes encode_entity depends only
        # on the candidate name - confirm for stateful encoders.
        entity_embs = {cand: model.encode_entity(cand) for cand in candidates}

        # Context similarity
        data['contextSim'] = data['candidate'].map(
            lambda x: cosine_similarity(entity_embs[x], context_emb))
        # Coherence score against the (near-)unambiguous entities of the document
        unamb_entities = data[data['priorProb'] >= unamb_threshold]['candidate'].unique()
        context_ent_emb = model.encode_context_entities(unamb_entities)
        data['coherence'] = data['candidate'].map(
            lambda x: cosine_similarity(entity_embs[x], context_ent_emb))

        # Add ground truth
        data['y'] = (data['candidate'] == data['tag']).map(int)
        dfs.append(data)

    X = pd.concat(dfs).reset_index(drop=True)
    return _add_rank(X)

In [4]:
# Build and store one training CSV per word-vector embedding.
for emb in embs:
    emb_file = os.path.join(EMB_PATH, emb)
    # Only embedding names containing 'word2vec' are loaded as cased.
    is_cased = 'word2vec' in emb
    model = GBRT(emb_file, cased=is_cased)
    model.entity_desc_dict = entities_filtered

    train_df = generate_train_data(model)
    train_csv = f"./data/GBRT/{emb}_train.csv"
    train_df.to_csv(train_csv, index=False)

    # Drop the reference to the embedding model before loading the next one.
    model = None
    gc.collect()

  0%|          | 0/1162 [00:00<?, ?it/s]

  0%|          | 0/1162 [00:00<?, ?it/s]

  0%|          | 0/1162 [00:00<?, ?it/s]

  0%|          | 0/1162 [00:00<?, ?it/s]

  0%|          | 0/1162 [00:00<?, ?it/s]

In [5]:
# Training data for the transformer-based encoder; this one uses the
# unfiltered `entities` lookup rather than `entities_filtered`.
model = GBRT_TRF()
model.entity_desc_dict = entities

train_df = generate_train_data(model)
trf_train_csv = "./data/GBRT/TRF_train.csv"
train_df.to_csv(trf_train_csv, index=False)

  0%|          | 0/1162 [00:00<?, ?it/s]

# Train the GBRT (Original)

In [6]:
def save_model(model, fname):
    """Pickle ``model`` into the file ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.  Returns ``None``.
    """
    with open(fname, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)

# Hyper-parameters shared by every GBRT variant trained below; the same
# estimator object is re-fitted and pickled by each training cell.
gbrt_params = dict(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model = GradientBoostingRegressor(**gbrt_params)

In [7]:
# The baseline models are trained on the frame generated with the first
# embedding in `embs`.
baseline_emb = embs[0]
X = pd.read_csv(f"./data/GBRT/{baseline_emb}_train.csv")
# Ground-truth labels (column 'y') as a NumPy array.
y_train = X['y'].to_numpy()

## Baseline GBRT models

In [9]:
# Baseline: the first four feature columns only
# (priorProb, entityPrior, maxPriorProb, numCands).
base_columns = features[:4]
X_base = X[base_columns].to_numpy()
model.fit(X_base, y_train)
save_model(model, './data/GBRT/base.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0722           29.33m
         2           0.0703           30.08m
         3           0.0685           29.21m
         4           0.0668           31.40m
         5           0.0651           32.69m
         6           0.0635           34.41m
         7           0.0619           36.15m
         8           0.0604           34.92m
         9           0.0590           35.16m
        10           0.0577           35.27m
        20           0.0466           36.53m
        30           0.0391           35.23m
        40           0.0341           34.07m
        50           0.0306           32.36m
        60           0.0283           31.48m
        70           0.0267           31.18m
        80           0.0255           31.00m
        90           0.0247           30.73m
       100           0.0241           30.56m
       200           0.0219           30.97m
       300           0.0212           31.31m
       40

In [10]:
# Baseline + string-similarity features: the first eight feature columns.
string_sim_columns = features[:8]
X_string_sim = X[string_sim_columns].to_numpy()
model.fit(X_string_sim, y_train)
save_model(model, './data/GBRT/string_sim.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0721           42.77m
         2           0.0702           44.71m
         3           0.0683           43.13m
         4           0.0665           46.09m
         5           0.0648           50.56m
         6           0.0632           51.01m
         7           0.0616           49.95m
         8           0.0600           50.11m
         9           0.0586           49.33m
        10           0.0572           49.51m
        20           0.0457           42.75m
        30           0.0380           41.20m
        40           0.0328           40.10m
        50           0.0292           39.42m
        60           0.0268           38.68m
        70           0.0251           38.43m
        80           0.0239           38.09m
        90           0.0230           37.95m
       100           0.0224           37.67m
       200           0.0200           36.14m
       300           0.0192           35.35m
       40

In [11]:
# Adds contextSim on top of the string-similarity set: the first nine
# feature columns (coherence and rank are still left out).
context_columns = features[:9]
X_context = X[context_columns].to_numpy()
model.fit(X_context, y_train)
save_model(model, './data/GBRT/context.pkl')

      Iter       Train Loss   Remaining Time 
         1           0.0720           60.83m
         2           0.0700           60.82m
         3           0.0681           61.57m
         4           0.0662           67.20m
         5           0.0644           65.52m
         6           0.0626           63.92m
         7           0.0610           62.74m
         8           0.0594           61.55m
         9           0.0578           60.66m
        10           0.0564           60.03m
        20           0.0443           55.39m
        30           0.0361           54.13m
        40           0.0305           53.65m
        50           0.0265           53.31m
        60           0.0237           52.88m
        70           0.0218           52.55m
        80           0.0204           52.31m
        90           0.0194           51.99m
       100           0.0185           51.63m
       200           0.0158           49.69m
       300           0.0149           48.68m
       40

## Word vector based models

In [7]:
# Full-feature model per embedding.
# NOTE(review): embs[2:] skips the two wiki2vec entries - confirm this is
# intentional.
for emb in embs[2:]:
    train_csv = f"./data/GBRT/{emb}_train.csv"
    X = pd.read_csv(train_csv)

    train_features = X[features].to_numpy()
    train_labels = X['y'].to_numpy()
    model.fit(train_features, train_labels)

    save_model(model, f"./data/GBRT/{emb}_trained.pkl")

      Iter       Train Loss   Remaining Time 
         1           0.0718           72.31m
         2           0.0696           71.79m
         3           0.0675           71.80m
         4           0.0654           76.97m
         5           0.0634           80.97m
         6           0.0615           80.77m
         7           0.0597           81.86m
         8           0.0579           82.52m
         9           0.0562           82.05m
        10           0.0546           81.81m
        20           0.0415           81.99m
        30           0.0326           79.43m
        40           0.0265           78.02m
        50           0.0223           77.20m
        60           0.0194           76.51m
        70           0.0174           76.04m
        80           0.0159           75.83m
        90           0.0149           75.46m
       100           0.0142           75.20m
       200           0.0119           72.38m
       300           0.0113           71.01m
       40

## BERT-based model

In [7]:
# Full-feature model on the training data generated with the transformer encoder.
X = pd.read_csv("./data/GBRT/TRF_train.csv")

trf_features = X[features].to_numpy()
trf_labels = X['y'].to_numpy()
model.fit(trf_features, trf_labels)

save_model(model, "./data/GBRT/TRF_trained.pkl")

      Iter       Train Loss   Remaining Time 
         1           0.0720           68.00m
         2           0.0699           64.11m
         3           0.0679           63.64m
         4           0.0660           62.79m
         5           0.0642           62.77m
         6           0.0624           62.56m
         7           0.0607           62.01m
         8           0.0591           62.11m
         9           0.0575           61.89m
        10           0.0561           61.70m
        20           0.0439           60.86m
        30           0.0357           60.63m
        40           0.0301           60.35m
        50           0.0262           60.30m
        60           0.0235           60.17m
        70           0.0215           60.06m
        80           0.0201           60.00m
        90           0.0191           59.96m
       100           0.0184           59.89m
       200           0.0157           61.72m
       300           0.0149           61.41m
       40