# Setup

In [1]:
import os
import gc
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from tqdm.notebook import tqdm

from src.gbrt import (GBRT, get_edit_dist, get_entity_prior,
                      get_max_prior_prob, get_prior_prob)
from src.utils import cos_sim, get_document, load_json

In [2]:
# Directory that holds the embedding files; the entries of `embs` further down
# are joined onto it. The previous value used an embedding *file name*
# ("wiki2vec_w10_100d.pkl") as the parent directory, which cannot contain the
# other embeddings.
# NOTE(review): assumes the embeddings live in "<cwd>/embeddings", mirroring the
# os.getcwd()-based path used for entities.json - confirm against the repo layout.
EMB_PATH = os.path.join(os.getcwd(), 'embeddings')

# Feature columns fed to the regressor. Order matters: the ablation cells slice
# prefixes of this list (first 4 = prior features, first 8 = + string
# similarity, first 9 = + context similarity, all = + coherence and rank).
features = ['priorProb', 'entityPrior', 'maxPriorProb', 'numCands',
            'editDist', 'mentionIsCand', 'mentionInCand', 'isStartorEnd',
            'contextSim', 'coherence', 'rank']

In [3]:
def save_model(model, fname):
    """Pickle `model` and write it to the file `fname`.

    The file is opened in binary write mode, so an existing file at `fname`
    is overwritten. Returns None.
    """
    payload = pickle.dumps(model)
    with open(fname, mode='wb') as out_file:
        out_file.write(payload)

# Generate training data

In [None]:
def _add_rank(X):
    """Append a per-block `rank` column based on contextSim + coherence.

    The frame is consumed as consecutive blocks: the `numCands` value of the
    first row of a block gives the number of rows in that block. Inside each
    block rows are sorted by descending score and ranked 1..n.
    NOTE(review): this relies on the candidate CSVs listing each mention's
    candidates contiguously with a matching `numCands` - confirm upstream.
    """
    ranked_blocks = []
    block_sizes = X['numCands'].to_numpy()
    start, total = 0, len(X)
    # Walk the frame with an offset instead of repeatedly re-slicing the
    # remainder (`X = X.iloc[n:]`), which made the original loop quadratic.
    while start < total:
        n = int(block_sizes[start])
        if n <= 0:
            # A non-positive block size would never advance the offset.
            raise ValueError(
                f"numCands must be positive, got {n} at row {start}")
        block = X.iloc[start:start + n].copy()
        block['score'] = block['contextSim'] + block['coherence']
        block = block.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        block['rank'] = block.index + 1
        ranked_blocks.append(block)
        start += n

    return pd.concat(ranked_blocks).reset_index(drop=True).drop(columns=['score'])


def generate_train_data(model, doc_ids=range(1, 1163),
                        candidates_dir='./data/aida/candidates'):
    """Build the GBRT training table from the per-document candidate CSVs.

    For every document id, ``{candidates_dir}/{doc_id}.csv`` is read and the
    prior, string-similarity and context feature columns plus the 0/1 target
    ``y`` are added. The per-document frames are concatenated and a ``rank``
    column is appended (see ``_add_rank``).

    Parameters
    ----------
    model : object
        Must expose ``encode_sentence``, ``encode_entity`` and
        ``encode_context_entities`` (a ``GBRT`` instance in this notebook).
    doc_ids : iterable of int, optional
        Document ids to process; defaults to ``range(1, 1163)``.
    candidates_dir : str, optional
        Directory containing one ``<doc_id>.csv`` per document. Each CSV must
        provide the columns ``mention``, ``candidate``, ``tag`` and
        ``numCands``.

    Returns
    -------
    pandas.DataFrame
        The CSV columns plus ``priorProb``, ``entityPrior``, ``maxPriorProb``,
        ``editDist``, ``mentionIsCand``, ``mentionInCand``, ``isStartorEnd``,
        ``contextSim``, ``coherence``, ``y`` and ``rank``; rows are ordered by
        descending score inside each candidate block.

    Raises
    ------
    ValueError
        If a block's ``numCands`` is not a positive number, or if there is
        nothing to concatenate (e.g. ``doc_ids`` is empty).
    """
    per_doc_frames = []
    for doc_id in tqdm(doc_ids):
        data = pd.read_csv(f'{candidates_dir}/{doc_id}.csv')
        mentions = data['mention'].unique()
        candidates = data['candidate'].unique()
        max_prob = get_max_prior_prob(mentions, candidates)

        # Base features
        data['priorProb'] = [get_prior_prob(cand, ment)
                             for cand, ment in zip(data['candidate'], data['mention'])]
        data['entityPrior'] = data['candidate'].map(get_entity_prior)
        data['maxPriorProb'] = data['candidate'].map(max_prob)

        # String similarity features (case-insensitive; underscores in the
        # candidate title are treated as spaces)
        ment_normalised = data['mention'].map(lambda x: x.lower())
        cand_normalised = data['candidate'].map(lambda x: x.lower().replace('_', ' '))
        ment_cand = list(zip(ment_normalised, cand_normalised))
        data['editDist'] = [get_edit_dist(m, c) for m, c in ment_cand]
        data['mentionIsCand'] = [m == c for m, c in ment_cand]
        data['mentionInCand'] = [m in c for m, c in ment_cand]
        data['isStartorEnd'] = [c.startswith(m) or c.endswith(m) for m, c in ment_cand]

        # Context based features
        # Context similarity: candidate embedding vs. whole-document embedding
        context_emb = model.encode_sentence(get_document(doc_id))
        data['contextSim'] = data['candidate'].map(
            lambda x: cos_sim(model.encode_entity(x), context_emb))
        # Coherence score: candidate embedding vs. the embedding of the
        # (near-)unambiguous entities, i.e. candidates with priorProb >= 0.95
        unamb_entities = data[data['priorProb'] >= 0.95]['candidate'].unique()
        context_ent_emb = model.encode_context_entities(unamb_entities)
        data['coherence'] = data['candidate'].map(
            lambda x: cos_sim(model.encode_entity(x), context_ent_emb))

        # Add ground truth
        data['y'] = (data['candidate'] == data['tag']).map(int)
        per_doc_frames.append(data)

    X = pd.concat(per_doc_frames).reset_index(drop=True)
    return _add_rank(X)

In [None]:
# Build and save one training table per embedding.
embs = ([f"wiki2vec_w10_{i}d.pkl" for i in [100, 300]]
        + ['word2vec-google-news-300', 'glove-wiki-gigaword-300'])

# Loaded once and attached to every model instance below.
entities_json = os.path.join(os.getcwd(), 'data', 'aida', 'entities.json')
entity_desc_dict = load_json(entities_json)

for emb in embs:
    # Only the word2vec embedding is created with cased=True.
    is_cased = 'word2vec' in emb
    model = GBRT(os.path.join(EMB_PATH, emb), cased=is_cased)
    model.entity_desc_dict = entity_desc_dict

    train_df = generate_train_data(model)
    train_df.to_csv(f"./data/GBRT/{emb}_train.csv", index=False)

    # Drop the reference and force a collection before the next embedding
    # is loaded.
    model = None
    gc.collect()

# Train the GBRT (Original): cumulative feature groups on the wiki2vec 300d training data

In [None]:
# Training table generated above for the 300-d wiki2vec embeddings;
# the `y` column is the 0/1 target.
X = pd.read_csv("./data/GBRT/wiki2vec_w10_300d.pkl_train.csv")
y_train = X.loc[:, 'y'].to_numpy()

In [None]:
# Regressor shared by the feature-group fits below; random_state is fixed so
# repeated runs use the same seed.
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)

In [None]:
# Fit and save one model per cumulative feature group. The four fits differ
# only in how long a prefix of `features` they use and in the output file name,
# so they are driven from a single table instead of four copy-pasted cells.
#   base       -> priorProb, entityPrior, maxPriorProb, numCands
#   string_sim -> + editDist, mentionIsCand, mentionInCand, isStartorEnd
#   context    -> + contextSim
#   coherence  -> + coherence, rank (i.e. every feature)
feature_groups = [
    ('base', 4),
    ('string_sim', 8),
    ('context', 9),
    ('coherence', len(features)),
]

for group_name, n_features in feature_groups:
    # The same estimator object is refit for each group; warm_start is not
    # enabled, so each fit starts from scratch.
    model.fit(X[features[:n_features]].to_numpy(), y_train)
    save_model(model, f'./data/GBRT/{group_name}.pkl')

# Train GBRT on all features (one model per embedding)

In [4]:
# Hyper-parameters shared by every per-embedding model.
gbrt_params = dict(n_estimators=10000, learning_rate=0.02,
                   max_depth=4, random_state=0, verbose=True)
model = GradientBoostingRegressor(**gbrt_params)

embs = ([f"wiki2vec_w10_{i}d.pkl" for i in [100, 300]]
        + ['word2vec-google-news-300', 'glove-wiki-gigaword-300'])

# For each embedding: load its training table, fit on the full feature set
# and pickle the fitted regressor next to the training CSV.
for emb in embs:
    X = pd.read_csv(f"./data/GBRT/{emb}_train.csv")
    inputs = X[features].to_numpy()
    target = X['y'].to_numpy()
    model.fit(inputs, target)
    save_model(model, f"./data/GBRT/{emb}_trained.pkl")

      Iter       Train Loss   Remaining Time 
         1           0.0719          110.48m
         2           0.0698          109.41m
         3           0.0677           98.74m
         4           0.0657           90.78m
         5           0.0638           87.18m
         6           0.0619           85.11m
         7           0.0602           83.02m
         8           0.0584           81.49m
         9           0.0568           80.08m
        10           0.0552           79.73m
        20           0.0424           72.88m
        30           0.0335           71.25m
        40           0.0273           70.67m
        50           0.0231           70.20m
        60           0.0201           69.57m
        70           0.0180           69.21m
        80           0.0165           69.23m
        90           0.0154           69.11m
       100           0.0146           69.30m
       200           0.0120           67.76m
       300           0.0113           65.80m
       40