# Setup

In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from src.GBRT import GBRT
from src.GBRT.utils import *
from src.utils import cos_sim, get_document
from tqdm import tqdm

# Location of the pickled embeddings file handed to GBRT(...) below.
# NOTE(review): this is an absolute, machine-specific Windows path; it has to
# be adjusted before the notebook can run on another machine.
EMB_PATH = "C:\\Personal Files\\NED-using-KG\\embeddings\\wiki2vec_w10_100d.pkl"

In [None]:
def save_model(model, fname):
    """Pickle ``model`` into the file at ``fname``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        model: Any picklable object (here: a fitted regressor).
        fname: Destination path of the pickle file.
    """
    with open(fname, mode='wb') as handle:
        pickle.Pickler(handle).dump(model)

# Generate training data

In [None]:
# Build one feature table per candidate file (indices 1..1162) and stack
# them all into a single frame X.
dfs = []
gbrt = GBRT(EMB_PATH)
for doc_id in tqdm(range(1, 1163)):
    data = pd.read_csv(f'./data/aida/candidates/{doc_id}.csv')
    # Result is handed to Series.map below, so it is presumably a mapping or
    # callable keyed by candidate -- confirm against get_max_prior_prob.
    max_prob = get_max_prior_prob(data['mention'].unique(),
                                  data['candidate'].unique())

    # --- Base features ---
    data['priorProb'] = [
        get_prior_prob(cand, ment)
        for cand, ment in zip(data['candidate'], data['mention'])
    ]
    data['entityPrior'] = data['candidate'].map(get_entity_prior)
    data['maxPriorProb'] = data['candidate'].map(max_prob)

    # --- String similarity features ---
    # Compare lower-cased mentions with lower-cased candidate titles whose
    # underscores have been turned into spaces.
    lowered_mentions = [ment.lower() for ment in data['mention']]
    lowered_cands = [cand.lower().replace('_', ' ')
                     for cand in data['candidate']]
    pairs = list(zip(lowered_mentions, lowered_cands))
    data['editDist'] = [get_edit_dist(ment, cand) for ment, cand in pairs]
    data['mentionIsCand'] = [ment == cand for ment, cand in pairs]
    data['mentionInCand'] = [ment in cand for ment, cand in pairs]
    data['isStartorEnd'] = [
        cand.startswith(ment) or cand.endswith(ment) for ment, cand in pairs
    ]

    # --- Context based features ---
    # Context similarity: cosine between each candidate's entity embedding
    # and the encoding of the whole document text.
    context_emb = gbrt.encode_sentence(get_document(doc_id))
    data['contextSim'] = data['candidate'].map(
        lambda cand: cos_sim(gbrt.encode_entity(cand), context_emb))
    # Coherence: cosine against the encoding of the candidates treated as
    # unambiguous, i.e. those with priorProb >= 0.95.
    confident = data['priorProb'] >= 0.95
    unamb_entities = data.loc[confident, 'candidate'].unique()
    context_ent_emb = gbrt.encode_context_entities(unamb_entities)
    data['coherence'] = data['candidate'].map(
        lambda cand: cos_sim(gbrt.encode_entity(cand), context_ent_emb))
    # The rank feature is not computed here; it is added in a separate pass
    # over the concatenated frame.

    # --- Ground truth: 1 when the candidate equals the gold tag, else 0 ---
    data['y'] = data['candidate'].eq(data['tag']).map(int)
    dfs.append(data)

X = pd.concat(dfs, ignore_index=True)

In [None]:
# Add rank.
# X is walked as consecutive groups of rows: the 'numCands' value of the first
# row of a group says how many rows belong to it. Inside each group the rows
# are ordered by score = contextSim + coherence (highest first) and receive
# rank 1..n.
dfs = []
start = 0
n_rows = X.shape[0]
while start < n_rows:
    # int() normalises numpy / float-typed counts before they are used as a
    # slice bound.
    n = int(X.iloc[start]['numCands'])
    if n <= 0:
        # A non-positive group size would never advance the cursor, i.e. the
        # loop would spin forever (or mis-slice for negative values).
        raise ValueError(
            f"numCands must be positive, got {n} at row {start}")
    temp = X.iloc[start:start + n].copy()
    temp['score'] = temp.contextSim + temp.coherence
    temp = temp.sort_values(by=['score'], ascending=False).reset_index(drop=True)
    # After the reset the positional index is 0..n-1, so index + 1 is the rank.
    temp['rank'] = temp.index + 1
    dfs.append(temp)
    start += n

# Number of groups that were ranked.
print(len(dfs))
X = pd.concat(dfs).reset_index(drop=True)
X.to_csv('./data/GBRT/train.csv', index=False)

# Train the GBRT Regressor

In [None]:
# Reload the saved training table so the training cells below can run without
# regenerating the features.
X = pd.read_csv('./data/GBRT/train.csv')
# Bare expression: displays the frame as the notebook cell output.
X

In [None]:
# Target vector and feature frame.
# X_train keeps every column that is not dropped here; when X was loaded from
# the train.csv written by the ranking cell, that includes the helper 'score'
# column in addition to the features named below.
y_train = X['y'].to_numpy()
X_train = X.drop(columns=['mention', 'candidate', 'tag', 'y'])

# Cumulative feature sets: each one extends the previous set.
BASE = ['priorProb', 'entityPrior', 'maxPriorProb', 'numCands']
STRING_SIM = [*BASE, 'editDist', 'mentionIsCand', 'mentionInCand', 'isStartorEnd']
CONTEXT = [*STRING_SIM, 'contextSim']
ALL = [*CONTEXT, 'coherence', 'rank']

In [None]:
# Model 1: fitted on the BASE feature columns only.
base_features = X_train[BASE].to_numpy()
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model.fit(base_features, y_train)
save_model(model, './data/GBRT/base.pkl')

In [None]:
# Model 2: BASE plus the string-similarity columns (STRING_SIM).
string_sim_features = X_train[STRING_SIM].to_numpy()
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model.fit(string_sim_features, y_train)
save_model(model, './data/GBRT/string_sim.pkl')

In [None]:
# Model 3: STRING_SIM plus the context-similarity column (CONTEXT).
context_features = X_train[CONTEXT].to_numpy()
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model.fit(context_features, y_train)
save_model(model, './data/GBRT/context.pkl')

In [None]:
# Model 4: every feature in ALL except its last entry ('rank').
no_rank_columns = ALL[:-1]
no_rank_features = X_train[no_rank_columns].to_numpy()
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model.fit(no_rank_features, y_train)
save_model(model, './data/GBRT/coherence_no_rank.pkl')

In [None]:
# Model 5: fitted on the complete X_train frame.
# NOTE(review): unlike the other models this one does not select columns by
# name, so every column left in X_train becomes a feature (in the frame's own
# column order) rather than exactly the ALL list -- confirm this is intended
# and that inference builds its feature matrix the same way.
full_features = X_train.to_numpy()
model = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    max_depth=4,
    random_state=0,
    verbose=True,
)
model.fit(full_features, y_train)
save_model(model, './data/GBRT/coherence.pkl')