### Setup

In [4]:
import os

import pandas as pd

from src.base import Base, BaseWiki2Vec
from src.gbrt import GBRT, GBRT2
from src.utils import test_local, test_global, load_json

# Directory that holds the pretrained embedding files. It is used as a plain
# string prefix (EMB_PATH + file name), so the value must end with a path
# separator. Set the NED_EMB_PATH environment variable to run the notebook on
# another machine; when it is unset the original local path is used.
EMB_PATH = os.environ.get("NED_EMB_PATH", "C:\\Personal Files\\NED-using-KG\\embeddings\\")

## Test Baseline Models

In [2]:
# Embedding files (names relative to EMB_PATH) scored with the plain `Base` model.
embs = [
    'word2vec-google-news-300',
    'glove-wiki-gigaword-300',
    'fasttext-wiki-news-subwords-300',
    'en.wiki.bpe.vs200000.d300.w2v',
]
base_results = []

# Normal baselines: each embedding is loaded once and evaluated twice, first
# with `cased=True` and then with `cased=False`, which is the same order as the
# two accuracy columns of the table built at the end of this cell.
for emb in embs:
    model = Base(EMB_PATH + emb)
    row = [emb]
    for cased in (True, False):
        model.cased = cased
        acc, _ = test_local(model)
        row += [acc]
    base_results += [row]

# Wikipedia2Vec baseline: evaluated a single time; its accuracy is stored in
# the uncased column and the cased column is left as None.
acc, _ = test_local(
    BaseWiki2Vec(EMB_PATH + "wiki2vec_w10_100d.pkl")
)
base_results.append(['wiki2vec_w10_100d', None, acc])

# Results: one row per embedding; the last expression is the cell's output.
pd.DataFrame(
    base_results,
    columns=['Embedding', 'Accuracy (cased)', 'Accuracy (uncased)'],
)

100%|██████████| 231/231 [03:37<00:00,  1.06it/s]
100%|██████████| 231/231 [03:45<00:00,  1.02it/s]
100%|██████████| 231/231 [02:57<00:00,  1.30it/s]
100%|██████████| 231/231 [02:29<00:00,  1.55it/s]
100%|██████████| 231/231 [02:48<00:00,  1.37it/s]
100%|██████████| 231/231 [02:45<00:00,  1.40it/s]
100%|██████████| 231/231 [02:42<00:00,  1.42it/s]
100%|██████████| 231/231 [02:43<00:00,  1.42it/s]
100%|██████████| 231/231 [01:18<00:00,  2.95it/s]


Unnamed: 0,Embedding,Accuracy (cased),Accuracy (uncased)
0,word2vec-google-news-300,52.27,46.779
1,glove-wiki-gigaword-300,43.999,52.024
2,fasttext-wiki-news-subwords-300,40.655,39.845
3,en.wiki.bpe.vs200000.d300.w2v,32.981,45.583
4,wiki2vec_w10_100d,65.153,65.153


## Test GBRT (and its variations)

In [3]:
# Pretrained GBRT model files, evaluated in this order on the same
# Wikipedia2Vec embedding file.
pretrained = ['base.pkl', 'string_sim.pkl', 'context.pkl', 'coherence.pkl']
results = []

for ckpt in pretrained:
    model = GBRT(EMB_PATH + "wiki2vec_w10_100d.pkl", model_path=ckpt)
    acc, res = test_global(model)
    # Row: file name minus its last four characters (the ".pkl" suffix), the
    # accuracy, and the first dimension of `res` ("Mentions Tested" column).
    results.append([ckpt[:-4], acc, res.shape[0]])

# Two-step variant: reuses the model and file name left over from the final
# loop iteration (the last entry of `pretrained`) with `two_step` switched on,
# so exactly one extra row is added.
model.two_step = True
acc, res = test_global(model)
two_step_label = ckpt[:-4] + ' (two - step)'
results.append([two_step_label, acc, res.shape[0]])

pd.DataFrame(
    results,
    columns=['Model', 'Accuracy', 'Mentions Tested'],
)

100%|██████████| 231/231 [07:49<00:00,  2.03s/it]
100%|██████████| 231/231 [07:33<00:00,  1.96s/it]
100%|██████████| 231/231 [09:02<00:00,  2.35s/it]
100%|██████████| 231/231 [09:16<00:00,  2.41s/it]
100%|██████████| 231/231 [07:06<00:00,  1.85s/it]


Unnamed: 0,Model,Accuracy,Mentions Tested
0,base,84.913,3029
1,string_sim,86.893,3029
2,context,90.063,3029
3,coherence,92.803,3029
4,coherence (two - step),92.539,3029


In [3]:
# Compare two Wikipedia2Vec embedding files that differ only in the number
# embedded in the file name (100 and 300).
embs = [f"wiki2vec_w10_{dim}d.pkl" for dim in (100, 300)]

for emb in embs:
    # The model file name is the embedding file name (including its ".pkl")
    # followed by "_trained.pkl".
    model = GBRT(
        EMB_PATH + emb,
        model_path=f"{emb}_trained.pkl",
    )
    acc, _ = test_global(model)
    print(emb, acc)

  0%|          | 0/231 [00:00<?, ?it/s]

wiki2vec_w10_100d.pkl 92.803


  0%|          | 0/231 [00:00<?, ?it/s]

wiki2vec_w10_300d.pkl 92.044


In [5]:
# GBRT2 run on the GloVe embedding file. The JSON file below is loaded and
# attached to the model as `entity_desc_dict` before evaluation.
# NOTE(review): this is an absolute local path — confirm it exists on the
# machine running the notebook.
entities_path = "C:\\Personal Files\\NED-using-KG\\data\\aida\\entities.json"

model = GBRT2(
    EMB_PATH + 'glove-wiki-gigaword-300',
    model_path="glove-wiki-gigaword-300_trained.pkl",
)
model.entity_desc_dict = load_json(entities_path)

# `is_wiki2vec=False` is passed explicitly here; presumably because the
# embedding is GloVe rather than Wikipedia2Vec — verify against `test_global`.
acc, res = test_global(model, is_wiki2vec=False)
acc

  0%|          | 0/231 [00:00<?, ?it/s]

89.997