### Setup

In [1]:
import gc
import os
import pandas as pd

from src.base import Base
from src.gbrt import GBRT
from src.transformer import BaseTRF, GBRT_TRF
from src.utils import load_pickle
from test import aida_global, aida_local

In [3]:
EMB_PATH = os.path.join(os.getcwd(), 'embeddings')

embs  = ["wiki2vec_w10_100d.pkl", "word2vec-google-news-300",
         "glove-wiki-gigaword-300", "fasttext-wiki-news-subwords-300"]

entities = load_pickle('./data/aida/entities.pkl')
entities_filtered = load_pickle('./data/aida/entities_filtered.pkl')
entities_full = load_pickle('./data/aida/entities_full.pkl')

## Test Baseline Models

### Local Context

Investigate the effects of cased vs uncased

In [3]:
results = []
for emb in embs[1:]:
    res = [emb]
    model = Base(os.path.join(EMB_PATH, emb))
    model.entity_desc_dict = entities
    for case in [True, False]:
        model.cased = case
        acc, _ = aida_local(model)
        res.append(acc)
    results.append(res)
    model = None
    gc.collect()

# Wikipedia2Vec 100D
acc, _ = aida_local(Base(os.path.join(EMB_PATH, embs[0])))
results.append([embs[0], None, acc])

# Transformer
model = BaseTRF()
model.entity_desc_dict = entities
acc, _ = aida_local(model)
results.append(['Transformer Base', acc, None])

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (cased)', 'Accuracy (uncased)'])

100%|██████████| 231/231 [01:19<00:00,  2.90it/s]
100%|██████████| 231/231 [01:20<00:00,  2.85it/s]
100%|██████████| 231/231 [01:22<00:00,  2.81it/s]
100%|██████████| 231/231 [01:21<00:00,  2.83it/s]
100%|██████████| 231/231 [01:26<00:00,  2.66it/s]
100%|██████████| 231/231 [01:14<00:00,  3.11it/s]
100%|██████████| 231/231 [01:00<00:00,  3.85it/s]
100%|██████████| 231/231 [18:41<00:00,  4.86s/it]


Unnamed: 0,Embedding,Accuracy (cased),Accuracy (uncased)
0,word2vec-google-news-300,51.461,44.808
1,glove-wiki-gigaword-300,51.566,51.285
2,fasttext-wiki-news-subwords-300,43.647,42.344
3,wiki2vec_w10_100d.pkl,,65.118
4,Transformer Base,52.165,


Investigate the effects of keeping nouns only

In [4]:
results = []
for emb in embs:
    model = Base(os.path.join(EMB_PATH, emb), nouns_only=True,
                 cased=(emb in [embs[1], embs[3]]))
    model.entity_desc_dict = entities
    acc, _ = aida_local(model)
    results.append([emb, acc])
    model = None
    gc.collect()

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (Nouns Only)'])

100%|██████████| 231/231 [01:06<00:00,  3.46it/s]
100%|██████████| 231/231 [02:33<00:00,  1.51it/s]
100%|██████████| 231/231 [02:28<00:00,  1.56it/s]
100%|██████████| 231/231 [02:46<00:00,  1.39it/s]


Unnamed: 0,Embedding,Accuracy (Nouns Only)
0,wiki2vec_w10_100d.pkl,62.619
1,word2vec-google-news-300,51.109
2,glove-wiki-gigaword-300,49.243
3,fasttext-wiki-news-subwords-300,45.759


### Global Context

In [5]:
results = []
for emb in embs:
    model = Base(os.path.join(EMB_PATH, emb), cased=(emb in [embs[1], embs[3]]))
    model.entity_desc_dict = entities
    acc, _ = aida_local(model, use_document=True)
    results.append([emb, acc])
    model = None
    gc.collect()

# Transformer
model = BaseTRF()
model.entity_desc_dict = entities
acc, _ = aida_local(model, use_document=True)
results.append(['Transformer Base', acc])

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (Context = Whole Document)'])

100%|██████████| 231/231 [00:19<00:00, 12.00it/s]
100%|██████████| 231/231 [00:41<00:00,  5.54it/s]
100%|██████████| 231/231 [00:31<00:00,  7.35it/s]
100%|██████████| 231/231 [00:29<00:00,  7.77it/s]
100%|██████████| 231/231 [22:26<00:00,  5.83s/it] 


Unnamed: 0,Embedding,Accuracy (Context = Whole Document)
0,wiki2vec_w10_100d.pkl,57.709
1,word2vec-google-news-300,57.214
2,glove-wiki-gigaword-300,56.85
3,fasttext-wiki-news-subwords-300,41.334
4,Transformer Base,57.115


Investigating the effects of using global context with nouns only and using whole wikipedia page to encode an entity

In [6]:
results = []
for emb in embs[1:]:
    model = Base(os.path.join(EMB_PATH, emb), nouns_only=True,
                 cased=(emb in [embs[1], embs[3]]))
    model.entity_desc_dict = entities_filtered
    acc, _ = aida_local(model, use_document=True)
    results.append([emb, acc])
    model = None
    gc.collect()

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (Nouns Only, whole document, full desc)'])

100%|██████████| 231/231 [21:31<00:00,  5.59s/it]
100%|██████████| 231/231 [18:05<00:00,  4.70s/it]
100%|██████████| 231/231 [20:00<00:00,  5.20s/it]


Unnamed: 0,Embedding,"Accuracy (Nouns Only, whole document, full desc)"
0,word2vec-google-news-300,69.66
1,glove-wiki-gigaword-300,63.123
2,fasttext-wiki-news-subwords-300,57.016


In [5]:
model = BaseTRF()
model.entity_desc_dict = entities_full
acc, _ = aida_local(model, use_document=True)
print("Accuracy:", acc)

100%|██████████| 231/231 [1:21:44<00:00, 21.23s/it]

Accuracy: 57.214





## Test GBRT (and it's variations)

In [7]:
results = []
pretrained = ['base.pkl', 'string_sim.pkl', 'context.pkl']
for i in pretrained:
    model = GBRT(os.path.join(EMB_PATH, embs[0]), model_path=i)
    acc, res = aida_global(model)
    results.append([i[:-4], acc])
    model = None
    gc.collect()

pd.DataFrame(results, columns=['Model', 'Accuracy'])

100%|██████████| 231/231 [03:49<00:00,  1.01it/s]
100%|██████████| 231/231 [03:18<00:00,  1.16it/s]
100%|██████████| 231/231 [03:56<00:00,  1.02s/it]


Unnamed: 0,Model,Accuracy
0,base,84.913
1,string_sim,86.794
2,context,89.435


In [10]:
embs.insert(1, 'wiki2vec_w10_300d.pkl')

results = []
for emb in embs:
    model = GBRT(os.path.join(EMB_PATH, emb), model_path=f"{emb}_trained.pkl",
                 cased=(emb in [embs[2], embs[4]]))
    model.entity_desc_dict = entities_filtered
    res = []
    for x in [False, True]:
        model.two_step = x
        acc, _ = aida_global(model)
        res.append(acc)
    results.append([emb, *res])
    model = None
    gc.collect()
    
pd.DataFrame(results, columns=['Model', 'Accuracy', 'Accuracy (with two-step)'])

100%|██████████| 231/231 [03:56<00:00,  1.02s/it]
100%|██████████| 231/231 [03:24<00:00,  1.13it/s]
100%|██████████| 231/231 [04:06<00:00,  1.07s/it]
100%|██████████| 231/231 [04:05<00:00,  1.06s/it]
100%|██████████| 231/231 [03:38<00:00,  1.06it/s]
100%|██████████| 231/231 [03:13<00:00,  1.19it/s]
100%|██████████| 231/231 [20:06<00:00,  5.22s/it]
100%|██████████| 231/231 [24:46<00:00,  6.43s/it] 
100%|██████████| 231/231 [24:23<00:00,  6.34s/it]
100%|██████████| 231/231 [43:51<00:00, 11.39s/it] 
100%|██████████| 231/231 [31:09<00:00,  8.09s/it] 
100%|██████████| 231/231 [36:28<00:00,  9.47s/it] 


Unnamed: 0,Model,Accuracy,Accuracy (with two-step)
0,wiki2vec_w10_100d.pkl,92.242,92.473
1,wiki2vec_w10_300d.pkl,91.548,91.515
2,wiki2vec_w10_300d.pkl,91.548,91.515
3,word2vec-google-news-300,89.072,89.138
4,glove-wiki-gigaword-300,90.723,90.096
5,fasttext-wiki-news-subwords-300,89.204,89.369


In [7]:
model = GBRT_TRF(ranker_path="TRF_trained.pkl")
res = []
for x in [False, True]:
    model.two_step = x
    model.entity_desc_dict = entities
    acc, _ = aida_global(model)
    res.append(acc)

print(f"Transformer GBRT\nAccuracy\t\t{res[0]}\nAccuracy (two-step)\t{res[1]}")

Transformer GBRT
Accuracy		89.601
Accuracy (two-step)	89.634
