### Setup

In [1]:
import gc
import os
import pandas as pd

from src.base import Base
from src.gbrt import GBRT
from src.transformer import BaseTRF, GBRT_TRF
from test import aida_global, aida_local

# Directory holding the embedding files, resolved against the current working directory.
EMB_PATH = os.path.join(os.getcwd(), "embeddings")

# Embedding identifiers. Later cells address entries by position (e.g. embs[0],
# embs[2:]), so the order of this list must not change.
embs = [
    "wiki2vec_w10_100d.pkl",
    "wiki2vec_w10_300d.pkl",
    "word2vec-google-news-300",
    "glove-wiki-gigaword-300",
    "fasttext-wiki-news-subwords-300",
    "en.wiki.bpe.vs200000.d300.w2v",
]

## Test Baseline Models

In [2]:
# Transformer baseline: evaluated once; its score goes in the cased column
# and the uncased column is left empty.
model = BaseTRF()
acc, _ = aida_local(model)
results = [['Transformer Base', acc, None]]
# Unbind the transformer before collecting: gc.collect() cannot free an object
# that `model` still references, and we do not want it resident while the
# first embedding model is being loaded.
model = None
gc.collect()

# embs[2:] skips the two Wikipedia2Vec files; each remaining embedding is
# loaded once and evaluated with model.cased toggled True then False.
for emb in embs[2:]:
    model = Base(os.path.join(EMB_PATH, emb))
    res = [emb]
    for case in [True, False]:
        model.cased = case
        acc, _ = aida_local(model)
        res.append(acc)
    results.append(res)
    # Release this model before the next one is constructed.
    model = None
    gc.collect()

# Wikipedia2Vec 100D baseline
# Built with Base's default settings and recorded in the uncased column only
# (presumably Base defaults to cased=False -- TODO confirm against src.base).
model = Base(os.path.join(EMB_PATH, embs[0]))
acc, _ = aida_local(model)
results.append([embs[0], None, acc])
model = None
gc.collect()

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (cased)', 'Accuracy (uncased)'])

Unnamed: 0,Embedding,Accuracy (cased),Accuracy (uncased)
0,Transformer Base,52.165,
1,word2vec-google-news-300,50.334,44.562
2,glove-wiki-gigaword-300,50.088,50.088
3,fasttext-wiki-news-subwords-300,42.555,41.711
4,en.wiki.bpe.vs200000.d300.w2v,41.957,41.922
5,wiki2vec_w10_100d.pkl,,65.153


In [4]:
# Same baselines as before, but aida_local is called with use_document=True.
model = BaseTRF()
trf_acc, _ = aida_local(model, use_document=True)
results = [['Transformer Base', trf_acc]]

# Embeddings that are constructed with cased=True; all others get cased=False.
cased_embs = (embs[2], embs[4], embs[5])

for emb_name in embs:
    emb_file = os.path.join(EMB_PATH, emb_name)
    model = Base(emb_file, cased=emb_name in cased_embs)
    doc_acc, _ = aida_local(model, use_document=True)
    results.append([emb_name, doc_acc])
    # Drop the reference first so the collector can actually reclaim the model.
    model = None
    gc.collect()

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (Context = Whole Document)'])

Unnamed: 0,Embedding,Accuracy (Context = Whole Document)
0,Transformer Base,57.115
1,wiki2vec_w10_100d.pkl,56.388
2,word2vec-google-news-300,55.695
3,glove-wiki-gigaword-300,54.473
4,fasttext-wiki-news-subwords-300,41.334
5,en.wiki.bpe.vs200000.d300.w2v,49.092


In [2]:
# Evaluate every embedding with Base(nouns_only=True).
results = []

# Embeddings that are constructed with cased=True; all others get cased=False.
cased_embs = (embs[2], embs[4], embs[5])

for emb_name in embs:
    emb_file = os.path.join(EMB_PATH, emb_name)
    model = Base(emb_file, nouns_only=True, cased=emb_name in cased_embs)
    nouns_acc, _ = aida_local(model)
    results.append([emb_name, nouns_acc])
    # Drop the reference first so the collector can actually reclaim the model.
    model = None
    gc.collect()

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy (Nouns Only)'])

Unnamed: 0,Embedding,Accuracy (Nouns Only)
0,wiki2vec_w10_100d.pkl,61.211
1,wiki2vec_w10_300d.pkl,59.873
2,word2vec-google-news-300,50.722
3,glove-wiki-gigaword-300,47.8
4,fasttext-wiki-news-subwords-300,44.949
5,en.wiki.bpe.vs200000.d300.w2v,42.485


## Test GBRT (and its variations)

In [4]:
# Evaluate the pretrained GBRT rankers, all on the Wikipedia2Vec 100D
# embedding (embs[0]), using aida_global.
results = []
pretrained = ['base.pkl', 'string_sim.pkl', 'context.pkl']
for model_file in pretrained:
    model = GBRT(os.path.join(EMB_PATH, embs[0]), model_path=model_file)
    # Only the accuracy is needed; the second return value is discarded.
    acc, _ = aida_global(model)
    # Label the row with the filename minus its extension. splitext does not
    # assume a four-character suffix the way a fixed [:-4] slice does.
    results.append([os.path.splitext(model_file)[0], acc])
    # Unbind before collecting so the model can actually be reclaimed.
    model = None
    gc.collect()

pd.DataFrame(results, columns=['Model', 'Accuracy'])

Unnamed: 0,Model,Accuracy
0,base,84.913
1,string_sim,86.794
2,context,90.195


In [5]:
# Each ranker is evaluated twice with aida_global: once with two_step=False
# and once with two_step=True, filling the two accuracy columns in that order.
results = []

# Transformer-based ranker.
model = GBRT_TRF(ranker_path="TRF_trained.pkl")
trf_row = ["Transformer GBRT"]
for two_step in [False, True]:
    model.two_step = two_step
    acc, _ = aida_global(model)
    trf_row.append(acc)
results.append(trf_row)
# Unbind the transformer ranker before collecting: gc.collect() cannot free
# an object `model` still references, and it should not stay resident while
# the first embedding model is being loaded.
model = None
gc.collect()

# Embedding-based rankers, one per entry in embs.
for emb in embs:
    model = GBRT(os.path.join(EMB_PATH, emb), model_path=f"{emb}_trained.pkl",
                 cased=(emb in [embs[2], embs[4], embs[5]]))
    row = [emb]
    for two_step in [False, True]:
        model.two_step = two_step
        acc, _ = aida_global(model)
        row.append(acc)
    results.append(row)
    # Release this model (including the last one) before moving on.
    model = None
    gc.collect()

pd.DataFrame(results, columns=['Model', 'Accuracy', 'Accuracy (with two-step)'])

Unnamed: 0,Model,Accuracy,Accuracy (with two-step)
0,Transformer GBRT,89.601,89.634
1,wiki2vec_w10_100d.pkl,92.506,92.704
2,wiki2vec_w10_300d.pkl,91.779,91.713
3,word2vec-google-news-300,91.746,91.482
4,glove-wiki-gigaword-300,91.119,91.317
5,fasttext-wiki-news-subwords-300,89.931,89.7
6,en.wiki.bpe.vs200000.d300.w2v,90.327,89.568
