### Setup

In [1]:
import gc
import os
import pandas as pd

from src.base import Base
from src.gbrt import GBRT
from test import aida_global, aida_local

# Directory containing the embedding files; resolved against the working
# directory of the kernel at the time this cell runs.
EMB_PATH = os.path.join(os.getcwd(), "embeddings")

## Test Baseline Models

In [2]:
# Baseline comparison: every embedding is loaded into a `Base` model and scored
# with `aida_local` twice -- first with `nouns_only` set to False, then True.
embs = [
    'word2vec-google-news-300',
    'glove-wiki-gigaword-300',
    'fasttext-wiki-news-subwords-300',
    'en.wiki.bpe.vs200000.d300.w2v',
    'wiki2vec_w10_100d.pkl',
]

results = []
for emb in embs:
    res = [emb]
    # Load each embedding once and reuse the model for both settings.
    model = Base(os.path.join(EMB_PATH, emb))
    # The loop variable is deliberately not called `filter`: at notebook top
    # level that would rebind the builtin `filter` for every later cell.
    for nouns_only in [False, True]:
        model.nouns_only = nouns_only
        # Only the first value returned by `aida_local` (accuracy) is tabulated.
        acc, _ = aida_local(model)
        res.append(acc)
    results.append(res)

# Results
pd.DataFrame(results, columns=['Embedding', 'Accuracy', 'Accuracy (Nouns Only)'])

100%|██████████| 231/231 [01:16<00:00,  3.03it/s]
100%|██████████| 231/231 [02:55<00:00,  1.32it/s]
100%|██████████| 231/231 [01:17<00:00,  2.99it/s]
100%|██████████| 231/231 [02:44<00:00,  1.41it/s]
100%|██████████| 231/231 [01:12<00:00,  3.18it/s]
100%|██████████| 231/231 [02:45<00:00,  1.39it/s]
100%|██████████| 231/231 [01:19<00:00,  2.91it/s]
100%|██████████| 231/231 [02:52<00:00,  1.34it/s]
100%|██████████| 231/231 [01:00<00:00,  3.83it/s]
100%|██████████| 231/231 [00:59<00:00,  3.89it/s]


Unnamed: 0,Embedding,Accuracy,Accuracy (Nouns Only)
0,word2vec-google-news-300,50.334,50.757
1,glove-wiki-gigaword-300,38.93,26.892
2,fasttext-wiki-news-subwords-300,42.555,44.914
3,en.wiki.bpe.vs200000.d300.w2v,25.554,21.331
4,wiki2vec_w10_100d.pkl,65.153,61.211


In [3]:
# Case-sensitivity comparison: every embedding except the last entry of `embs`
# is scored with `cased` set to True and then to False.
results = []
for emb in embs[:-1]:
    model = Base(os.path.join(EMB_PATH, emb))
    res = [emb]
    for case in (True, False):
        model.cased = case
        acc, _ = aida_local(model)
        res += [acc]
    results += [res]

# Wikipedia2Vec baseline: evaluated once with the model's settings as
# constructed; the score goes in the "uncased" column and the "cased" column
# is left empty.
wiki2vec_path = os.path.join(EMB_PATH, "wiki2vec_w10_100d.pkl")
acc, _ = aida_local(Base(wiki2vec_path))
results += [['wiki2vec_w10_100d', None, acc]]

# Results
columns = ['Embedding', 'Accuracy (cased)', 'Accuracy (uncased)']
pd.DataFrame(results, columns=columns)

100%|██████████| 231/231 [01:22<00:00,  2.80it/s]
100%|██████████| 231/231 [01:21<00:00,  2.85it/s]
100%|██████████| 231/231 [01:21<00:00,  2.84it/s]
100%|██████████| 231/231 [01:22<00:00,  2.78it/s]
100%|██████████| 231/231 [01:22<00:00,  2.81it/s]
100%|██████████| 231/231 [01:22<00:00,  2.80it/s]
100%|██████████| 231/231 [01:21<00:00,  2.85it/s]
100%|██████████| 231/231 [01:23<00:00,  2.76it/s]
100%|██████████| 231/231 [01:00<00:00,  3.84it/s]


Unnamed: 0,Embedding,Accuracy (cased),Accuracy (uncased)
0,word2vec-google-news-300,50.334,44.562
1,glove-wiki-gigaword-300,38.93,50.088
2,fasttext-wiki-news-subwords-300,42.555,41.711
3,en.wiki.bpe.vs200000.d300.w2v,25.554,41.922
4,wiki2vec_w10_100d,,65.153


## Test GBRT (and its variations)

In [4]:
# Score one GBRT model per pretrained file with `aida_global`, always on the
# same Wikipedia2Vec embedding, then re-score the last model in two-step mode.
results = []
pretrained = ['base.pkl', 'string_sim.pkl', 'context.pkl', 'coherence.pkl']
for model_file in pretrained:
    model = GBRT(os.path.join(EMB_PATH, "wiki2vec_w10_100d.pkl"), model_path=model_file)
    # The second value returned by `aida_global` is not used in this cell.
    acc, _ = aida_global(model)
    # Label each row with the file name minus its extension.
    results.append([os.path.splitext(model_file)[0], acc])

# Two-step variant: `model` is still the one built from the last entry of
# `pretrained`; switch on `two_step` and evaluate that same model once more.
model.two_step = True
acc, _ = aida_global(model)
results.append([os.path.splitext(pretrained[-1])[0] + ' (two - step)', acc])
pd.DataFrame(results, columns=['Model', 'Accuracy'])

100%|██████████| 231/231 [03:34<00:00,  1.08it/s]
100%|██████████| 231/231 [03:45<00:00,  1.02it/s]
100%|██████████| 231/231 [03:52<00:00,  1.01s/it]
100%|██████████| 231/231 [03:58<00:00,  1.03s/it]
100%|██████████| 231/231 [04:05<00:00,  1.06s/it]


Unnamed: 0,Model,Accuracy
0,base,84.913
1,string_sim,86.893
2,context,89.601
3,coherence,92.737
4,coherence (two - step),92.539


In [5]:
# Score a GBRT model with `aida_global` for each embedding in the list below.
# NOTE(review): this rebinds the global `embs` defined in the baseline cell, so
# re-running an earlier cell that reads `embs` after this one will iterate over
# this list instead.
embs = [f"wiki2vec_w10_{dim}d.pkl" for dim in (100, 300)] + [
    'word2vec-google-news-300',
    'glove-wiki-gigaword-300',
]

results = []
for emb in embs:
    # `cased` is True only when the embedding name contains 'word2vec'.
    is_cased = 'word2vec' in emb
    model = GBRT(
        os.path.join(EMB_PATH, emb),
        model_path=f"{emb}_trained.pkl",
        cased=is_cased,
    )
    acc, _ = aida_global(model)
    results.append([emb, acc])
    # Drop the reference and collect before the next model is constructed.
    model = None
    gc.collect()

pd.DataFrame(results, columns=['Model', 'Accuracy'])

100%|██████████| 231/231 [04:48<00:00,  1.25s/it]
100%|██████████| 231/231 [04:38<00:00,  1.21s/it]
100%|██████████| 231/231 [06:44<00:00,  1.75s/it]
100%|██████████| 231/231 [05:39<00:00,  1.47s/it]


Unnamed: 0,Model,Accuracy
0,wiki2vec_w10_100d.pkl,92.506
1,wiki2vec_w10_300d.pkl,91.779
2,word2vec-google-news-300,91.746
3,glove-wiki-gigaword-300,91.383
