# Setup

In [1]:
import warnings

import flair.datasets as datasets
import pandas as pd
from flair.embeddings import BytePairEmbeddings, DocumentPoolEmbeddings, WordEmbeddings
from flair.embeddings import TransformerDocumentEmbeddings
from tqdm import tqdm

from src.ned import BasicFlairNED
from test import get_candidates

In [2]:
# Flatten the test split of the AIDA corpus into one record per annotated
# span: [mention text, gold tag, sentence text, document id].
aida = datasets.NEL_ENGLISH_AIDA()
mentions_tags = []
# Starting value of the document counter. Magic number: presumably chosen so
# the ids line up with what `get_candidates` expects -- TODO confirm.
doc = 1162
for sentence in aida.test:
	context = sentence.to_plain_string()
	if context == '-DOCSTART-':
		# Marker sentence: a new document starts here, so advance the id.
		doc += 1
		continue
	for span in sentence.get_spans():
		mentions_tags.append([span.text, span.tag, context, doc])

2021-11-16 21:14:16,527 Reading data from C:\Users\athar\.flair\datasets\nel_english_aida
2021-11-16 21:14:16,529 Train: C:\Users\athar\.flair\datasets\nel_english_aida\train
2021-11-16 21:14:16,531 Dev: C:\Users\athar\.flair\datasets\nel_english_aida\testa
2021-11-16 21:14:16,532 Test: C:\Users\athar\.flair\datasets\nel_english_aida\testb


In [3]:
# Entity description table; `candidates` reads its 'entity' and 'description' columns.
entity_desc = pd.read_csv('./data/entity_desc.csv')

def candidates(mention, doc):
	"""Return the candidate entities for ``mention`` in document ``doc``.

	Args:
		mention: Mention text, matched exactly against the ``forMention``
			column of the frame returned by ``get_candidates(doc)``.
		doc: Document identifier, passed through to ``get_candidates``.

	Returns:
		A list of ``[entity, description]`` lists, one per candidate URL and
		in the order the URLs appear. ``entity`` is the last ``/``-separated
		segment of the URL. ``description`` is the first matching
		``description`` in ``entity_desc``, or ``''`` when the entity has no
		row there or its description is missing (NaN).

	Raises:
		KeyError: If the candidate frame or ``entity_desc`` lacks one of the
			columns read here. (Previously this was silently turned into
			empty descriptions.)
	"""
	df = get_candidates(doc)
	names = [url.split('/')[-1] for url in df.loc[df['forMention'] == mention, 'url'].values]

	# Filter entity_desc once for all candidates instead of scanning it once
	# per candidate. drop_duplicates keeps the first row per entity, which is
	# the row the previous `.values[0]` lookup selected.
	known = entity_desc[entity_desc['entity'].isin(names)].drop_duplicates(subset='entity')

	# A blank CSV cell is read by pandas as NaN; leave such entities out so
	# they fall back to '' exactly like entities with no row at all.
	desc_by_entity = {
		entity: desc
		for entity, desc in zip(known['entity'], known['description'])
		if not pd.isna(desc)
	}
	return [[name, desc_by_entity.get(name, '')] for name in names]

# Testing base NED model using various types of embeddings

In [4]:
def test_model(model):
	"""Evaluate ``model`` on every mention collected in ``mentions_tags``.

	A mention is scored only when its gold tag is among the candidates
	returned by ``candidates(mention, doc)``; other mentions are skipped.
	Mentions for which ``model.link`` raises are skipped as well, but they are
	counted and reported in a single warning instead of being dropped
	silently.

	Args:
		model: Object exposing ``link(mention, context, candidates=...)`` that
			returns a ``(predicted_tag, description, confidence)`` triple.

	Returns:
		``(res, acc)`` where ``res`` is a DataFrame with the columns
		``mention``, ``tag``, ``predicted`` and ``confidence`` (one row per
		scored mention) and ``acc`` is the percentage of rows whose
		prediction equals the gold tag. ``acc`` is NaN when no mention could
		be scored, since accuracy is undefined in that case.
	"""
	preds = []
	n_failed = 0
	first_error = None
	for mention, tag, context, doc in tqdm(mentions_tags):
		cands = candidates(mention, doc)
		# The model can only be right if the gold tag is one of the candidates.
		if not any(cand[0] == tag for cand in cands):
			continue
		try:
			pred_tag, _desc, conf = model.link(mention, context, candidates=cands)
		except Exception as exc:
			# Best effort: one failing mention must not abort the whole run,
			# but keep track of it so the failure is visible afterwards.
			n_failed += 1
			if first_error is None:
				first_error = exc
			continue
		preds.append([mention, tag, pred_tag, conf])

	if n_failed:
		warnings.warn(
			f'{n_failed} mention(s) skipped because model.link raised; '
			f'first error: {first_error!r}'
		)

	res = pd.DataFrame(preds, columns=['mention', 'tag', 'predicted', 'confidence'])
	if res.empty:
		# Nothing was scored: avoid ZeroDivisionError and signal "undefined".
		return res, float('nan')
	acc = (res[res['tag'] == res['predicted']].shape[0] / res.shape[0]) * 100
	return res, acc

### Byte-Pair, 300d

In [18]:
# Byte-pair embeddings ('en', 300-d), pooled into a single document vector.
byte_pair = BytePairEmbeddings('en', dim=300)
bp_doc_emb = DocumentPoolEmbeddings([byte_pair], fine_tune_mode='nonlinear')

# This run uses the uncased setting (cased=False).
bp_model = BasicFlairNED(bp_doc_emb, cased=False)
bp_res, bp_acc = test_model(bp_model)

bp_res.to_csv('./results/base_byte_pair.csv')
print('Byte Pair', bp_acc)
# Recorded accuracy -- cased: 58.42 %, uncased: 57.60 %

100%|██████████| 4497/4497 [06:41<00:00, 11.20it/s]

Byte Pair 57.59609517998779





### GloVe Wiki-Gigaword, 300d

In [6]:
# Word vectors loaded from the local glove-wiki-gigaword-300 file, pooled into
# a single document vector.
glove = WordEmbeddings('./embeddings/glove-wiki-gigaword-300')
glove_doc_emb = DocumentPoolEmbeddings([glove], fine_tune_mode='nonlinear')

# This run uses the uncased setting (cased=False).
glove_model = BasicFlairNED(glove_doc_emb, cased=False)
glove_res, glove_acc = test_model(glove_model)

glove_res.to_csv('./results/base_glove.csv')
print('Glove', glove_acc)
# Recorded accuracy -- cased: 61.07 %, uncased: 63.03 %

100%|██████████| 4497/4497 [05:50<00:00, 12.83it/s]

Glove 63.02623550945698





### FastText Wiki-News Subwords, 300d

In [7]:
# Word vectors loaded from the local fasttext-wiki-news-subwords-300 file,
# pooled into a single document vector.
ftext = WordEmbeddings('./embeddings/fasttext-wiki-news-subwords-300')
ftext_doc_emb = DocumentPoolEmbeddings([ftext], fine_tune_mode='nonlinear')

# `cased` is not passed, so BasicFlairNED's default applies.
ftext_model = BasicFlairNED(ftext_doc_emb)
ftext_res, ftext_acc = test_model(ftext_model)

ftext_res.to_csv('./results/base_fasttext.csv')
print('Fasttext', ftext_acc)
# Recorded accuracy -- cased: 50.03 %, uncased: 48.81 %
# NOTE(review): the output displayed for this cell (49.60) matches neither
# figure -- confirm which configuration produced the recorded numbers.

100%|██████████| 4497/4497 [06:04<00:00, 12.33it/s]

Fasttext 49.603416717510676





### Word2Vec Google News, 300d

In [8]:
# Word vectors loaded from the local word2vec-google-news-300 file, pooled
# into a single document vector.
w2v = WordEmbeddings('./embeddings/word2vec-google-news-300')
w2v_doc_emb = DocumentPoolEmbeddings([w2v], fine_tune_mode='nonlinear')

# `cased` is not passed, so BasicFlairNED's default applies.
w2v_model = BasicFlairNED(w2v_doc_emb)
w2v_res, w2v_acc = test_model(w2v_model)

w2v_res.to_csv('./results/base_word2vec.csv')
print('Word2Vec', w2v_acc)
# Recorded accuracy -- cased: 58.69 %, uncased: 55.46 %

100%|██████████| 4497/4497 [05:38<00:00, 13.29it/s]

Word2Vec 58.69432580841977





In [9]:
# Disabled: RoBERTa document-embedding baseline. Uncomment the lines below to run it.
# roberta_doc_emb = TransformerDocumentEmbeddings('roberta-base')
# roberta_res, roberta_acc = test_model(BasicFlairNED(roberta_doc_emb))
# roberta_res.to_csv('./results/base_roberta.csv')
# print('Roberta Base', roberta_acc)

**Notes**

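- **TODO:** Fix the instances that raise an error during testing. `test_model` currently skips them, so they are excluded from the reported accuracy.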