In [1]:
import warnings
warnings.filterwarnings('ignore')

# Setup

In [2]:
import jellyfish
import pandas as pd
import flair.datasets as datasets
from data.utils import getCandidates, getDocument
from src.models.gbrt import GBRT
from wikipedia2vec import Wikipedia2Vec
from tqdm import tqdm

In [3]:
EMB_PATH = "./embeddings/"
aida = datasets.NEL_ENGLISH_AIDA()
wiki2vec = Wikipedia2Vec.load(EMB_PATH + 'wiki2vec_w10_100d.pkl')
model = GBRT(wiki2vec)

2021-11-24 23:07:00,632 Reading data from C:\Users\athar\.flair\datasets\nel_english_aida
2021-11-24 23:07:00,634 Train: C:\Users\athar\.flair\datasets\nel_english_aida\train
2021-11-24 23:07:00,635 Dev: C:\Users\athar\.flair\datasets\nel_english_aida\testa
2021-11-24 23:07:00,636 Test: C:\Users\athar\.flair\datasets\nel_english_aida\testb


In [4]:
mentions_tags = []
doc = 1162
for i in aida.test:
	context = i.to_plain_string()
	if context != '-DOCSTART-':
		mentions_tags += [[j.text, j.tag, context, doc] for j in i.get_spans()]
	else:
		doc += 1

In [5]:
def get_candidates(mention, doc):
	df = getCandidates(doc, mention=mention)
	return [i.split('/')[-1] for i in df['url'].values]

# Initial Tests

Use sentence of the mention as the context

In [7]:
preds = []
for mention, tag, context, docNum in tqdm(mentions_tags):
	docText = getDocument(docNum)
	candidates = get_candidates(mention, docNum)
	if tag in candidates:
		pred, conf = model.link(mention, context, candidates)[0]
		preds.append([mention, tag, pred.replace(' ','_')])

accuracy = round((sum([1 for _, t, p in preds if t == p]) / len(preds)) * 100, 2)
print(f'Accuracy: {accuracy}%\nTotal test samples: {len(preds)}')

100%|██████████| 4497/4497 [00:49<00:00, 90.44it/s] 

Accuracy: 67.95%
Total test samples: 4250





Use the whole document as the context

In [8]:
preds = []
for mention, tag, context, docNum in tqdm(mentions_tags):
	docText = getDocument(docNum)
	candidates = get_candidates(mention, docNum)
	if tag in candidates:
		pred, conf = model.link(mention, docText, candidates)[0]
		preds.append([mention, tag, pred.replace(' ','_')])

accuracy = round((sum([1 for _, t, p in preds if t == p]) / len(preds)) * 100, 2)
print(f'Accuracy: {accuracy}%\nTotal test samples: {len(preds)}')

100%|██████████| 4497/4497 [03:08<00:00, 23.88it/s]

Accuracy: 64.38%
Total test samples: 4250





# Gathering Training Data

In [22]:
mentions_tags = []
doc = 0
for i in aida.train:
	context = i.to_plain_string()
	if context != '-DOCSTART-':
		mentions_tags += [[j.text, j.tag, doc] for j in i.get_spans()]
	else:
		doc += 1

In [23]:
def edit_dist(x, y): 
	return jellyfish.levenshtein_distance(x, y)

preds = []
for mention, tag, docNum in tqdm(mentions_tags):
	docText = getDocument(docNum)
	candidates = get_candidates(mention, docNum)
	if tag in candidates:
		scores = model.link(mention, docText, candidates)
		mention = mention.lower()
		for i, j in scores:
			i = i.replace('_', ' ')
			entry = [
				i,                           # Candidate entity
				mention,                     # Mention
				0,                           # Prior probability
				0,                           # Entity prior
				0,                           # Max prior prob of E of all mentions in the document
				len(scores),                 # Num candidates
				edit_dist(mention, i),       # Edit dist between mention and E title
				int(mention in i),           # If mention == E title or contained in it
				int(i.startswith(mention) or i.endswith(mention)),
				j,                           # Context similarity score
				0,                           # Coherence score
				0,                           # Rank
				int(i == tag)                # y (is ground truth?)
			]
			preds.append(entry)

100%|██████████| 18601/18601 [15:44<00:00, 19.68it/s]


In [24]:
train_data = pd.DataFrame(preds, columns=['candidate', 'mention', 'priorProb', 'entPrior', 'maxProb',
                          'numCands', 'editDist', 'contextInE', 'startOrEnds', 'contextSim', 'coherenceScore', 
                          'rank', 'y'])


In [25]:
train_data

Unnamed: 0,candidate,mention,priorProb,entPrior,maxProb,numCands,editDist,contextInE,startOrEnds,contextSim,coherenceScore,rank,y
0,German model,german,0,0,0,32,7,0,0,0.491980,0,0,0
1,German Party (1961),german,0,0,0,32,14,0,0,0.483690,0,0,0
2,"German, Iran",german,0,0,0,32,7,0,0,0.463202,0,0,0
3,"German Township, Kossuth County, Iowa",german,0,0,0,32,32,0,0,0.430681,0,0,0
4,German–Polish Non-Aggression Pact,german,0,0,0,32,28,0,0,0.422896,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
174076,Lincoln Park,lincoln,0,0,0,26,6,0,0,0.291625,0,0,0
174077,Lincoln (proposed Southern state),lincoln,0,0,0,26,27,0,0,0.278966,0,0,0
174078,Lincoln University (California),lincoln,0,0,0,26,25,0,0,0.278890,0,0,0
174079,Lincoln (band),lincoln,0,0,0,26,8,0,0,0.275661,0,0,0
