In [1]:
from IPython.display import display
from pprint import pprint
from flair.embeddings import BytePairEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.embeddings import DocumentPoolEmbeddings, DocumentLSTMEmbeddings

from src.ned import BasicFlairNED
from src.utils import WikiDataSearch

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Hand-picked ambiguous mentions. Each entry starts as [mention, context sentence];
# the loop below appends a third element holding that mention's candidates.
examples = [
    ["London", "Crime is on the rise in London, which is of grave concern to the regional government in Ontario"],
    ["jaguar", "the prey saw the jaguar cross the jungle"],
    ["blackberry", "The blackberry is an edible fruit produced by many species in the Rubus genus in the Rosaceae family, hybrids among these species within the Rubus subgenus, and hybrids between the Rubus and Idaeobatus subgenera"],
    ["BlackBerry", "During the second financial quarter of 2013, BlackBerry sold 6.8 million handsets, but was eclipsed by the sales of competitor Nokia's Lumia model for the first time"],
    ["string", "You can manipulate strings in Python"],
    ["python", "A python is typically not venomous"],
    ["queen", "In England, the queen still has a lot of power"],
]

# Attach the candidates to every example in place, so each entry becomes
# [mention, context, candidates] -- the three-element shape the linking cells unpack.
# Distinct names are used for the example and for each search hit: the earlier
# version reused `i` for both and only worked because of comprehension scoping.
for example in examples:
    # Search on the mention text (first element), asking for num_results=5.
    search_hits = WikiDataSearch(example[0], num_results=5)
    # Keep only the 'label' and 'description' fields of each hit, as a tuple.
    example.append([(hit['label'], hit['description']) for hit in search_hits])

In [3]:
# Token-level embeddings: GloVe, byte-pair (dim=300), and a stack of the two.
glove = WordEmbeddings('en-glove')
byte_pair = BytePairEmbeddings('en', dim=300)
stacked = StackedEmbeddings([glove, byte_pair])

# Document-level embeddings: one pooled and one LSTM wrapper per word embedding.
# Both lists follow the order of word_embs (glove, byte_pair, stacked), which is
# also the order in which results are printed further down.
# NOTE(review): nothing is trained or seeded in this cell; if these wrappers
# initialise weights randomly, scores will differ between runs -- confirm.
word_embs = [glove, byte_pair, stacked]
doc_pool_emb = [
    DocumentPoolEmbeddings([word_emb], fine_tune_mode='nonlinear')
    for word_emb in word_embs
]
doc_lstm_emb = [
    DocumentLSTMEmbeddings([word_emb])
    for word_emb in word_embs
]

**How does filtering stopwords affect the accuracy?**

Filtering yields higher accuracy.

In [4]:
# Link every example with each pooled document embedding, passing filter=False
# to link(). Runtime noted by the author for this cell: about 4.5s.
# enumerate(start=1) numbers the examples without a hand-maintained counter.
for example_no, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {example_no}')
    # One printed result per embedding, in doc_pool_emb order.
    for doc_embedding in doc_pool_emb:
        linker = BasicFlairNED(doc_embedding)
        print(linker.link(mention, context, candidates=candidates, filter=False))
    print('')

Example 1
['London Recordings', 'record label headquartered in the United Kingdom', 0.9194797873497009]
['London', 'capital and largest city of the United Kingdom', 0.8630125522613525]
['London', 'capital and largest city of the United Kingdom', 0.8723223209381104]

Example 2
['Panthera onca', 'species of big cat native to the Americas', 0.9407569766044617]
['Panthera onca', 'species of big cat native to the Americas', 0.8426408171653748]
['Panthera onca', 'species of big cat native to the Americas', 0.8692948818206787]

Example 3
['Morus nigra', 'species of plant', 0.9366182684898376]
['Rubus subg. Rubus', 'subgenus of plants', 0.9004514813423157]
['Morus nigra', 'species of plant', 0.8973059058189392]

Example 4
['BlackBerry', 'line of wireless handheld devices and services', 0.9404322504997253]
['BlackBerry', 'line of wireless handheld devices and services', 0.8622153997421265]
['BlackBerry', 'line of wireless handheld devices and services', 0.8931212425231934]

Example 5
['string',

In [5]:
# Link every example with each pooled document embedding, leaving link()'s
# `filter` argument at its default value.
# enumerate(start=1) numbers the examples without a hand-maintained counter.
for example_no, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {example_no}')
    # One printed result per embedding, in doc_pool_emb order.
    for doc_embedding in doc_pool_emb:
        linker = BasicFlairNED(doc_embedding)
        print(linker.link(mention, context, candidates=candidates))
    print('')

Example 1
['London', 'capital and largest city of the United Kingdom', 0.8435491323471069]
['London Recordings', 'record label headquartered in the United Kingdom', 0.776979386806488]
['London', 'capital and largest city of the United Kingdom', 0.7625173330307007]

Example 2
['SEPECAT Jaguar', 'attack aircraft family by SEPECAT', 0.8776717782020569]
['Panthera onca', 'species of big cat native to the Americas', 0.7397872805595398]
['Panthera onca', 'species of big cat native to the Americas', 0.7568628191947937]

Example 3
['Morus nigra', 'species of plant', 0.8856815099716187]
['Rubus subg. Rubus', 'subgenus of plants', 0.8794227242469788]
['Rubus subg. Rubus', 'subgenus of plants', 0.849000871181488]

Example 4
['BlackBerry', 'line of wireless handheld devices and services', 0.8608808517456055]
['BlackBerry', 'line of wireless handheld devices and services', 0.7795313596725464]
['BlackBerry', 'line of wireless handheld devices and services', 0.8076196312904358]

Example 5
['string', 

**LSTM vs pooled document embeddings**

GloVe embeddings give the best results with LSTM document embeddings.

In [6]:
# Link every example with each LSTM document embedding, leaving link()'s
# `filter` argument at its default value.
# enumerate(start=1) numbers the examples without a hand-maintained counter.
for example_no, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {example_no}')
    # One printed result per embedding, in doc_lstm_emb order.
    for doc_embedding in doc_lstm_emb:
        linker = BasicFlairNED(doc_embedding)
        print(linker.link(mention, context, candidates=candidates))
    print('')

Example 1
['London', 'capital and largest city of the United Kingdom', 0.35421788692474365]
['London Recordings', 'record label headquartered in the United Kingdom', 0.0831252858042717]
['London', 'capital and largest city of the United Kingdom', 0.2235107421875]

Example 2
['Panthera onca', 'species of big cat native to the Americas', 0.20707660913467407]
['Atari Jaguar', 'home video game console', 0.187770813703537]
['SEPECAT Jaguar', 'attack aircraft family by SEPECAT', 0.06644438952207565]

Example 3
['Rubus subg. Rubus', 'subgenus of plants', 0.15163472294807434]
['Morus nigra', 'species of plant', 0.24355703592300415]
['Rubus subg. Rubus', 'subgenus of plants', 0.12246812134981155]

Example 4
['BlackBerry', 'line of wireless handheld devices and services', 0.2196783423423767]
['Rubus subg. Rubus', 'subgenus of plants', 0.24233907461166382]
['BlackBerry', 'line of wireless handheld devices and services', 0.24189844727516174]

Example 5
['string', 'sequence of characters, data type