In [1]:
from IPython.display import display
from pprint import pprint
from flair.embeddings import BytePairEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.embeddings import DocumentPoolEmbeddings, DocumentLSTMEmbeddings

from src.ned import BasicFlairNED
from src.utils import WikiDataSearch

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Each entry starts out as [mention, context sentence]; the loop below appends
# a third element holding the candidate entities looked up for the mention.
examples = [
    ["London", "Crime is on the rise in London, which is of grave concern to the regional government in Ontario"],
    ["jaguar", "the prey saw the jaguar cross the jungle"],
    ["blackberry", "The blackberry is an edible fruit produced by many species in the Rubus genus in the Rosaceae family, hybrids among these species within the Rubus subgenus, and hybrids between the Rubus and Idaeobatus subgenera"],
    ["BlackBerry", "During the second financial quarter of 2013, BlackBerry sold 6.8 million handsets, but was eclipsed by the sales of competitor Nokia's Lumia model for the first time"],
    ["string", "You can manipulate strings in Python"],
    ["python", "A python is typically not venomous"],
    ["queen", "In England, the queen still has a lot of power"],
]

for example in examples:
    mention = example[0]
    # Reduce every search hit (requested with num_results=5) to a
    # (label, description) pair.
    candidates = [
        (hit['label'], hit['description'])
        for hit in WikiDataSearch(mention, num_results=5)
    ]
    # Mutates the entry in place so it becomes [mention, context, candidates].
    example.append(candidates)

In [3]:
# Word-level embeddings: GloVe, byte-pair, and a stack combining the two.
glove = WordEmbeddings('en-glove')
byte_pair = BytePairEmbeddings('en', dim=300)
stacked = StackedEmbeddings([glove, byte_pair])
word_embs = [glove, byte_pair, stacked]

# Document-level embeddings: one pooled and one LSTM wrapper per word embedding,
# kept in the same order as `word_embs`.
# NOTE(review): no random seed is set anywhere in view; if these wrappers hold
# randomly initialised layers the scores will vary between runs -- TODO confirm.
doc_pool_emb = [
    DocumentPoolEmbeddings([word_emb], fine_tune_mode='nonlinear')
    for word_emb in word_embs
]
doc_lstm_emb = [
    DocumentLSTMEmbeddings([word_emb])
    for word_emb in word_embs
]

**How does filtering stopwords affect the accuracy?**

Filtering yields higher accuracy.

In [4]:
# 4.5s
# Run every example through each pooled document embedding with `filter=False`
# (presumably disabling the stop-word filtering named in the section heading).
for i, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {i}')
    for doc_embedding in doc_pool_emb:
        linker = BasicFlairNED(doc_embedding)
        prediction = linker.link(mention, context, candidates=candidates, filter=False)
        print(prediction)
    print('')

Example 1
['London', 'capital and largest city of the United Kingdom', 0.9563388228416443]
['London', 'capital and largest city of the United Kingdom', 0.8743845224380493]
['London', 'capital and largest city of the United Kingdom', 0.9134705662727356]

Example 2
['Jaguar', 'species of big cat native to the Americas', 0.9432498216629028]
['Jaguar', 'species of big cat native to the Americas', 0.813769519329071]
['Jaguar', 'species of big cat native to the Americas', 0.8740254044532776]

Example 3
['Morus nigra', 'species of plant', 0.9250785708427429]
['Rubus subg. Rubus', 'subgenus of plants', 0.8898826837539673]
['Morus nigra', 'species of plant', 0.9005131721496582]

Example 4
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.932644248008728]
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.8701114654541016]
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.9008198380470276]

Example 

In [5]:
# Same comparison over the pooled document embeddings, this time leaving
# `filter` at the default of `BasicFlairNED.link`.
for i, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {i}')
    for doc_embedding in doc_pool_emb:
        linker = BasicFlairNED(doc_embedding)
        prediction = linker.link(mention, context, candidates=candidates)
        print(prediction)
    print('')

Example 1
['London', 'capital and largest city of the United Kingdom', 0.8947444558143616]
['London, Ontario', 'city in Southwestern Ontario, Canada', 0.8091950416564941]
['London', 'capital and largest city of the United Kingdom', 0.8124164938926697]

Example 2
['Jaguar', 'species of big cat native to the Americas', 0.866437554359436]
['Jaguar', 'species of big cat native to the Americas', 0.708823561668396]
['Jaguar', 'species of big cat native to the Americas', 0.7738799452781677]

Example 3
['Rubus subg. Rubus', 'subgenus of plants', 0.876771092414856]
['Rubus subg. Rubus', 'subgenus of plants', 0.8768600225448608]
['Rubus subg. Rubus', 'subgenus of plants', 0.8649259805679321]

Example 4
['BlackBerry', 'line of wireless handheld devices and services', 0.8552094101905823]
['BlackBerry', 'line of wireless handheld devices and services', 0.7980839014053345]
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.8125943541526794]

Example 5
['String (music)

**LSTM vs pooled document embeddings**

GloVe embeddings work best with LSTM document embeddings.

In [6]:
# Link every example with each LSTM document embedding, leaving `filter` at
# the default of `BasicFlairNED.link`.
for i, (mention, context, candidates) in enumerate(examples, start=1):
    print(f'Example {i}')
    for doc_embedding in doc_lstm_emb:
        linker = BasicFlairNED(doc_embedding)
        prediction = linker.link(mention, context, candidates=candidates)
        print(prediction)
    print('')

Example 1
['London, Kentucky', 'city in Kentucky, United States', 0.39464110136032104]
['London, Ontario', 'city in Southwestern Ontario, Canada', 0.39206093549728394]
['London Recordings', 'record label headquartered in the United Kingdom', 0.12494296580553055]

Example 2
['Jaguar', 'species of big cat native to the Americas', 0.2696046233177185]
['Jaguar', 'species of big cat native to the Americas', 0.11995638161897659]
['SEPECAT Jaguar', 'attack aircraft family by SEPECAT', 0.07468104362487793]

Example 3
['Morus nigra', 'species of plant', 0.14807263016700745]
['Morus nigra', 'species of plant', 0.2621464431285858]
['Morus nigra', 'species of plant', 0.1737596094608307]

Example 4
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.3491867184638977]
['Morus nigra', 'species of plant', 0.1866304576396942]
['BlackBerry Limited', 'enterprise software and the Internet of things company', 0.23131796717643738]

Example 5
['String (structure)', 'long, flexi