# SET-UP

In [55]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [1]:
def read_file(path):
    with open(path) as f:
        content = f.readlines()
    return content

In [2]:
! ls ../data

aida-yago2-dataset  emerging.test	     README.txt
apw_eng_201010.tsv  emerging.test.annotated  wnut17train.conll
apw_eng_201011.tsv  ner


In [3]:
wnut_path = "../data/wnut17train.conll"

In [4]:
test_path = "../data/emerging.test.annotated"

In [5]:
import pandas as pd
df = pd.read_csv("../data/wnut17train.conll",sep="\t")

In [6]:
import pandas as pd
import csv
df_test = pd.read_csv(test_path, header = None, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',sep="\t")

In [7]:
def get_labels(dataframe):
    labels = []
    for i,row in df.iterrows():
        if(row.O != 'O' and (not isinstance(row.O,float))):
            labels.append(row.O)
    labels = [x[2:len(x)] for x in labels]
    labels = set(labels)
    return labels

In [8]:
labels = get_labels(df)
labels

{'corporation', 'creative-work', 'group', 'location', 'person', 'product'}

In [9]:
def get_words(path):
    content = [x.replace("\t", " ") for x in read_file(path)]
    text = [x.split() for x in content]
    return text

In [10]:
text_train = get_words(wnut_path)
text_test = get_words(test_path)
datasets = [text_train,text_test]

In [11]:
text_test[0:10]

[['&', 'O'],
 ['gt', 'O'],
 [';', 'O'],
 ['*', 'O'],
 ['The', 'O'],
 ['soldier', 'O'],
 ['was', 'O'],
 ['killed', 'O'],
 ['when', 'O'],
 ['another', 'O']]

In [12]:
def squash(list_labels):
    res = []
    for i,x in enumerate(list_labels):
        j = 1
        if(i + j< len(list_labels) and x[1] == 'B' and list_labels[i  + j][1] == 'I'):
            while(i + j < len(list_labels) and list_labels[i + j][1] == 'I'):
                j += 1
            res.append((x[0],x[0] + j -1 ,x[2]))
        elif(x[1] == 'B'):
            res.append([x[0],x[2]])
    return res

In [13]:
def str_i_to_char_i(sent_label):
    res_labels = []
    sent = sent_label[0].split()
    labels = sent_label[1]
    
    for label in labels: 
        if(len(label) == 2):
            start_label_index = label[0]
            end_label_index = label[0]
            label_name = label[1]
        elif(len(label) == 3):
            start_label_index = label[0]
            end_label_index = label[1]
            label_name = label[2]
            


        start_char_index = sum([len(x) for x in sent[0:start_label_index]])    
        end_char_index = start_char_index + sum([len(x) for x in sent[start_label_index:end_label_index + 1]])
        res_labels.append((start_char_index + start_label_index,end_char_index + end_label_index,label_name))
        
            
    return [" ".join(sent), res_labels]

In [14]:
def to_spacy_format(sent_label): 
    sent = sent_label[0]
    labels = sent_label[1]
    res_dict = {'entities': labels}
    return [sent, res_dict]

In [15]:
import nltk
from nltk.tokenize import sent_tokenize 
def create_entities_char_level(text):
    res = []
    sentence = []
    sent_labels = []
    for word_entity_pair in text:
        if(len(word_entity_pair)!=0):
            sentence.append(word_entity_pair[0])
            sent_labels.append(word_entity_pair[1])
        else:
            sent_labels = [(i,x[0], x[2:len(x)]) for i,x in enumerate(sent_labels) if x != 'O' ]
            sent_labels = squash(sent_labels)
            res.append([" ".join(sentence),sent_labels])
            sent_labels = []
            sentence = []
        
    res = [str_i_to_char_i(x) for x in res]
    res = [to_spacy_format(x) for x in res]
    return res

In [16]:
TRAIN_DATA = create_entities_char_level(text_train)
EVAL_DATA = create_entities_char_level(text_test)

In [17]:
TRAIN_DATA[1]

['From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 5 http://tinyurl.com/24agj38',
 {'entities': [(22, 26, 'group')]}]

In [18]:
EVAL_DATA[0]

['& gt ; * The soldier was killed when another avalanche hit an army barracks in the northern area of Sonmarg , said a military spokesman .',
 {'entities': [(100, 107, 'location')]}]

In [26]:
from collections import defaultdict 
dict_labels = defaultdict(int)
for x in EVAL_DATA:
    entitites = x[1]['entities']
    for y in entitites: 
        label = y[2]
        dict_labels[label] += 1
dict_labels

defaultdict(int,
            {'location': 150,
             'group': 165,
             'person': 429,
             'creative-work': 142,
             'corporation': 66,
             'product': 127})

In [19]:
def compounding(min_batch_size,max_batch_size,len_data):
    factor = 1.001 
    sizes = []
    sizes.append(min_batch_size)
    x = len_data - min_batch_size
    batch_size = min_batch_size
    while x > 0:
        batch_size = batch_size * factor 
        rounded_bs = int(round(batch_size,0))
        x -= rounded_bs
        sizes.append(rounded_bs)
    return sizes
        

In [20]:
def create_batch(train_data):
    if(len(train_data) == 1):
        return train_data
    new_batch = []
    current_size = len(train_data[0][0]) + 1
    new_batch_text = train_data[0][0] + " "
    new_annotations = train_data[0][1]['entities'].copy()
    for i,data in enumerate(train_data):
        text =  data[0]
        annotations = data[1]
        if (i!=0):
            new_batch_text += text + " "
            annotations_batch = [(x[0] + current_size, x[1]+current_size, x[2]) 
                           for x in annotations['entities']]
            new_annotations.extend(annotations_batch)
            annotations_batch = []
            current_size += len(text) + 1
            
            
    new_batch.append(new_batch_text)
    new_batch.append(new_annotations)
    return new_batch
            
            
            

In [21]:
def minibatch(train_data, size):
    batches = []
    index = 0
    for batch_size in size: 
        batch = create_batch(train_data[index:index+batch_size])
        batches.append(batch)
        index += batch_size
    return batches

In [22]:

def get_batches(train_data, model_type):
    max_batch_sizes = {"tagger": 32, "parser": 16, "ner": 5, "textcat": 64}
    max_batch_size = max_batch_sizes[model_type]
    if len(train_data) < 500:
        max_batch_size /= 2
    batch_size = compounding(1, max_batch_size, len(train_data))
    batches = minibatch(train_data, size=batch_size)
    return batches

### TRAIN

In [26]:
import random
import spacy
model = None
n_iter=1

if model is not None:
    nlp1 = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp1 = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

#create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp1.pipe_names:
    print("Add ner pipe")
    ner = nlp1.create_pipe('ner')
    nlp1.add_pipe(ner, last=True)
# otherwise, get it so we can add labels

else:
    ner = nlp1.get_pipe('ner')


Created blank 'en' model
Add ner pipe


In [27]:
# add labels, Trains data based on annotations 
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [28]:
def getTime(start,end):
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    time_since_start = "Time:  {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
    return time_since_start

In [29]:
def log(start,i,len_):
    tenth = int(len_/20)
    if(i % tenth == 0):
        percent = int(round((i/len_*10),0))
        time_ = getTime(start,time.time())
        print("0%" + "=" *percent + str(percent*10) + "%, " + time_, end="\r" )


In [30]:
from tqdm import tqdm
from spacy.util import decaying
import time
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp1.pipe_names if pipe != 'ner']
dropout = decaying(0.6, 0.2, 1e-4)

with nlp1.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp1.begin_training()
    optimizer.alpha = 0.0001
    losses = {}
    for itn in range(n_iter):
        
        random.shuffle(TRAIN_DATA)
        drop = next(dropout)
        batches  = [x[0] for x  in get_batches(TRAIN_DATA, "ner") if len(x) == 1]
        start = time.time()
        for i,data in enumerate(TRAIN_DATA):
            log(start,i,len(TRAIN_DATA))
            text, annotations = data
            nlp1.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout 
                sgd=optimizer,  # callable to update weights
                losses=losses)
            
        print(losses)

{'ner': 6387.123207599274}0:01:45.59


# EVALUATION 

In [31]:
# test the trained model
for text,entities in EVAL_DATA:
    doc = nlp1(text)
    print(entities.values())
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

dict_values([[(100, 107, 'location')]])
Entities []
dict_values([[(54, 67, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(77, 99, 'group'), (102, 105, 'group')]])
Entities [('Avalanche Rescue Teams (', 'location')]
dict_values([[(161, 181, 'person')]])
Entities []
dict_values([[(33, 45, 'location')]])
Entities []
dict_values([[(11, 14, 'group'), (50, 62, 'location')]])
Entities []
dict_values([[(11, 14, 'group')]])
Entities []
dict_values([[(164, 176, 'person')]])
Entities []
dict_values([[(6, 30, 'creative-work')]])
Entities []
dict_values([[(83, 92, 'location')]])
Entities []
dict_values([[(99, 103, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(33, 41, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(4, 8, 'group')]])
Entities []
dict_values([[(19, 23, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(4, 45, 'group'), (48, 52, 'group')]

dict_values([[(15, 20, 'person'), (36, 43, 'person')]])
Entities []
dict_values([[(47, 56, 'location'), (60, 67, 'location')]])
Entities []
dict_values([[(21, 24, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(16, 22, 'corporation')]])
Entities []
dict_values([[(23, 29, 'corporation')]])
Entities []
dict_values([[(19, 28, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(32, 35, 'group')]])
Entities []
dict_values([[(16, 32, 'person'), (40, 42, 'group')]])
Entities []
dict_values([[(7, 11, 'corporation')]])
Entities []
dict_values([[(82, 87, 'person')]])
Entities []
dict_values([[(60, 71, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_

dict_values([[(193, 200, 'product'), (265, 272, 'product'), (335, 344, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(70, 78, 'product')]])
Entities []
dict_values([[(10, 25, 'creative-work'), (35, 43, 'product')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(6, 9, 'product')]])
Entities []
dict_values([[(15, 26, 'product'), (36, 46, 'product'), (68, 72, 'product'), (123, 134, 'product'), (201, 205, 'product')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(9, 15, 'person'), (23, 29, 'person'), (35, 41, 'person')]])
Entities []
dict_values([[(57, 74, 'corporation')]])
Entities []
dict_values([[(30, 43, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(17, 32, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(78, 81, 'creative-work'), (84, 97, 'crea

dict_values([[(36, 41, 'corporation')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(42, 66, 'group')]])
Entities []
dict_values([[(4, 12, 'location'), (39, 51, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 14, 'creative-work'), (37, 55, 'creative-work')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(0, 9, 'person')]])
Entities [('Minkowski Space Integral - Concerns about Wick Rotations', 'corporation')]
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(25, 35, 'creative-work')]])
Entities []
dict_values([[(27, 42, 'product')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(0, 4, 'group'), (46, 55, 'group')]])
Entities []
dict_values(

dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(14, 23, 'creative-work'), (27, 40, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(0, 15, 'creative-work')]])
Entities []
dict_values([[(36, 48, 'person'), (51, 60, 'location')]])
Entities []
dict_values([[(5, 13, 'person'), (38, 45, 'corporation')]])
Entities []
dict_values([[(10, 15, 'person'), (45, 50, 'person'), (107, 113, 'location')]])
Entities []
dict_values([[(0, 52, 'creative-work')]])
Entities []
dict_values([[(20, 25, 'person'), (91, 109, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(52, 76, 'creative-work'), (79, 90, 'person'), (97, 104, 'creative-work'), (110, 

dict_values([[(23, 40, 'group')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(2, 5, 'corporation')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 10, 'person'), (54, 64, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(16, 18, 'person'), (113, 133, 'product')]])
Entities []
dict_values([[(5, 20, 'person'), (53, 61, 'creative-work'), (66, 70, 'corporation'), (73, 86, 'person')]])
Entities []
dict_values([[(65, 86, 'creative-work')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(43, 50, 'location')]])
Entities []
dict_values([[(2, 7, 'person'), (62, 67, 'person')]])
Entities []
dict_values([[(55, 70, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(15, 23, 'product')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 14, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(12, 35, 'creative-work'), (43, 57, 'person'), (61, 66, 'person')]])
Entities []
dict_values([[(18, 25, 'p

dict_values([[(0, 16, 'creative-work')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 10, 'person'), (13, 18, 'person')]])
Entities []
dict_values([[(5, 19, 'person')]])
Entities []
dict_values([[(45, 55, 'person')]])
Entities []
dict_values([[(25, 32, 'person')]])
Entities []
dict_values([[]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 13, 'group'), (24, 37, 'group'), (42, 54, 'group'), (61, 75, 'group'), (88, 92, 'location')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(18, 32, 'person'), (35, 45, 'creative-work')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(40, 52, 'person'), (59, 66, 'person')]])
Entities []
dict_values([[(32, 42, 'location'), (45, 53, 'creative-work')]])
Entities []
dict_values([[]])
Entities []
dict_values([[(5, 16, 'creative-work')]])
Entities []
dict_values([[(2, 11, 'corporation')]])
Entities []
dict_values([[(18, 22, 'person'), (38, 48, 'g

In [32]:
import stanza
from spacy_stanza import StanzaLanguage

snlp = stanza.Pipeline(lang="en",processors='tokenize,ner')
nlp1 = StanzaLanguage(snlp)


2020-05-27 16:48:43 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| ner       | ontonotes |

2020-05-27 16:48:44 INFO: Use device: cpu
2020-05-27 16:48:44 INFO: Loading: tokenize
2020-05-27 16:48:44 INFO: Loading: ner
2020-05-27 16:48:45 INFO: Done loading processors!


In [33]:
def filter_entities(entities):
    ent = entities['entities']
    new_ents = [x for x in ent if x[2] in ['location','person','group','creative-work'
                                           ,'product']]
    entities['entities'] = new_ents
    return entities

In [34]:
def stanf_to_conLL(label):
    if (label == 'GPE' or label=='FAC'):
        return 'location'
    elif(label=='PERSON'):
        return 'person'
    elif(label=='PRODUCT'):
        return 'product'    
    elif(label=='ORG' or label=='NORP'):
        return 'group'
    elif(label=='WORK_OF_ART'):
        return ' creative-work'
 

In [35]:
fp_global = 0 
fn_global = 0 
tp_global = 0 
stanford  = True
accepted_ents = ['GPE', 'PERSON','ORG','FAC','WORK_OF_ART','NORP','PRODUCT']
for text,entities in EVAL_DATA[0:1]:
    ent_as_list = list(entities.values())[0]
    if(stanford):
        entities = filter_entities(entities)
        #correct_entities = [(x[0],x[1],stanf_to_conLL(x[2])) for x in ent_as_list]
    correct_entities = ent_as_list
    fp = 0 
    fn = 0 
    tp = 0 
    print('correct entities')
    print(correct_entities)
    correct_text = [text[y[0]:y[1]] for y in correct_entities]
    print(correct_text)
    doc = nlp1(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    predicted_entities = [x for x in predicted_entities if x[1] in accepted_ents]
    correctly_predicted = 0 
    for i,data in enumerate(predicted_entities): 
        predicted_text, predicted_label = data 
        #predicted_label = stanf_to_conLL(predicted_label)
        print(predicted_text)
        print(predicted_label)
        predicted_label = stanf_to_conLL(predicted_label)
        if(predicted_text in correct_text):
            index_label = correct_text.index(predicted_text)
            if(predicted_label == correct_entities[index_label][2]):
                        tp += 1
                        correctly_predicted += 1 
        else: 
            fp += 1
            
    fn += (len(correct_text) - correctly_predicted)
    
    fp_global += fp
    fn_global += fn
    tp_global += tp 
    
    print("FN: " + str(fn) + " FP: " + str(fp) + " TP: " + str(tp))
    print("FP_GLOBAL: " + str(fp_global))  
    print("FN_GLOBAL: " + str(fn_global))  
    print("TP_GLOBAL: " + str(tp_global))  
    


correct entities
[(100, 107, 'location')]
['Sonmarg']
Sonmarg
GPE
FN: 0 FP: 0 TP: 1
FP_GLOBAL: 0
FN_GLOBAL: 0
TP_GLOBAL: 1


In [36]:
precision = tp_global/(tp_global + fp_global) 
recall= tp_global/ (tp_global + fn_global)
f1_score = 2* ((precision*recall)/(precision+recall))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1_score))


Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# FLAIR

In [73]:


# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner-ontonotes')

# run NER over sentence
tagger.predict(sentence)

2020-05-27 12:37:38,930 loading file /home/c3dric/.flair/models/en-ner-ontonotes-v0.4.pt


[Sentence: "I love Berlin ."   [− Tokens: 4  − Token-Labels: "I love Berlin <S-GPE> ."]]

In [74]:
print(sentence)
print('The following NER tags are found:')

# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)

Sentence: "I love Berlin ."   [− Tokens: 4  − Token-Labels: "I love Berlin <S-GPE> ."]
The following NER tags are found:
Span [3]: "Berlin"   [− Labels: GPE (0.9763)]


In [75]:
%%time
sentence = Sentence(EVAL_DATA[1][0])
tagger.predict(sentence)

CPU times: user 3.91 s, sys: 20.8 ms, total: 3.93 s
Wall time: 2.18 s


[Sentence: "& gt ; * Police last week evacuated 80 villagers from Waltengoo Nar where dozens were killed after a series of avalanches hit the area in 2005 in the south of the territory ."   [− Tokens: 34  − Token-Labels: "& gt ; * Police last <B-DATE> week <E-DATE> evacuated 80 <S-CARDINAL> villagers from Waltengoo <B-GPE> Nar <E-GPE> where dozens <S-CARDINAL> were killed after a series of avalanches hit the area in 2005 <S-DATE> in the south of the territory ."]]

In [78]:
sentence = Sentence(EVAL_DATA[1][0])

dict_

{'text': '& gt ; * Police last week evacuated 80 villagers from Waltengoo Nar where dozens were killed after a series of avalanches hit the area in 2005 in the south of the territory .',
 'labels': [],
 'entities': [{'text': 'last week',
   'start_pos': 16,
   'end_pos': 25,
   'labels': [DATE (0.8501)]},
  {'text': '80',
   'start_pos': 36,
   'end_pos': 38,
   'labels': [CARDINAL (0.9999)]},
  {'text': 'Waltengoo Nar',
   'start_pos': 54,
   'end_pos': 67,
   'labels': [GPE (0.9608)]},
  {'text': 'dozens',
   'start_pos': 74,
   'end_pos': 80,
   'labels': [CARDINAL (0.9933)]},
  {'text': '2005',
   'start_pos': 138,
   'end_pos': 142,
   'labels': [DATE (0.9995)]}]}

In [115]:
flair_to_spacy(dict_)

GPE


[(33, 38, 'location')]

In [98]:
def flair_to_spacy(dict_):
    res = []
    entities = dict_['entities']
    for ent in entities:
        text = ent['text']
        start_pos = ent['start_pos']
        end_pos = ent['end_pos']        
        label = ent['labels'][0].value
        if (label in accepted_ents):
            label = stanf_to_conLL(label)
            res.append((start_pos,end_pos,label))
    return res

In [109]:
tagger = SequenceTagger.load('ner-ontonotes')


2020-05-27 13:01:13,844 loading file /home/c3dric/.flair/models/en-ner-ontonotes-v0.4.pt


In [31]:
from tqdm.notebook import tqdm



In [134]:
%%time
from flair.data import Sentence
from flair.models import SequenceTagger
fp_global = 0 
fn_global = 0 
tp_global = 0 
stanford  = False
flair = True
accepted_ents = ['GPE', 'PERSON','ORG','FAC','WORK_OF_ART','NORP','PRODUCT']
for text,entities in tqdm(EVAL_DATA):
    fp = 0 
    fn = 0 
    tp = 0 

    entities = filter_entities(entities)
    sentence = Sentence(text)
    tagger.predict(sentence)
    dict_ = (sentence.to_dict(tag_type='ner'))
    predicted_entities = flair_to_spacy(dict_)
        
    ent_as_list = list(entities.values())[0]
    correct_entities = ent_as_list
    correct_text = [text[y[0]:y[1]] for y in correct_entities]
    correctly_predicted = 0 
    correct_entities = set(correct_entities)
    predicted_entities = set(predicted_entities)
    tp = len(correct_entities.intersection(predicted_entities))
    fp = len(predicted_entities - correct_entities)
    fn = len(correct_entities - predicted_entities)
    
    fp_global += fp
    fn_global += fn
    tp_global += tp 
    
    print("FN: " + str(fn_global) + " FP: " + str(fp_global) + " TP: " + str(tp_global), end="\r" )


HBox(children=(FloatProgress(value=0.0, max=1287.0), HTML(value='')))

FN: 706 FP: 296 TP: 307
CPU times: user 57min 32s, sys: 11.7 s, total: 57min 44s
Wall time: 31min 2s


# TagMe

In [33]:
import tagme




In [34]:
tagme.GCUBE_TOKEN = "edcf25a7-c492-49b8-b80e-bbee015fc687-843339462"


In [25]:
%%time
lunch_annotations = tagme.annotate("My favourite meal is Mexican burritos.")


CPU times: user 14.8 ms, sys: 8.42 ms, total: 23.2 ms
Wall time: 843 ms


In [26]:
for ann in lunch_annotations.get_annotations(0.1):
    print(ann)

meal -> Meal (score: 0.2014230340719223)
Mexican -> Mexican cuisine (score: 0.36614900827407837)
burritos -> Burrito (score: 0.28607892990112305)


In [27]:
%%time
tomatoes_mentions = tagme.mentions("I definitely like ice cream better than tomatoes.")

for mention in tomatoes_mentions.mentions:
    print(mention)

ice cream [18,27] lp=0.18749085068702698
tomatoes [40,48] lp=0.004235605709254742
CPU times: user 18.5 ms, sys: 0 ns, total: 18.5 ms
Wall time: 408 ms


In [28]:
# Get relatedness between a pair of entities specified by title.
rels = tagme.relatedness_title(("Barack Obama", "Donald Trump"))
print("Obama and italy have a semantic relation of", rels.relatedness[0].rel)

Obama and italy have a semantic relation of 0.53215491771698


In [32]:
for text,entities in tqdm(EVAL_DATA[0:10]):
    ent_as_list = list(entities.values())[0]
    correct_entities = ent_as_list
    correct_text = [text[y[0]:y[1]] for y in correct_entities]
    lunch_annotations = tagme.annotate(text)
    print(text)
    print(correct_text)
    for ann in lunch_annotations.get_annotations(0.1):
        print(ann)
    print("-----------------")

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

& gt ; * The soldier was killed when another avalanche hit an army barracks in the northern area of Sonmarg , said a military spokesman .
['Sonmarg']
gt -> Tonne (score: 0.11385387182235718)
soldier -> United States Army (score: 0.12458904832601547)
avalanche -> Allied invasion of Italy (score: 0.23905113339424133)
army -> United States Army (score: 0.1549888253211975)
army barracks -> Barracks (score: 0.23123230040073395)
barracks -> Barracks (score: 0.2525090277194977)
Sonmarg -> Sonamarg (score: 0.5)
-----------------
& gt ; * Police last week evacuated 80 villagers from Waltengoo Nar where dozens were killed after a series of avalanches hit the area in 2005 in the south of the territory .
['Waltengoo Nar']
series of avalanches -> 2010 Salang avalanches (score: 0.25023046135902405)
avalanches -> Avalanche (score: 0.2725068926811218)
-----------------
& gt ; * The army on Thursday recovered the bodies of ten of its men who were killed in an avalanche the previous day .
[]
Thursday ->

In [None]:
%%time
tomatoes_mentions = tagme.mentions("I definitely like ice cream better than tomatoes.")

for mention in tomatoes_mentions.mentions:
    print(mention)

In [36]:
for text,entities in tqdm(EVAL_DATA[0:10]):
    ent_as_list = list(entities.values())[0]
    correct_entities = ent_as_list
    correct_text = [text[y[0]:y[1]] for y in correct_entities]
    lunch_annotations = tagme.annotate(text)
    print(text)
    print(correct_text)
    tomatoes_mentions = tagme.mentions(text)
    print(tomatoes_mentions)
    for mention in tomatoes_mentions.mentions:
        print(mention)
    print("---------------")

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

& gt ; * The soldier was killed when another avalanche hit an army barracks in the northern area of Sonmarg , said a military spokesman .
['Sonmarg']
2msec, 12 mentions
gt [2,4] lp=0.034230880439281464
soldier [13,20] lp=0.03945669159293175
killed [25,31] lp=0.002261004177853465
avalanche [45,54] lp=0.21192322671413422
hit [55,58] lp=0.01779121533036232
army [62,66] lp=0.029292907565832138
army barracks [62,75] lp=0.023383768275380135
barracks [67,75] lp=0.06593720614910126
northern area [83,96] lp=0.011247443966567516
Sonmarg [100,107] lp=1.0
military [117,125] lp=0.022062312811613083
spokesman [126,135] lp=0.014206210151314735
---------------
& gt ; * Police last week evacuated 80 villagers from Waltengoo Nar where dozens were killed after a series of avalanches hit the area in 2005 in the south of the territory .
['Waltengoo Nar']
2msec, 16 mentions
gt [2,4] lp=0.034230880439281464
Police [9,15] lp=0.03883015364408493
last [16,20] lp=0.002006721217185259
last week [16,25] lp=0.00160