# Load Data

In [None]:
! mkdir ../data
! cd ../data; wget https://raw.githubusercontent.com/leondz/emerging_entities_17/master/wnut17train.conll
! cd ../data; wget https://raw.githubusercontent.com/leondz/emerging_entities_17/master/emerging.test.annotated    

In [2]:
wnut_path = "../data/wnut17train.conll"

In [3]:
test_path = "../data/emerging.test.annotated"

In [5]:
import pandas as pd
import csv
df = pd.read_csv(wnut_path,sep="\t")
df_test = pd.read_csv(test_path, header = None, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',sep="\t")

In [6]:
labels

{'corporation', 'creative-work', 'group', 'location', 'person', 'product'}

## Utility functions to construct dataset

In [117]:
import nltk
from nltk.tokenize import sent_tokenize 

def get_labels(dataframe):
    """Get all labels from a dataframe, is only applicable to the wnut2017 dataset"""
    labels = []
    for i,row in df.iterrows():
        if(row.O != 'O' and (not isinstance(row.O,float))):
            labels.append(row.O)
    labels = [x[2:len(x)] for x in labels]
    labels = set(labels)
    return labels

def read_file(path):
    """Reads a file line by line and returns a list where each entry corresponds to a line in the file
    
    Parameters
    ----------
    path: path of the file to read
    """
    with open(path) as f:
        content = f.readlines()
    return content

def read_file_and_split_lines(path):
    """
    Reads a file per line at a given path, replaces '\t' with " " and converts each line from 
    string to a list of words
    
    Parameters
    ----------
    path: path of the file to read
    """
    content = [x.replace("\t", " ") for x in read_file(path)]
    text = [x.split() for x in content]
    return text

def wnut_17_to_spacy_format(list_labels):
    """
    The Wnut-017 dataset is in B-I format we need to combine those entries to one single entry. 
    This function will do exactly this. 
    Example: 
    (14, 'B', 'location'), (15, 'I', 'location'), (16, 'I', 'location'), (18, 'B', 'location')
    Will be transformed to (14,16,location),(18,location)
    
    Parameters
    ----------
    list_labels: list of labels to transform
    """
    res = []
    for i,x in enumerate(list_labels):
        j = 1
        if(i + j< len(list_labels) and x[1] == 'B' and list_labels[i  + j][1] == 'I'):
            while(i + j < len(list_labels) and list_labels[i + j][1] == 'I'):
                j += 1
            res.append((x[0],x[0] + j -1 ,x[2]))
        elif(x[1] == 'B'):
            res.append([x[0],x[2]])
    return res

def str_i_to_char_i(sent_label):
    print(sent_label)
    """ Takes as arguments a sentence and a list of entities whith the corresponding indexes at word level, 
    and returns the same sentence combined with the indexes of the entities at character level. 
    
    Parameters
    ----------
    sent_label: sentence with entities at word level 
    
    """
    res_labels = []
    sent = sent_label[0].split()
    labels = sent_label[1]
    
    for label in labels: 
        if(len(label) == 2):
            start_label_index = label[0]
            end_label_index = label[0]
            label_name = label[1]
        elif(len(label) == 3):
            start_label_index = label[0]
            end_label_index = label[1]
            label_name = label[2]
        

        start_char_index = sum([len(x) for x in sent[0:start_label_index]])    
        end_char_index = start_char_index + sum([len(x) for x in sent[start_label_index:end_label_index + 1]])
        res_labels.append((start_char_index + start_label_index,end_char_index + end_label_index,label_name))
        
    res = [" ".join(sent), res_labels]
    print(res)
    return 

def to_spacy_format(sent_label): 
    """Takes as arguments a sentence and a list of entities, creates a dict with a single entry: the entities.
    returns a list of the sentence and the created"""
    sent = sent_label[0]
    labels = sent_label[1]
    res_dict = {'entities': labels}
    return [sent, res_dict]

def create_entities_char_level(text):
    res = []
    sentence = []
    sent_labels = []
    for word_entity_pair in text:
        if(len(word_entity_pair)!=0):
            sentence.append(word_entity_pair[0])
            sent_labels.append(word_entity_pair[1])
        else:
            sent_labels = [(i,x[0], x[2:len(x)]) for i,x in enumerate(sent_labels) if x != 'O' ]
            sent_labels = wnut_17_to_spacy_format(sent_labels)
            res.append([" ".join(sentence),sent_labels])
            sent_labels = []
            sentence = []
        
    res = [str_i_to_char_i(x) for x in res]
    res = [to_spacy_format(x) for x in res]
    return res

Show which entitities we are going to predict

In [None]:
labels = get_labels(df)
labels

Stats about the dataset

In [14]:
from collections import defaultdict 
dict_labels = defaultdict(int)
for x in EVAL_DATA:
    entitites = x[1]['entities']
    for y in entitites: 
        label = y[2]
        dict_labels[label] += 1
dict_labels

defaultdict(int,
            {'location': 150,
             'group': 165,
             'person': 429,
             'creative-work': 142,
             'corporation': 66,
             'product': 127})

In [114]:
text_train = read_file_and_split_lines(wnut_path)
text_test = read_file_and_split_lines(test_path)

In [118]:
TRAIN_DATA = create_entities_char_level(text_train)
EVAL_DATA = create_entities_char_level(text_test)

["@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .", [(14, 16, 'location'), [18, 'location']]]
["@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .", [(64, 85, 'location'), (88, 91, 'location')]]
['From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 5 http://tinyurl.com/24agj38', [[4, 'group']]]
['From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 5 http://tinyurl.com/24agj38', [(22, 26, 'group')]]
['Pxleyes Top 50 Photography Contest Pictures of August 2010 ... http://bit.ly/bgCyZ0 #photography', [[0, 'corporation']]]
['Pxleyes Top 50 Photography Contest Pictures of August 2010 ... http://bit.ly/bgCyZ0 #photography', [(0, 7, 'corporation')]]
['today is my last day at the office .', []]
['today is my last day at the office .', []]
["4Dbling 's place til monday , party party party . &lt; 3", [[0

['I liked a YouTube video -- Testing Tomorrow http://youtu.be/aOYInsel7Qw?a', [[3, 'corporation'], (6, 7, 'group')]]
['I liked a YouTube video -- Testing Tomorrow http://youtu.be/aOYInsel7Qw?a', [(10, 17, 'corporation'), (27, 43, 'group')]]
["I love to see a child 's face when they think of the fairies . Like a fairy their little faces light up :)", []]
["I love to see a child 's face when they think of the fairies . Like a fairy their little faces light up :)", []]
["The forced ads in ngmoco games are outstandingly annoying ! Looks like it 's time to delete all their apps from my iOS devices !", [[4, 'corporation'], [22, 'product']]]
["The forced ads in ngmoco games are outstandingly annoying ! Looks like it 's time to delete all their apps from my iOS devices !", [(18, 24, 'corporation'), (115, 118, 'product')]]
['RT @LightCMS : Day 6 : Win 1 of 10 iPads from @LightCMS - RT to enter or read more http://speaklight.com/iPad', []]
['RT @LightCMS : Day 6 : Win 1 of 10 iPads from @LightCM

['Seba Lecompte is playing @ #Kerk #Ghent #Belgium , Sat 11 Apr 2015 #gigs', [(0, 1, 'person')]]
['Seba Lecompte is playing @ #Kerk #Ghent #Belgium , Sat 11 Apr 2015 #gigs', [(0, 13, 'person')]]
['RT @JUCOFFrenzy : JUCO RB @RealSteel223526 was named 1st Team Offense @njcaa All American http://t.co/Ep9U2tsnS3', []]
['RT @JUCOFFrenzy : JUCO RB @RealSteel223526 was named 1st Team Offense @njcaa All American http://t.co/Ep9U2tsnS3', []]
['Can someone come over tomorrow and make Christmas cookies with me ?', []]
['Can someone come over tomorrow and make Christmas cookies with me ?', []]
["Your fiscal condition may be on the upswing , but don't let thi ... More for Scorpio http://t.co/z0qLFBXQnL", []]
["Your fiscal condition may be on the upswing , but don't let thi ... More for Scorpio http://t.co/z0qLFBXQnL", []]
['Strongly not looking forward to cleaning tomorrow', []]
['Strongly not looking forward to cleaning tomorrow', []]
["the 23rd of january 2014 was the worst thing ever &amp; look 

TypeError: 'NoneType' object is not subscriptable

# TRAIN

In [15]:
import random
import spacy
model = None
n_iter=30

if model is not None:
    nlp1 = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp1 = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

#create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp1.pipe_names:
    print("Add ner pipe")
    ner = nlp1.create_pipe('ner')
    nlp1.add_pipe(ner, last=True)
# otherwise, get it so we can add labels

else:
    ner = nlp1.get_pipe('ner')


Created blank 'en' model
Add ner pipe


In [16]:
# add labels, Trains data based on annotations 
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [17]:
def getTime(start,end):
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    time_since_start = "Time:  {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
    return time_since_start

In [18]:
def log(start,i,len_):
    tenth = int(len_/20)
    if(i % tenth == 0):
        percent = int(round((i/len_*10),0))
        time_ = getTime(start,time.time())
        print("0%" + "=" *percent + str(percent*10) + "%, " + time_, end="\r" )

In [50]:
from tqdm import tqdm
from spacy.util import decaying
import time
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp1.pipe_names if pipe != 'ner']
dropout = decaying(0.6, 0.2, 1e-4)
n_iter=4

with nlp1.disable_pipes(*other_pipes):  # only train NER
    #optimizer = nlp1.begin_training()
    optimizer.alpha = 0.0001
    losses = {}
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        drop = next(dropout)
        start = time.time()
        for i,data in enumerate(TRAIN_DATA):
            log(start,i,len(TRAIN_DATA))
            text, annotations = data
            nlp1.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout 
                sgd=optimizer,  # callable to update weights
                losses=losses)
            
        print(losses)

{'ner': 4030.179368534605}0:01:58.74
{'ner': 8090.263954239019}0:01:56.44
{'ner': 12081.728638740553}:01:53.49
{'ner': 16083.479693405628}:02:08.47


# EVALUATION 

In [20]:
def stanf_to_conLL(label):
    if (label == 'GPE' or label=='FAC'):
        return 'location'
    elif(label=='PERSON'):
        return 'person'
    elif(label=='PRODUCT'):
        return 'product'    
    elif(label=='ORG' or label=='NORP'):
        return 'group'
    elif(label=='WORK_OF_ART'):
        return ' creative-work'

In [21]:
def stanf_to_conLL(label):
    if (label == 'GPE' or label=='FAC'):
        return 'location'
    elif(label=='PERSON'):
        return 'person'
    elif(label=='PRODUCT'):
        return 'product'    
    elif(label=='ORG' or label=='NORP'):
        return 'group'
    elif(label=='WORK_OF_ART'):
        return ' creative-work'

In [22]:
def filter_entities(entities):
    ent = entities['entities']
    new_ents = [x for x in ent if x[2] in ['location','person','group','creative-work'
                                           ,'product']]
    entities['entities'] = new_ents
    return entities

In [23]:
def flair_to_spacy(dict_):
    res = []
    entities = dict_['entities']
    for ent in entities:
        text = ent['text']
        start_pos = ent['start_pos']
        end_pos = ent['end_pos']        
        label = ent['labels'][0].value
        if (label in accepted_ents):
            label = stanf_to_conLL(label)
            res.append((start_pos,end_pos,label))
    return res

In [40]:
import operator
def add_tuple(t1,t2):
    return tuple(map(operator.add, t1,t2))

In [26]:
import stanza
from flair.models import SequenceTagger
from spacy_stanza import StanzaLanguage
import spacy spacy.prefer_gpu()

snlp = stanza.Pipeline(lang="en",processors='tokenize,ner')
nlp_stanford = StanzaLanguage(snlp)    
tagger = SequenceTagger.load('ner-ontonotes')
accepted_ents = ['GPE', 'PERSON','ORG','FAC','WORK_OF_ART','NORP','PRODUCT']


2020-06-02 15:53:40 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| ner       | ontonotes |

2020-06-02 15:53:40 INFO: Use device: cpu
2020-06-02 15:53:40 INFO: Loading: tokenize
2020-06-02 15:53:40 INFO: Loading: ner
2020-06-02 15:53:40 INFO: Done loading processors!


2020-06-02 15:53:40,877 loading file /home/c3dric/.flair/models/en-ner-ontonotes-v0.4.pt


In [75]:
def evaluate_entitities(predicted_entities, correct_entities):
    correct_entities = set(correct_entities)
    predicted_entities = set(predicted_entities)    
    tp = len(correct_entities.intersection(predicted_entities))
    fp = len(predicted_entities - correct_entities)
    fn = len(correct_entities - predicted_entities)
    return fp,fn,tp

In [86]:
def evaluate(score):
    fp_global = score[0]
    fn_global = score[1]
    tp_global = score[2]
    
    # Avoid division by 0
    if(tp_global + fp_global ==0):
        precision =0
    else:
        precision = tp_global/(tp_global + fp_global) 
    
    # Avoid division by 0
    if(tp_global + fn_global ==0):
        recall = 0 
    else:
        recall= tp_global/ (tp_global + fn_global)
    
    # Avoid division by 0
    if(precision+recall == 0):
        f1_score = 0
    else:    
        f1_score = 2* ((precision*recall)/(precision+recall))
        
    result = defaultdict(float)
    result['recall'] = round(recall,2)
    result['f1'] = round(f1_score,2)
    result['precision'] = round(precision,2)
    return result

In [97]:
from flair.data import Sentence
from tqdm.notebook import tqdm
from collections import defaultdict
# Index 0 = fp, 1= fn and 2 = tp
eval_spacy = (0,0,0)
eval_stanza = (0,0,0)
eval_flair = (0,0,0)

for text,entities in tqdm(EVAL_DATA[0:5]):
    #Spacy Prediction
    spacy_stanford_correct_ents = [(text[x[0]:x[1]], x[2]) for x in entities['entities']]
    doc = nlp1(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    res_spacy = evaluate_entitities(predicted_entities,spacy_stanford_correct_ents)
    eval_spacy = add_tuple(eval_spacy,res_spacy)
    
    # Stanford Prediction
    entities = filter_entities(entities)
    spacy_stanford_correct_ents = [(text[x[0]:x[1]], x[2]) for x in entities['entities']]
    doc = nlp_stanford(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    predicted_entities = [x for x in predicted_entities if x[1] in accepted_ents]
    predicted_entities = [(x[0],stanf_to_conLL(x[1])) for x in predicted_entities ]
    res_stanza = evaluate_entitities(predicted_entities,spacy_stanford_correct_ents)
    eval_stanza = add_tuple(eval_stanza,res_stanza)
    
    # Flair Prediction
    sentence = Sentence(text)
    tagger.predict(sentence)
    dict_ = (sentence.to_dict(tag_type='ner'))
    predicted_entities = flair_to_spacy(dict_)
    ent_as_list = list(entities.values())[0]
    correct_entities = ent_as_list
    res_flair = evaluate_entitities(predicted_entities,correct_entities)
    eval_flair = add_tuple(eval_flair,res_flair)
    
    # Evaluation
    res_spacy = evaluate(eval_spacy)
    res_stanza = evaluate(eval_stanza)
    res_flair = evaluate(eval_flair)
    
    # Print results 
    pretty_print = "F1 scores -> Spacy: {}, Stanza: {}, Flair: {}".format(res_spacy['f1'],
                                                          res_stanza['f1'],res_flair['f1']) 

    
    
    print(pretty_print, end="\r" )

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

F1 scores -> Spacy: 0.25, Stanza: 0.57, Flair: 0.75
