## Named Entity Recognition
Data is taken from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

Some reference:
* https://www.aclweb.org/anthology/W03-0423.pdf, 
* https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2, 
* https://mattshomepage.com/articles/2016/May/23/nltk_nec/

In [17]:
import os
import sys

import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook
tqdm.pandas ()
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, precision_recall_fscore_support

In [36]:
path_data = os.path.abspath ('../../data/ukdw-2')
data = pd.read_csv (os.path.join (path_data, 'ner_sample.csv'), encoding='latin')

# fill the NaN value with the previously seen value
data["Sentence #"] = data["Sentence #"].fillna (method="ffill")
data.head (5)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 2,Families,NNS,O
1,Sentence: 2,of,IN,O
2,Sentence: 2,soldiers,NNS,O
3,Sentence: 2,killed,VBN,O
4,Sentence: 2,in,IN,O


In [4]:
print ("Unique Tag")
print ("\n".join (sorted (set (data["Tag"]))))

Unique Tag
B-art
B-eve
B-geo
B-gpe
B-nat
B-org
B-per
B-tim
I-art
I-eve
I-geo
I-gpe
I-nat
I-org
I-per
I-tim
O


In [None]:
"""
# if you want to analyze person only
data['Tag'] = data['Tag'].apply (lambda f: f if f in ('B-per', 'I-per') else 'O')
print ("Unique Tag")
print ("\n".join (sorted (set (data["Tag"]))))
"""

In [5]:
# get prev word
data['prev_word'] = data['Word'].shift (periods=1, fill_value='__start__')

# add __start__ for each sentence start
sentence_list = data['Sentence #'].tolist ()
prev_word = data['prev_word'].tolist ()
for idx in tqdm (range (1, len (sentence_list))):
    if sentence_list[idx-1] != sentence_list[idx]:
        prev_word[idx] = '__start__'

# reupdate prev_word
data['prev_word'] = prev_word

100%|██████████| 119503/119503 [00:00<00:00, 1486933.53it/s]


In [6]:
# extract feature
def extract(d):
    feature = {
        "word" : d["Word"]    
    }
    return feature
data["feature"] = data.apply (extract, axis=1)
data.head (5)

Unnamed: 0,Sentence #,Word,POS,Tag,prev_word,feature
0,Sentence: 2,Families,NNS,O,__start__,{'word': 'Families'}
1,Sentence: 2,of,IN,O,Families,{'word': 'of'}
2,Sentence: 2,soldiers,NNS,O,of,{'word': 'soldiers'}
3,Sentence: 2,killed,VBN,O,soldiers,{'word': 'killed'}
4,Sentence: 2,in,IN,O,killed,{'word': 'in'}


In [7]:
# create a maxent bayes classifier
# classifier = nltk.classify.MaxentClassifier.train(data[["feature", "Tag"]].values)

# create a naive bayes classifier
classifier = nltk.classify.NaiveBayesClassifier.train(data[["feature", "Tag"]].values)

In [8]:
# most important feature
classifier.show_most_informative_features (20)

Most Informative Features
                    word = 'President'     B-per : O      =    609.7 : 1.0
                    word = 'French'        B-gpe : O      =    543.5 : 1.0
                    word = 'Pakistan'      B-geo : O      =    516.2 : 1.0
                    word = 'Union'         I-org : O      =    390.3 : 1.0
                    word = 'the'               O : B-tim  =    332.7 : 1.0
                    word = 'Ministry'      I-org : O      =    279.9 : 1.0
                    word = 'Prime'         B-per : O      =    262.2 : 1.0
                    word = 'U.S.'          B-geo : B-per  =    247.2 : 1.0
                    word = 'York'          I-geo : O      =    205.0 : 1.0
                    word = 'General'       B-per : O      =    162.3 : 1.0
                    word = 'American'      B-gpe : O      =    144.7 : 1.0
                    word = '2004'          I-tim : O      =    142.3 : 1.0
                    word = 'King'          B-per : O      =    141.6 : 1.0

In [9]:
word = {"word" : "Obama"}
classifier.classify (word)

'I-per'

In [10]:
def extract_ner (text):
    for t in text.split ():
        feat = {"word" : t}
        print ("{}/{}".format (t, classifier.classify (feat)), end=' ')

extract_ner ("President Joko Widodo visit Netherlands for discussing global warming")

President/B-per Joko/O Widodo/O visit/O Netherlands/B-geo for/O discussing/O global/O warming/O 

In [12]:
data['predicted'] = data['feature'].apply (classifier.classify)

In [28]:
# various matrices
accuracy = accuracy_score (data['Tag'], data['predicted'])
labels = list (set (data['Tag']))
precision, recall, fscore, support = precision_recall_fscore_support (data['Tag'], data['predicted'], labels=labels)

In [34]:
prec = list (zip (labels, recall))
prec

[('I-tim', 0.024279210925644917),
 ('B-tim', 0.7874648453194054),
 ('O', 0.9985449450312567),
 ('B-org', 0.3798348544111256),
 ('B-art', 0.0),
 ('I-nat', 0.0),
 ('I-eve', 0.0),
 ('B-per', 0.8242857142857143),
 ('I-art', 0.0),
 ('I-per', 0.8811494611900672),
 ('B-nat', 0.0),
 ('I-gpe', 0.0),
 ('B-gpe', 0.9204604918890633),
 ('I-org', 0.3517915309446254),
 ('B-eve', 0.0),
 ('I-geo', 0.43258426966292135),
 ('B-geo', 0.893049499871762)]