In [None]:
# Named Entity Recognition using CRF model
In Natural Language Processing (NLP), entity recognition is one of the most common problems. An entity is a part of the text that is of interest. Named Entity Recognition (NER) is a method of extracting the relevant information from a large corpus and classifying those entities into predefined categories such as location, organization, name and so on. 
Information about labels: 
* geo = Geographical Entity
* org = Organization
* per = Person
* gpe = Geopolitical Entity
* tim = Time indicator
* art = Artifact
* eve = Event
* nat = Natural Phenomenon

        1. Total Words Count = 1354149 
        2. Target Data Column: Tag

#### Importing Libraries

In [1]:
! pip install sklearn_crfsuite



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [3]:
# Seed NumPy's global random number generator so that later steps drawing from it
# give the same result on a fresh "Restart & Run All".
np.random.seed(321)

In [4]:
# Read the NER dataset from CSV into a DataFrame.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv('./data/ner_dataset.csv', encoding = "ISO-8859-1")

In [5]:
# Display the first 10 rows to inspect the columns (Sentence #, Word, POS, Tag).
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [6]:
# Summary statistics (count / unique / top / freq) for each column.
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 12812,the,NN,O
freq,1,52573,145807,887908


#### Observations : 
* There are a total of 47959 sentences in the dataset.
* The number of unique words in the dataset is 35178.
* There are 17 labels (Tags) in total.

In [7]:
# Display the distinct values of the 'Tag' column, i.e. the label set the model has to predict.
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [8]:
# Count the missing values in each column.
df.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

There are lots of missing values in the 'Sentence #' attribute: only the first word of each sentence carries the sentence label. So we forward-fill ('ffill') the column, which propagates the last valid observation forward to the following rows.

In [9]:
# Forward-fill missing values so every row carries the 'Sentence #' label of the
# sentence it belongs to. `DataFrame.ffill()` is the direct equivalent of the
# deprecated `fillna(method='ffill')` spelling.
df = df.ffill()

In [10]:
# Re-inspect the first 10 rows: 'Sentence #' should now be populated on every row.
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [11]:
# Helper class to get sentences. Each sentence is a list of (word, POS, tag) tuples.
class sentence(object):
    """Group a one-token-per-row DataFrame into sentences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Sentence #', 'Word', 'POS' and 'Tag'.
        Rows are grouped by their 'Sentence #' value.
    """

    def __init__(self, df):
        # Number of the sentence that the next get_text() call will return.
        self.n_sent = 1
        self.df = df
        # Not read anywhere in this class; kept so existing callers still find it.
        self.empty = False

        def agg(s):
            # Turn one group of rows into a list of (word, POS, tag) tuples.
            return list(zip(s['Word'].tolist(),
                            s['POS'].tolist(),
                            s['Tag'].tolist()))

        # Series indexed by the 'Sentence #' label; each value is one sentence.
        # groupby sorts its keys, so string labels come out in lexicographic
        # order ('Sentence: 1', 'Sentence: 10', ...), not numeric order.
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = list(self.grouped)

    def get_text(self):
        """Return the next sentence, or None when there is no such sentence.

        Looks up the label 'Sentence: <n_sent>' and advances ``n_sent`` only
        when the lookup succeeds. Only the missing-label case (KeyError) is
        mapped to None; any other error propagates instead of being hidden.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [12]:
# Display one full sentence as plain text.
# `getter.sentences` holds lists of (word, POS, tag) tuples; s[0] picks the word of each tuple.
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [13]:
# Number of sentences reconstructed from the dataset.
len(sentences)

47959

In [14]:
# Fetch the next sentence from the getter (the first one on the first call)
# and show it as a list of (word, POS, tag) tuples.
sent = getter.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


Getting all the sentences in the dataset.

In [15]:
# Rebind `sentences`: from here on it is the list of per-sentence
# (word, POS, tag) tuple lists, no longer the joined text strings built earlier.
sentences = getter.sentences

In [16]:
# Sanity check: the number of tagged sentences.
len(sentences)

47959

#### Feature Preparation
These are the default features used by the NER in nltk. We can also customize them to suit our needs.

In [17]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first element is the word and whose
    second element is its POS tag. The returned dict describes the current
    token and, where they exist, its left and right neighbours. 'BOS' marks a
    token without a left neighbour, 'EOS' a token without a right neighbour.
    """
    token = sent[i][0]  # the current word
    pos = sent[i][1]    # the POS tag of the current word

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),      # the current word, lowercased
        'word[-3:]': token[-3:],            # 3-character suffix
        'word[-2:]': token[-2:],            # 2-character suffix
        'word.isupper()': token.isupper(),  # is the word all uppercase?
        'word.istitle()': token.istitle(),  # is the word title-cased?
        'word.isdigit()': token.isdigit(),  # is the word made of digits only?
        'postag': pos,                      # the POS tag of the word
        'postag[:2]': pos[:2],              # first 2 characters of the POS tag
    }

    # Left context: the previous token, or the beginning-of-sentence marker.
    if not i > 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context: the next token, or the end-of-sentence marker.
    if not i < len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return the feature dict of every token in ``sent``, in sentence order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third element) of every (token, postag, label) triple."""
    tags = []
    for _token, _postag, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token (first element) of every (token, postag, label) triple."""
    words = []
    for word, _postag, _label in sent:
        words.append(word)
    return words

In [18]:
# X: for every sentence, the list of per-token feature dicts.
# y: for every sentence, the matching list of tags (same order and length as X).
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [19]:
# Hold out 20% of the sentences for testing; the split is done per sentence, not per token.
# NOTE(review): no random_state is passed, so the split presumably relies on the global
# NumPy seed set earlier and differs if this cell is re-run on its own; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
# Train a CRF with the L-BFGS optimiser.
#   c1, c2                    - coefficients for L1 and L2 regularisation
#   max_iterations            - upper bound on optimiser iterations
#   all_possible_transitions  - False: no transition features are generated for
#                               label pairs that never occur in the training data
#   verbose                   - True: print the training log
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False, verbose = True)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 38367/38367 [00:11<00:00, 3311.48it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 137852
Seconds required: 3.021

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.49  loss=1350046.65 active=136849 feature_norm=1.00
Iter 2   time=2.50  loss=1058045.46 active=135456 feature_norm=4.40
Iter 3   time=1.26  loss=826851.07 active=129772 feature_norm=3.85
Iter 4   time=6.26  loss=453638.15 active=131222 feature_norm=3.25
Iter 5   time=1.27  loss=380487.14 active=133100 feature_norm=4.08
Iter 6   time=1.26  loss=295215.85 active=131703 feature_norm=5.87
Iter 7   time=1.27  loss=256492.12 active=124507 feature_norm=7.20
Iter 8   time=1.24  loss=228096.14 active=118624 feature_norm=8.21
Iter 9   time=1.30  loss=197310.30 active=110543 feature_nor



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100, verbose=True)

In [21]:
# Predict a tag sequence for every sentence of the held-out test set.
y_pred = crf.predict(X_test)

#### Evaluating the model performance.
There are many more O entities in the data set, but we’re more interested in the other entities. To account for this we’ll use an averaged F1 score computed over all labels except O. The sklearn-crfsuite.metrics package provides some useful metrics for sequence classification tasks, including this one.

In [22]:
# Labels to evaluate on: every class the model knows except the majority tag 'O'.
labels = list(crf.classes_)
labels.remove('O')
labels

['B-geo',
 'B-org',
 'B-gpe',
 'B-tim',
 'B-per',
 'I-geo',
 'I-org',
 'I-per',
 'I-tim',
 'B-eve',
 'I-eve',
 'I-gpe',
 'B-nat',
 'I-nat',
 'B-art',
 'I-art']

In [23]:
# Weighted-average F1 on the test set, restricted to `labels` (i.e. every tag except 'O').
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted', labels =labels)
print(f1_score)

0.850384474022429


In [24]:
# Per-label precision / recall / F1 on the test set.
# No `labels` argument is passed here, so the report also includes the 'O' tag.
report = flat_classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

       B-art       0.35      0.09      0.15        85
       B-eve       0.54      0.40      0.46        67
       B-geo       0.86      0.91      0.88      7387
       B-gpe       0.97      0.93      0.95      3235
       B-nat       0.65      0.38      0.48        34
       B-org       0.80      0.73      0.76      4033
       B-per       0.84      0.82      0.83      3401
       B-tim       0.94      0.88      0.91      4169
       I-art       0.29      0.10      0.15        70
       I-eve       0.32      0.26      0.29        46
       I-geo       0.82      0.82      0.82      1400
       I-gpe       0.90      0.51      0.65        51
       I-nat       1.00      0.44      0.62         9
       I-org       0.80      0.81      0.80      3216
       I-per       0.85      0.89      0.87      3451
       I-tim       0.85      0.77      0.81      1374
           O       0.99      0.99      0.99    176643

    accuracy              

## Let’s check what classifier learned

In [25]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``label_from -> label_to weight`` line per transition.

    ``trans_features`` is an iterable of ((label_from, label_to), weight) pairs.
    """
    for transition, weight in trans_features:
        label_from, label_to = transition
        line = "%-6s -> %-7s %0.6f" % (label_from, label_to, weight)
        print(line)

# The 20 transitions with the highest learned weights.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

# The 20 transitions with the lowest learned weights (tail of the descending ranking).
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-geo  -> I-geo   8.952345
B-per  -> I-per   8.161055
B-art  -> I-art   8.046919
I-art  -> I-art   7.872114
B-tim  -> I-tim   7.772303
I-tim  -> I-tim   7.625285
I-eve  -> I-eve   7.497199
B-eve  -> I-eve   7.488832
I-geo  -> I-geo   7.314430
B-nat  -> I-nat   7.254202
B-gpe  -> I-gpe   7.054031
I-org  -> I-org   6.963610
B-org  -> I-org   6.886536
I-per  -> I-per   6.842387
I-gpe  -> I-gpe   5.869712
I-nat  -> I-nat   5.553232
O      -> B-per   4.488595
O      -> O       4.349097
O      -> B-tim   2.757242
B-per  -> B-org   2.563365

Top unlikely transitions:
B-nat  -> O       -0.321871
I-art  -> B-tim   -0.340492
I-art  -> B-geo   -0.345440
I-org  -> O       -0.414760
B-org  -> B-geo   -0.415830
B-art  -> B-geo   -0.423549
B-gpe  -> B-eve   -0.442220
I-gpe  -> O       -0.487981
B-art  -> O       -0.589518
B-tim  -> B-art   -0.702691
I-nat  -> O       -0.721194
B-gpe  -> B-art   -0.792080
B-eve  -> O       -0.830459
B-tim  -> B-gpe   -0.944603
I-tim  -> B-gpe  

Theoretically transitions O -> I should be the most unlikely, but since they never occur, the `all_possible_transitions=False` flag makes sure they get ignored.

## Check the state features

In [26]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ((attr, label), weight) pairs.
    """
    for feature, weight in state_features:
        attr, label = feature
        line = "%0.6f %-8s %s" % (weight, label, attr)
        print(line)

# The 30 state features with the highest learned weights.
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

# The 30 state features with the lowest learned weights; the tail of the
# descending ranking is reversed so the lowest weight is printed first.
print("\nTop negative:")
print_state_features(list(reversed(Counter(crf.state_features_).most_common()[-30:])))

Top positive:
8.435485 O        word.lower():month
7.990317 O        word.lower():last
7.266817 B-gpe    word.lower():niger
7.227985 B-org    word.lower():philippine
6.555801 B-per    word.lower():prime
6.485253 B-per    word.lower():vice
6.411785 B-gpe    word.lower():nepal
6.385589 B-org    word.lower():hamas
6.336515 B-gpe    word.lower():afghan
6.316728 B-geo    word.lower():mid-march
6.250280 O        word.lower():chairman
6.174906 B-tim    word.lower():january
6.152378 B-org    word.lower():mid-march
6.067365 B-org    word.lower():al-qaida
6.052084 O        word.lower():year
5.958665 O        word.lower():week
5.950401 B-tim    word.lower():2000
5.901075 B-tim    word.lower():february
5.858865 B-tim    +1:word.lower():week
5.853850 B-gpe    word.lower():korean
5.768011 B-per    word.lower():obama
5.765340 B-tim    word.lower():weekend
5.654652 B-geo    -1:word.lower():hamas
5.568650 B-tim    word.lower():multi-candidate
5.554584 B-tim    word.lower():one-year
5.425137 B-org    -1

In [None]:
# Exercise: Improve performance by adding/removing features

Come up with your own features to improve performance. You can add into the existing features, or remove features.

Some ideas:

- Use the prefix of the current/previous/next word.

- Use the shape of a word and other linguistic features (https://spacy.io/usage/linguistic-features)
(Tip: Store the shape of all words into a dictionary so that you do not have to invoke spaCy's method every time you encounter the same word)

- Look into nltk implementation of NER for more https://github.com/nltk/nltk/blob/42262c9a7cdcb6f44ac08aebd575b5d7bf85b6ea/nltk/chunk/named_entity.py