In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, cross_val_score
import sklearn
sklearn.__version__

'0.23.2'

In [4]:
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.ensemble import RandomForestClassifier


In [5]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")

In [6]:
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [7]:
df.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [8]:
# Only the first token of each sentence carries a "Sentence #" value in the CSV
# (see the df.head output above), so forward-fill to label every row with its
# sentence. Note this forward-fills missing values in every column, not just
# "Sentence #".
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in pandas >= 2.1.
df = df.ffill()

In [9]:
def agg_func(s):
    """Turn the rows of one sentence into a list of (word, POS, tag) tuples.

    ``s`` is indexed by the column names "Word", "POS" and "Tag"; the three
    columns are read in that order and zipped row by row.
    """
    columns = [s[name].values.tolist() for name in ("Word", "POS", "Tag")]
    return list(zip(*columns))
# One entry per "Sentence #" group, each a list of (word, POS, tag) tuples.
# NOTE: groupby sorts its keys by default and the keys here are strings
# ("Sentence: 1", ...), so the resulting order is lexicographic rather than
# numeric; any positional slice of `sentences` follows that order.
grouped = df.groupby("Sentence #").apply(agg_func)
sentences = list(grouped)

In [24]:
print(len(sentences))
sentences = sentences[:10000]
sentences[0]

47959


[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [25]:
# Let's find the number of unique words in the dataset
words = list(set(df["Word"].values))
n_words = len(words)
print(n_words)

35178


### Features for modelling

In [26]:
#Simple feature map to feed arrays into the classifier. 
def feature_map(word):
    """Encode the surface shape of ``word`` as a 6-element numeric array.

    Positions, in order: is title-case, is lower-case, is upper-case,
    length, is all digits, is all alphabetic. The boolean flags share one
    array with the integer length, so NumPy stores them as 0/1 integers.
    """
    shape_flags = (
        word.istitle(),
        word.islower(),
        word.isupper(),
        len(word),
        word.isdigit(),
        word.isalpha(),
    )
    return np.array(shape_flags)

In [27]:
df.loc[df['Tag'] == 'B-geo', 'Word'].head()

6        London
12         Iraq
65         Hyde
94      Britain
106    Brighton
Name: Word, dtype: object

In [28]:
# Build the token-level inputs for the baseline classifier: one feature
# vector per word and the matching tag for each row. No train/test split
# happens in this cell.
words = list(map(feature_map, df["Word"].values.tolist()))
tags = df["Tag"].values.tolist()

In [41]:
tags[0]

'O'

In [29]:
# Token-level baseline: out-of-fold predictions from 5-fold cross-validation
# of a random forest on the hand-crafted word-shape features.
# random_state is pinned so the baseline report below is reproducible.
pred = cross_val_predict(
    RandomForestClassifier(n_estimators=20, random_state=42),
    X=words,
    y=tags,
    cv=5,
)

In [30]:
# Several rare tags are never predicted by the baseline, which makes their
# precision undefined. zero_division=0 reports those as 0.00 (the same
# numbers as the default) without raising UndefinedMetricWarning.
report = classification_report(y_true=tags, y_pred=pred, zero_division=0)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.79      0.40     37644
       B-gpe       0.25      0.06      0.09     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.96      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

## NER using CRF
The feature-generation code is taken from the official sklearn-crfsuite tutorial: https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

In [31]:
from itertools import chain
import nltk
import scipy.stats

import sklearn_crfsuite
from sklearn_crfsuite import scorers,CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

In [32]:
# Feature set
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at index ``i`` of ``sent``.

    Each item of ``sent`` is read as ``item[0]`` (the word) and ``item[1]``
    (its POS tag). The result holds features of the token itself followed by
    features of its left and right neighbours. When ``i`` is not greater
    than 0 a ``'BOS'`` flag is emitted instead of the left-neighbour
    features; when ``i`` is not less than ``len(sent) - 1`` an ``'EOS'``
    flag is emitted instead of the right-neighbour features.
    """
    def neighbour_features(prefix, neighbour):
        # Features of an adjacent token, keyed by its relative offset label.
        text, tag = neighbour[0], neighbour[1]
        return {
            f'{prefix}:word.lower()': text.lower(),
            f'{prefix}:word.istitle()': text.istitle(),
            f'{prefix}:word.isupper()': text.isupper(),
            f'{prefix}:postag': tag,
            f'{prefix}:postag[:2]': tag[:2],
        }

    token, pos = sent[i][0], sent[i][1]

    # Features of the current token: identity, suffixes, casing and POS.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        feats.update(neighbour_features('-1', sent[i - 1]))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        feats.update(neighbour_features('+1', sent[i + 1]))

    return feats

In [33]:
def sent2features(sent):
    """Return one ``word2features`` dict per position in ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

In [34]:
# Build the CRF inputs: for each sentence, its sequence of feature dicts (X)
# and its sequence of labels (y). No train/test split happens in this cell.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [35]:
#Creating the CRF model
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [36]:
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=3)



In [38]:
# Let's evaluate the model
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)



              precision    recall  f1-score   support

       B-art       0.12      0.01      0.02        76
       B-eve       0.59      0.31      0.41        74
       B-geo       0.82      0.88      0.85      7715
       B-gpe       0.95      0.93      0.94      3257
       B-nat       0.18      0.05      0.08        39
       B-org       0.75      0.68      0.71      4329
       B-per       0.81      0.77      0.79      3469
       B-tim       0.91      0.85      0.88      4244
       I-art       0.22      0.03      0.06        60
       I-eve       0.45      0.14      0.22        63
       I-geo       0.79      0.74      0.77      1502
       I-gpe       0.79      0.38      0.51        29
       I-nat       0.25      0.08      0.12        13
       I-org       0.76      0.75      0.76      3524
       I-per       0.83      0.88      0.85      3600
       I-tim       0.82      0.71      0.76      1380
           O       0.99      0.99      0.99    186878

    accuracy              

In [39]:
X[0]

[{'bias': 1.0,
  'word.lower()': 'thousands',
  'word[-3:]': 'nds',
  'word[-2:]': 'ds',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'NNS',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word.lower()': 'of',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'IN',
  '+1:postag[:2]': 'IN'},
 {'bias': 1.0,
  'word.lower()': 'of',
  'word[-3:]': 'of',
  'word[-2:]': 'of',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'IN',
  'postag[:2]': 'IN',
  '-1:word.lower()': 'thousands',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'NNS',
  '-1:postag[:2]': 'NN',
  '+1:word.lower()': 'demonstrators',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NNS',
  '+1:postag[:2]': 'NN'},
 {'bias': 1.0,
  'word.lower()': 'demonstrators',
  'word[-3:]': 'ors',
  'word[-2:]': 'rs',
  'word.isupper()': False,
  'word.istitle()': False,
  'wor