In [1]:
import matplotlib.pyplot as plt

In [2]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

A simple sentence NER example:

[**ORG** U.N. ] official [**PER** Ekeus ] heads for [**LOC** Baghdad ] 

We will concentrate on four types of named entities:
 * persons (**PER**), 
 * locations (**LOC**) 
 * organizations (**ORG**)
 * Others (**O**)

In [3]:
def _generate_examples(filepath):
        with open(filepath, encoding="utf-8") as f:
            sent = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if sent:
                        yield sent
                        sent = []
                else:
                    splits = line.split(" ")
                    token = splits[0]
                    pos_tag = splits[1]
                    ner_tag = splits[3].rstrip()
                    if 'MISC' in ner_tag:
                        ner_tag = 'O'
                    
                    sent.append((token, pos_tag, ner_tag))

In [4]:
%%time 
# hint use the above defined function
train_sents = list(_generate_examples("train.txt"))
test_sents = list(_generate_examples("test.txt"))

CPU times: total: 391 ms
Wall time: 389 ms


In [5]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'postag': postag,
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
    return features

In [6]:
test_sents[2]

[('United', 'NNP', 'B-LOC'),
 ('Arab', 'NNP', 'I-LOC'),
 ('Emirates', 'NNPS', 'I-LOC'),
 ('1996-12-06', 'CD', 'O')]

In [7]:
word2features(test_sents[2],0)

{'bias': 1.0, 'word.lower()': 'united', 'postag': 'NNP', 'BOS': True}

In [8]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [9]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: total: 562 ms
Wall time: 560 ms


In [10]:
%%time 
#search for sklearn_crfsuite.CRF, use the lbfgs algorithm, c parameters should be 0.1 and max iterations 100, all possible transactions true
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
   
)
# fit the model
try:
    crf.fit(X_train, y_train) # fit the model on the training set 
except AttributeError:
    pass

CPU times: total: 19.2 s
Wall time: 19.6 s


In [11]:
# save a list of all labels in your model, hint crfs have a classes attribute
labels = list(crf.classes_)

In [12]:
labels

['B-ORG', 'O', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-LOC']

In [13]:
#remove the label 'O' from your list
labels.remove('O')

In [14]:
#perfrom a prediction on your test set and print f1-score accuraccy 
y_pred = crf.predict(X_test) 

metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7757476721426669

In [15]:
# group B and I results, use the sorted function on the list labels with a lambda function as the key
sorted_labels =  sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)


In [16]:
# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))

In [17]:
# what is the number of transition features in our model, crfs have an attribute called transition_features_
len(crf.transition_features_)

49

In [18]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

#list the top 20 unlikely transitions
print("\nTop unlikely transitions:")


Top likely transitions:
B-PER  -> I-PER   6.591492
B-ORG  -> I-ORG   6.306534
I-ORG  -> I-ORG   5.540077
B-LOC  -> I-LOC   4.839887
I-LOC  -> I-LOC   3.758774
I-PER  -> I-PER   3.394919
O      -> B-PER   1.960743
O      -> O       1.369676
B-ORG  -> O       0.950664
O      -> B-LOC   0.919982
B-LOC  -> O       0.612921
B-PER  -> O       0.557646
O      -> B-ORG   0.515605
I-PER  -> O       0.393510
I-ORG  -> O       0.328486
I-LOC  -> O       -0.305074
B-ORG  -> B-ORG   -0.984217
B-LOC  -> B-LOC   -0.990422
I-LOC  -> B-LOC   -1.291094
B-PER  -> B-LOC   -1.315197

Top unlikely transitions:


In [19]:
# what is the number of transition features in our model, crfs have an attribute called state_features_
len(crf.state_features_)

16044

In [20]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

#list the top 30 positive
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

#list the top 30 negative
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])
#the sigmoid function is applied in this case. It gives a positive probability to (label_from, label_to) and negative probabilty 
#to the (label_to,label_from)

Top positive:
8.307293 I-LOC    word.lower():oval
8.088441 B-LOC    word.lower():m3
7.751193 B-ORG    word.lower():footscray
7.001409 B-ORG    word.lower():osce
6.964246 B-PER    word.lower():lebed
6.609227 B-LOC    word.lower():amsterdam
6.556081 B-LOC    word.lower():bonn
6.543649 B-LOC    word.lower():beijing
6.516252 B-LOC    word.lower():mideast
6.514129 B-ORG    word.lower():adelaide
6.341000 B-LOC    word.lower():balkans
6.296972 B-LOC    word.lower():med
6.293931 B-LOC    word.lower():stansted
6.251496 O        word.lower():to
6.164668 O        word.lower():division
6.109046 B-LOC    word.lower():vatican
6.065353 B-LOC    word.lower():johannesburg
6.051745 B-PER    word.lower():stenning
6.033868 B-LOC    word.lower():england
6.013143 B-PER    word.lower():clinton
5.989218 B-PER    word.lower():chang
5.986145 B-LOC    word.lower():pakistan
5.979900 B-LOC    word.lower():mt
5.896624 B-PER    word.lower():fogarty
5.840211 B-LOC    word.lower():moscow
5.801627 B-ORG    word.lower()