In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:100000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
df.isnull().sum()

Sentence #    95456
Word              0
POS               0
Tag               0
dtype: int64

In [4]:
df = df.fillna(method='ffill')

We have 4,544 sentences that contain 10,922 unique words and tagged by 17 tags.

In [5]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(4544, 10922, 17)

In [6]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [7]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,75
1,B-eve,53
2,B-geo,3303
3,B-gpe,1740
4,B-nat,30
5,B-org,1876
6,B-per,1668
7,B-tim,1823
8,I-art,43
9,I-eve,47


In [8]:
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Thousands,NNS
1,Sentence: 1,of,IN
2,Sentence: 1,demonstrators,NNS
3,Sentence: 1,have,VBP
4,Sentence: 1,marched,VBN


In [9]:
X.columns

Index(['Sentence #', 'Word', 'POS'], dtype='object')

In [10]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(100000, 15507)

In [11]:
y = df.Tag.values

In [12]:
classes = np.unique(y)

In [13]:
classes = classes.tolist()
classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [14]:
X.shape, y.shape

((100000, 15507), (100000,))

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [16]:
X_train.shape, y_train.shape

((67000, 15507), (67000,))

### Perceptron

In [17]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [18]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1-- Epoch 1



Norm: 11.53, NNZs: 113, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.20 seconds.
Norm: 13.42, NNZs: 162, Bias: -4.000000, T: 67000, Avg. loss: 0.001642Norm: 49.90, NNZs: 1337, Bias: -4.000000, T: 67000, Avg. loss: 0.015328Norm: 68.07, NNZs: 2642, Bias: -4.000000, T: 67000, Avg. loss: 0.041776
Total training time: 2.30 seconds.

Total training time: 2.34 seconds.

Total training time: 2.36 seconds.
-- Epoch 1-- Epoch 1
-- Epoch 1

-- Epoch 1
Norm: 8.43, NNZs: 57, Bias: -3.000000, T: 67000, Avg. loss: 0.000567
Total training time: 1.92 seconds.
-- Epoch 1
Norm: 48.83, NNZs: 1578, Bias: -4.000000, T: 67000, Avg. loss: 0.022328
Total training time: 1.98 seconds.
Norm: 56.87, NNZs: 2044, Bias: -4.000000, T: 67000, Avg. loss: 0.034970
Total training time: 1.98 seconds.
-- Epoch 1
-- Epoch 1
Norm: 44.41, NNZs: 1127, Bias: -4.000000, T: 67000, Avg. loss: 0.017164
Total training time: 2.03 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   20.3s


Norm: 10.44, NNZs: 106, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.00 seconds.
-- Epoch 1
Norm: 11.45, NNZs: 96, Bias: -3.000000, T: 67000, Avg. loss: 0.000776
Total training time: 2.03 seconds.
-- Epoch 1
Norm: 35.13, NNZs: 803, Bias: -4.000000, T: 67000, Avg. loss: 0.011149
Total training time: 2.08 seconds.
-- Epoch 1
Norm: 11.00, NNZs: 102, Bias: -3.000000, T: 67000, Avg. loss: 0.001209
Total training time: 2.06 seconds.


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:   22.5s remaining:    9.3s


-- Epoch 1
Norm: 6.24, NNZs: 31, Bias: -3.000000, T: 67000, Avg. loss: 0.000209
Total training time: 2.11 seconds.
-- Epoch 1
Norm: 53.57, NNZs: 1703, Bias: -4.000000, T: 67000, Avg. loss: 0.026224
Total training time: 2.17 seconds.
Norm: 60.35, NNZs: 2091, Bias: -6.000000, T: 67000, Avg. loss: 0.026940
Total training time: 2.14 seconds.
Norm: 30.53, NNZs: 672, Bias: -4.000000, T: 67000, Avg. loss: 0.012030
Total training time: 2.09 seconds.


[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:   24.6s remaining:    5.2s


Norm: 73.89, NNZs: 2851, Bias: 4.000000, T: 67000, Avg. loss: 0.048866
Total training time: 1.60 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:   26.1s finished


Perceptron(max_iter=5, n_jobs=-1, verbose=10)

In [19]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       0.11      0.05      0.07        19
       B-geo       0.56      0.81      0.66      1085
       B-gpe       0.92      0.78      0.84       556
       B-nat       1.00      0.17      0.29        12
       B-org       0.39      0.52      0.44       589
       B-per       0.70      0.46      0.56       564
       B-tim       0.91      0.63      0.75       611
       I-art       0.00      0.00      0.00        12
       I-eve       0.67      0.22      0.33        18
       I-geo       0.75      0.42      0.54       230
       I-gpe       1.00      0.07      0.13        14
       I-nat       0.50      0.50      0.50         2
       I-org       0.48      0.50      0.49       445
       I-per       0.83      0.13      0.22       591
       I-tim       0.36      0.18      0.24       194

   micro avg       0.61      0.54      0.58      4966
   macro avg       0.57   

### Linear classifiers with SGD training

In [20]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier()

In [21]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       0.00      0.00      0.00        19
       B-geo       0.41      0.93      0.57      1085
       B-gpe       0.96      0.62      0.75       556
       B-nat       0.00      0.00      0.00        12
       B-org       0.51      0.41      0.46       589
       B-per       0.92      0.36      0.51       564
       B-tim       0.93      0.62      0.74       611
       I-art       0.00      0.00      0.00        12
       I-eve       0.00      0.00      0.00        18
       I-geo       0.78      0.34      0.47       230
       I-gpe       1.00      0.07      0.13        14
       I-nat       0.00      0.00      0.00         2
       I-org       0.82      0.29      0.42       445
       I-per       0.72      0.36      0.48       591
       I-tim       0.33      0.03      0.05       194

   micro avg       0.58      0.52      0.55      4966
   macro avg       0.46   

### Naive Bayes classifier for multinomial models

In [22]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01)

In [23]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.06      0.17      0.09        24
       B-eve       0.33      0.37      0.35        19
       B-geo       0.70      0.63      0.66      1085
       B-gpe       0.70      0.83      0.76       556
       B-nat       0.35      0.50      0.41        12
       B-org       0.41      0.44      0.43       589
       B-per       0.44      0.47      0.46       564
       B-tim       0.56      0.61      0.59       611
       I-art       0.07      0.08      0.08        12
       I-eve       0.46      0.33      0.39        18
       I-geo       0.40      0.52      0.46       230
       I-gpe       0.13      0.14      0.14        14
       I-nat       0.00      0.00      0.00         2
       I-org       0.50      0.51      0.51       445
       I-per       0.53      0.50      0.51       591
       I-tim       0.17      0.27      0.21       194

   micro avg       0.52      0.56      0.54      4966
   macro avg       0.36   

### Passive Aggressive Classifier

In [24]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

PassiveAggressiveClassifier()

In [25]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

       B-art       0.10      0.12      0.11        24
       B-eve       0.29      0.11      0.15        19
       B-geo       0.60      0.85      0.70      1085
       B-gpe       0.95      0.76      0.84       556
       B-nat       0.57      0.33      0.42        12
       B-org       0.52      0.48      0.50       589
       B-per       0.58      0.55      0.57       564
       B-tim       0.91      0.63      0.74       611
       I-art       1.00      0.08      0.15        12
       I-eve       0.44      0.22      0.30        18
       I-geo       0.66      0.50      0.57       230
       I-gpe       0.75      0.43      0.55        14
       I-nat       0.07      0.50      0.12         2
       I-org       0.86      0.27      0.40       445
       I-per       0.59      0.63      0.61       591
       I-tim       0.40      0.23      0.29       194

   micro avg       0.65      0.60      0.62      4966
   macro avg       0.58   

### Conditional Random Fields (CRFs)

In [26]:
#!pip install sklearn_crfsuite

In [27]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

#### Get sentences

In [28]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [29]:
getter = SentenceGetter(df)

In [30]:
sent = getter.get_next()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [31]:
sentences = getter.sentences

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklear-crfsuite format - each sentence should be converted to a list of dicts.

In [32]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

The above code were taken from sklearn-crfsuite official site.

Split train and test sets.

In [33]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [35]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [36]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.7842087494747214

In [37]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       1.00      0.03      0.07        29
       B-eve       0.86      0.25      0.39        24
       B-geo       0.75      0.88      0.81      1043
       B-gpe       0.89      0.78      0.83       588
       B-nat       0.67      0.20      0.31        10
       B-org       0.75      0.64      0.69       649
       B-per       0.81      0.81      0.81       546
       B-tim       0.90      0.85      0.87       589
       I-art       0.00      0.00      0.00         7
       I-eve       0.57      0.22      0.32        18
       I-geo       0.71      0.71      0.71       204
       I-gpe       0.47      0.53      0.50        17
       I-nat       1.00      0.50      0.67         2
       I-org       0.78      0.73      0.76       545
       I-per       0.80      0.90      0.85       574
       I-tim       0.79      0.68      0.73       185

   micro avg       0.80      0.78      0.79      5030
   macro avg       0.73   

In [38]:
#!pip install -U scikit-learn==0.23.0 --user

In [39]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 17.2min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002B274580040>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002B274580340>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim']),
                   verbose=1)

In [40]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.044247062642500376, 'c2': 0.12428118387803151}
best CV score: 0.7725527128854058
model size: 0.87M


In [41]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

              precision    recall  f1-score   support

       B-art       1.00      0.03      0.07        29
       B-eve       1.00      0.25      0.40        24
       B-geo       0.75      0.87      0.81      1043
       B-gpe       0.89      0.78      0.83       588
       B-nat       0.50      0.10      0.17        10
       B-org       0.74      0.63      0.68       649
       B-per       0.81      0.81      0.81       546
       B-tim       0.89      0.84      0.87       589
       I-art       0.00      0.00      0.00         7
       I-eve       1.00      0.22      0.36        18
       I-geo       0.68      0.70      0.69       204
       I-gpe       0.47      0.53      0.50        17
       I-nat       0.00      0.00      0.00         2
       I-org       0.79      0.72      0.75       545
       I-per       0.80      0.90      0.85       574
       I-tim       0.79      0.67      0.73       185

   micro avg       0.79      0.78      0.79      5030
   macro avg       0.69   

In [42]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-art  -> I-art   5.143950
I-tim  -> I-tim   5.031263
B-geo  -> I-geo   4.949383
B-eve  -> I-eve   4.872442
B-tim  -> I-tim   4.862833
B-org  -> I-org   4.777482
I-art  -> I-art   4.773278
B-per  -> I-per   4.385622
B-gpe  -> I-gpe   4.378191
I-eve  -> I-eve   4.341393
I-org  -> I-org   4.325777
I-geo  -> I-geo   4.305822
I-gpe  -> I-gpe   4.018251
I-per  -> I-per   3.816699
O      -> O       3.707508
B-nat  -> I-nat   3.181767
O      -> B-per   2.415582
B-org  -> B-art   2.249139
I-nat  -> I-nat   2.085878
O      -> B-tim   1.474945

Top unlikely transitions:
O      -> I-gpe   -1.791538
B-org  -> I-geo   -1.848020
B-gpe  -> I-org   -1.852100
B-geo  -> I-gpe   -1.883527
B-geo  -> I-org   -2.162487
B-org  -> B-org   -2.183622
O      -> I-art   -2.270019
I-org  -> B-org   -2.286064
B-gpe  -> I-geo   -2.290364
B-geo  -> I-per   -2.402567
B-tim  -> B-tim   -2.443054
I-org  -> I-per   -2.443216
B-org  -> I-per   -2.574446
I-per  -> B-per   -2.609850
O      -> I-per  

It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely.

In [43]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.058571 O        BOS
4.710062 B-tim    word[-3:]:day
4.355702 O        bias
3.737799 O        word.lower():jewish
3.654285 B-tim    word[-2:]:ay
3.610721 B-per    BOS
3.529736 O        word.lower():kurdish
3.482241 I-tim    word[-3:]:day
3.467869 O        word[-2:]:N1
3.428275 B-org    word.lower():al-qaida
3.404446 B-per    word.lower():president
3.298697 B-tim    word.lower():thanksgiving
3.227787 B-gpe    word.istitle()
3.129723 B-tim    +1:word.lower():year
3.111664 B-tim    word.lower():afternoon
3.070388 B-tim    word[-2:]:0s
3.032962 B-tim    word[-3:]:ber
2.953384 B-org    word.lower():hamas
2.929158 B-per    word.lower():obama
2.919411 B-gpe    word.lower():nepal
2.912580 I-geo    +1:word.lower():town
2.903796 B-org    word.lower():parliament
2.900755 B-gpe    word[-3:]:pal
2.850746 B-org    word[-3:]:ban
2.842808 B-org    -1:word.lower():telephoned
2.827935 B-per    word.lower():gotovina
2.816246 B-geo    word.lower():mid-september
2.813247 B-org    -1:word.low

Observations: 

1). __```5.183603 B-tim word[-3]:day```__
The model learns that if a nearby word was “day” then the token is likely a part of a Time indicator.

2). __```3.370614 B-per word.lower():president```__
The model learns that token "president" is likely to be at the beginning of a person name.

3). __```-3.521244 O postag:NNP```__
The model learns that proper nouns are often entities.

4). __```-3.087828 O word.isdigit()```__
Digits are likely entities.

5). __```-3.233526 O word.istitle()```__
TitleCased words are likely entities.

### ELI5

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models.

In [46]:
#!pip install eli5

In [47]:
import eli5

eli5.show_weights(crf, top=10)

Using TensorFlow backend.


From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.708,0.635,-2.27,1.318,-1.783,1.222,-4.46,0.538,-1.792,0.372,-1.004,0.783,-4.466,2.416,-3.107,1.475,-3.884
B-art,-0.6,0.0,5.144,0.0,0.0,-0.127,-0.266,-0.268,-0.16,0.0,0.0,0.439,-0.38,-0.517,-0.759,-0.605,-0.138
I-art,-0.832,-0.121,4.773,0.0,0.0,0.309,-0.36,-0.185,0.0,0.0,0.0,-0.356,-0.34,-0.6,-0.522,0.037,-0.256
B-eve,-0.708,0.0,0.0,0.0,4.872,-0.091,-0.149,-0.241,-0.025,0.0,0.0,-0.454,-0.297,-0.654,-0.352,-0.634,-0.408
I-eve,-0.319,0.0,0.0,-0.541,4.341,-0.211,-0.189,0.0,0.0,0.0,0.0,-0.177,-0.242,-0.229,-0.381,-0.36,-0.016
B-geo,0.297,1.236,-0.828,-0.074,-0.727,-1.403,4.949,0.436,-1.884,0.0,-0.419,-1.138,-2.162,-0.965,-2.403,1.467,-1.21
I-geo,0.144,0.0,-0.211,0.0,-0.11,-0.807,4.306,-0.521,-0.577,0.0,0.0,-0.865,-0.955,-0.874,-0.693,1.114,-0.84
B-gpe,0.615,-0.204,-0.798,-0.152,-0.605,-0.108,-2.29,-3.367,4.378,0.0,-0.153,0.703,-1.852,0.757,-1.455,-0.522,-0.957
I-gpe,-0.299,0.0,0.0,0.0,0.0,0.274,-0.667,-0.412,4.018,0.0,0.0,-0.375,-0.519,-0.452,-0.597,-0.58,-0.1
B-nat,-0.46,0.0,0.0,0.0,0.0,-0.077,-0.093,-0.068,0.0,0.0,3.182,-0.197,-0.263,-0.32,-0.359,-0.162,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+5.059,BOS,,,,,,,,,,,,,,,
+4.356,bias,,,,,,,,,,,,,,,
+3.738,word.lower():jewish,,,,,,,,,,,,,,,
+3.530,word.lower():kurdish,,,,,,,,,,,,,,,
+3.468,word[-2:]:N1,,,,,,,,,,,,,,,
… 2369 more positive …,… 2369 more positive …,,,,,,,,,,,,,,,
… 1239 more negative …,… 1239 more negative …,,,,,,,,,,,,,,,
-2.943,+1:word.lower():last,,,,,,,,,,,,,,,
-3.247,word.istitle(),,,,,,,,,,,,,,,
-3.272,word.isdigit(),,,,,,,,,,,,,,,

Weight?,Feature
+5.059,BOS
+4.356,bias
+3.738,word.lower():jewish
+3.530,word.lower():kurdish
+3.468,word[-2:]:N1
… 2369 more positive …,… 2369 more positive …
… 1239 more negative …,… 1239 more negative …
-2.943,+1:word.lower():last
-3.247,word.istitle()
-3.272,word.isdigit()

Weight?,Feature
+2.158,word.lower():twitter
+2.113,word.lower():english
+1.882,-1:word.lower():tamilnet
+1.552,-1:word.lower():newspaper
+1.530,word.lower():jeep
+1.521,word.lower():dodge
+1.473,word[-3:]:eep
+1.459,word[-2:]:ep
+1.456,word[-3:]:/20
+1.456,word.lower():20/20

Weight?,Feature
+1.215,+1:word.lower():airport
+1.151,word.lower():constitution
+1.069,-1:word.lower():international
+1.048,word[-3:]:Us
+1.048,word[-2:]:Us
+1.047,-1:word.lower():magazine
+1.012,word.lower():us
+0.964,word.lower():simple
+0.962,+1:word.lower():life
+0.953,+1:word.lower():newspaper

Weight?,Feature
+1.969,-1:word.lower():war
+1.693,-1:word.lower():first
+1.458,word.lower():christmas
+1.456,-1:word.lower():celebrated
+1.380,+1:word.lower():get
+1.319,word[-3:]:mas
+1.297,word.lower():ii
+1.297,word[-3:]:II
+1.292,word[-2:]:II
+1.271,word[-3:]:mes

Weight?,Feature
+1.231,postag:NNPS
+1.081,word.lower():cup
+1.081,word[-3:]:Cup
+1.071,-1:word.lower():korean
+1.059,word[-2:]:rs
+1.034,word.lower():peace
+1.033,word[-2:]:up
+0.979,word[-3:]:pen
+0.969,word.lower():open
+0.937,+1:word.lower():in

Weight?,Feature
+2.816,word.lower():mid-september
+2.797,-1:word.lower():serb
+2.739,word.lower():washington
+2.696,word.lower():aswat
+2.452,word.lower():zahedan
+2.415,word[-3:]:the
+2.405,word.lower():beijing
+2.291,word[-2:]:ai
+2.253,+1:word.lower():province
… 1550 more positive …,… 1550 more positive …

Weight?,Feature
+2.913,+1:word.lower():town
+2.651,+1:word.lower():achieved
+2.419,+1:word.lower():block
+2.013,word.lower():settlement
+1.950,-1:word.lower():tulkarem
+1.883,+1:word.lower():produced
+1.877,-1:word.lower():western
+1.767,+1:word.lower():base
+1.679,+1:word.lower():regional
+1.639,+1:word.lower():about

Weight?,Feature
+3.228,word.istitle()
+2.919,word.lower():nepal
+2.901,word[-3:]:pal
+2.734,+1:word.lower():mayor
+2.720,postag:NNS
+2.663,word[-3:]:ans
+2.596,+1:word.lower():representative
+2.419,word.lower():croats
+2.406,word.lower():palestinian
+2.398,word[-3:]:ese

Weight?,Feature
+2.531,+1:word.lower():began
+1.999,-1:word.lower():soviet
+1.815,+1:word.lower():that
+1.753,+1:word.lower():health
+1.745,+1:word.lower():returned
+1.631,-1:word.lower():democratic
+1.580,word[-3:]:can
+1.567,-1:word.lower():bosnian
+1.497,+1:word.istitle()
+1.485,word.lower():city

Weight?,Feature
+1.601,word.isupper()
+1.494,-1:word.lower():from
+1.413,word[-3:]:5N1
+1.413,word.lower():h5n1
+1.367,+1:word.lower():form
+1.312,+1:word.lower():katrina
+1.306,word.lower():hurricane
+1.301,word[-3:]:ane
+1.272,word.lower():marburg
+1.268,+1:word.lower():toll

Weight?,Feature
+1.264,-1:word.lower():hurricane
+1.252,word.lower():katrina
+1.112,+1:word.lower():outbreak
+1.031,word[-3:]:ina
+0.995,word[-2:]:na
+0.820,-1:word.lower():jing
+0.819,-1:postag:NNP
+0.747,word.lower():acute
+0.746,+1:word.lower():respiratory
+0.746,-1:word.lower():severe

Weight?,Feature
+3.428,word.lower():al-qaida
+2.953,word.lower():hamas
+2.904,word.lower():parliament
+2.851,word[-3:]:ban
+2.843,-1:word.lower():telephoned
+2.813,-1:word.lower():brunei
+2.673,+1:word.lower():fought
+2.549,+1:word.lower():influence
+2.477,+1:word.lower():assistant
+2.466,-1:word.lower():extremist

Weight?,Feature
+2.265,+1:word.lower():mr.
+1.865,-1:word.lower():mediterranean
+1.795,-1:word.lower():munich
+1.756,word.lower():ministry
+1.734,word[-3:]:ate
+1.721,word[-3:]:ons
+1.658,-1:word.lower():for
+1.642,+1:word.lower():will
+1.576,word.lower():nations
… 1350 more positive …,… 1350 more positive …

Weight?,Feature
+3.611,BOS
+3.404,word.lower():president
+2.929,word.lower():obama
+2.828,word.lower():gotovina
+2.784,word.lower():prime
+2.660,word.lower():jupiter
+2.472,+1:word.lower():vladimir
+2.419,word.lower():bolton
+2.415,+1:word.lower():administration
+2.371,word.lower():western

Weight?,Feature
+1.674,+1:word.lower():david
+1.615,+1:word.lower():saad
+1.574,word.lower():president
+1.386,+1:word.lower():reports
+1.375,-1:postag:NN
+1.363,-1:word.lower():masjid
+1.353,word[-3:]:aad
+1.353,word.lower():saad
… 1296 more positive …,… 1296 more positive …
… 236 more negative …,… 236 more negative …

Weight?,Feature
+4.710,word[-3:]:day
+3.654,word[-2:]:ay
+3.299,word.lower():thanksgiving
+3.130,+1:word.lower():year
+3.112,word.lower():afternoon
+3.070,word[-2:]:0s
+3.033,word[-3:]:ber
+2.748,word.lower():august
+2.728,word.lower():january
+2.679,+1:word.lower():czech

Weight?,Feature
+3.482,word[-3:]:day
+2.367,word[-2:]:ay
+2.172,-1:word.lower():ceremonies
+2.100,+1:word.lower():moscow
+2.085,word.lower():decades
+1.967,-1:word.lower():march
+1.961,word[-3:]:des
+1.847,word.isdigit()
+1.727,-1:word.lower():june
+1.720,word[-2:]:ry


It does make sense that I-entity must follow B-entity, such as I-geo follows B-geo, I-org follows B-org, I-per follows B-per, and so on. 

We can also see that it is not common in this dataset to have a person right after an organization name (B-org -> I-per has a large negative weight).

If we regularize CRF more, we can expect that only features which are generic will remain, and memoized tokens will go. Let’s check what effect does regularization have on CRF weights:

In [48]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,1.782,0.0,0.0,0.0,0.0,1.456,0.0,0.303,0.0,0.0,0.0,0.791,0.0,0.062,0.0,1.709,0.0
B-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.23,0.0,0.0,0.0,0.0,0.0,2.704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-geo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-gpe,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-nat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
+3.691,bias,,,,,,,,,,,,,,,
+1.774,BOS,,,,,,,,,,,,,,,
+0.984,-1:postag[:2]:NN,,,,,,,,,,,,,,,
+0.435,postag[:2]:VB,,,,,,,,,,,,,,,
+0.218,EOS,,,,,,,,,,,,,,,
… 10 more positive …,… 10 more positive …,,,,,,,,,,,,,,,
… 1 more negative …,… 1 more negative …,,,,,,,,,,,,,,,
-0.467,postag:CD,,,,,,,,,,,,,,,
-0.467,postag[:2]:CD,,,,,,,,,,,,,,,
-1.472,word.isdigit(),,,,,,,,,,,,,,,

Weight?,Feature
+3.691,bias
+1.774,BOS
+0.984,-1:postag[:2]:NN
+0.435,postag[:2]:VB
+0.218,EOS
… 10 more positive …,… 10 more positive …
… 1 more negative …,… 1 more negative …
-0.467,postag:CD
-0.467,postag[:2]:CD
-1.472,word.isdigit()

Weight?,Feature
1.052,postag:NNP
0.534,word.istitle()
0.218,-1:postag:IN
0.218,-1:postag[:2]:IN
0.125,-1:word.lower():in
-0.289,-1:postag[:2]:NN

Weight?,Feature
0.267,-1:postag:NNP

Weight?,Feature
1.533,postag:JJ
1.506,postag[:2]:JJ
1.139,word.istitle()
0.549,word[-2:]:an
-0.033,postag:NNP

Weight?,Feature
0.806,postag:NNP
0.565,postag[:2]:NN
0.23,-1:postag[:2]:DT
0.23,-1:postag:DT
0.004,word.isupper()

Weight?,Feature
0.496,-1:postag:NNP
0.377,-1:word.istitle()
0.225,-1:postag[:2]:NN

Weight?,Feature
0.51,postag:NNP
0.438,+1:postag:NNP
0.308,+1:word.istitle()
0.075,postag[:2]:NN
0.022,word.istitle()
0.002,+1:postag[:2]:NN

Weight?,Feature
0.881,-1:postag:NNP
0.48,-1:postag[:2]:NN
0.404,-1:word.istitle()
0.196,postag:NNP

Weight?,Feature
1.74,word[-2:]:ay
1.657,word[-3:]:day
0.204,postag[:2]:CD
0.204,postag:CD
0.096,bias
0.033,-1:postag[:2]:IN
0.033,-1:postag:IN


In [49]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.561,0.747,-2.344,0.928,-1.847,1.1,-4.541,0.523,-1.684,0.239,-1.025,0.795,-4.779,1.64,-3.017,1.39,-4.202
B-art,-0.556,0.0,5.467,0.0,0.0,-0.021,-0.085,-0.145,0.0,0.0,0.0,0.478,-0.328,-0.735,-0.394,-0.41,0.0
I-art,-0.74,0.0,5.438,0.0,0.0,0.509,-0.075,-0.079,0.0,0.0,0.0,-0.091,-0.315,-0.777,-0.246,0.19,-0.062
B-eve,-0.213,0.0,0.0,0.0,5.415,0.0,0.0,-0.12,0.0,0.0,0.0,-0.123,-0.319,-0.662,-0.397,-0.512,-0.193
I-eve,-0.239,0.0,0.0,-0.279,4.838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.013,-0.215,-0.002,-0.236,0.0
B-geo,0.31,1.357,-0.737,0.0,-0.498,-1.395,5.227,0.733,-1.392,0.0,-0.233,-0.973,-1.993,-1.367,-1.873,1.648,-1.25
I-geo,-0.01,0.0,-0.123,0.0,0.0,-0.88,4.457,-0.486,-0.606,0.0,0.0,-0.766,-0.944,-1.214,-0.579,1.036,-0.96
B-gpe,0.623,0.0,-0.581,-0.029,-0.427,-0.116,-2.327,-3.359,4.986,0.0,0.0,0.901,-1.945,0.294,-1.231,-0.431,-0.876
I-gpe,-0.21,0.0,0.0,0.0,0.0,0.24,-0.291,-0.233,4.878,0.0,0.0,-0.101,-0.352,-0.385,-0.263,-0.379,0.0
B-nat,-0.402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.694,0.0,-0.016,-0.48,-0.194,0.0,0.0


The model learned large negative weights for impossible transitions like O -> I-geo, O -> I-org and O -> I-tim, and so on.

In order to easy to read, we can check only a subset of tags.

In [50]:
eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])

From \ To,O,B-org,I-per
O,3.561,0.795,-3.017
B-org,0.086,-2.285,-2.309
I-per,-0.066,-1.147,3.898

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+4.720,BOS,
+4.236,bias,
+4.046,word.lower():jewish,
+3.494,word.lower():kurdish,
+3.435,word[-2:]:N1,
+3.031,+1:word.lower():minister,
… 1685 more positive …,… 1685 more positive …,
… 983 more negative …,… 983 more negative …,
-3.080,+1:word.lower():last,
-3.387,word.istitle(),

Weight?,Feature
+4.720,BOS
+4.236,bias
+4.046,word.lower():jewish
+3.494,word.lower():kurdish
+3.435,word[-2:]:N1
+3.031,+1:word.lower():minister
… 1685 more positive …,… 1685 more positive …
… 983 more negative …,… 983 more negative …
-3.080,+1:word.lower():last
-3.387,word.istitle()

Weight?,Feature
+3.603,word.lower():al-qaida
+3.340,word.lower():hamas
+3.270,word.lower():parliament
+3.041,-1:word.lower():telephoned
+2.991,-1:word.lower():brunei
+2.966,+1:word.lower():fought
+2.894,word[-3:]:ban
+2.754,-1:word.lower():extremist
+2.654,+1:word.lower():influence
+2.630,word.lower():westerners

Weight?,Feature
+1.788,+1:word.lower():david
+1.709,+1:word.lower():saad
+1.659,+1:word.lower():reports
+1.519,+1:word.lower():clinton
+1.491,-1:postag:NN
+1.482,word.lower():rice
+1.385,-1:word.lower():masjid
… 858 more positive …,… 858 more positive …
… 182 more negative …,… 182 more negative …
-1.365,bias


Or check only some of the features for all tags.

In [51]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
-2.186,word.isupper()
-2.72,word.isdigit()
-3.387,word.istitle()

Weight?,Feature
0.151,word.istitle()
-0.214,word.isupper()

Weight?,Feature
0.607,word.istitle()
0.597,word.isdigit()

Weight?,Feature
1.185,word.isupper()
0.391,word.isdigit()
-0.2,word.istitle()

Weight?,Feature
0.919,word.isupper()
0.069,word.istitle()

Weight?,Feature
1.264,word.istitle()
-0.046,word.isupper()
-0.728,word.isdigit()

Weight?,Feature
0.726,word.istitle()
0.534,word.isdigit()
-0.0,word.isupper()

Weight?,Feature
2.97,word.istitle()
1.333,word.isupper()

Weight?,Feature
0.21,word.istitle()
-0.167,word.isupper()

Weight?,Feature
1.622,word.isupper()
-0.252,word.istitle()

Weight?,Feature
0.003,word.istitle()

Weight?,Feature
1.978,word.isupper()
0.0,word.istitle()
-0.804,word.isdigit()

Weight?,Feature
0.366,word.istitle()
0.021,word.isupper()
-0.443,word.isdigit()

Weight?,Feature
0.146,word.istitle()
-0.098,word.isdigit()
-1.003,word.isupper()

Weight?,Feature
0.208,word.istitle()
-0.02,word.isdigit()
-0.391,word.isupper()

Weight?,Feature
2.573,word.isdigit()
-0.435,word.istitle()
-1.133,word.isupper()

Weight?,Feature
1.978,word.isdigit()
-0.286,word.isupper()
-1.304,word.istitle()
