In [1]:
###### https://www.kaggle.com/c/text-normalization-challenge-english-language


import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,sentence_id,token_id,class,before,after,poc
0,0,0,PLAIN,Brillantaisia,Brillantaisia,NN
1,0,1,PLAIN,is,is,VBZ
2,0,2,PLAIN,a,a,DT
3,0,3,PLAIN,genus,genus,NN
4,0,4,PLAIN,of,of,IN


In [3]:
df.isnull().sum()

sentence_id     0
token_id        0
class           0
before         74
after          69
poc             0
dtype: int64

In [4]:
df = df.fillna(method='ffill')

In [5]:
df.isnull().sum()

sentence_id    0
token_id       0
class          0
before         0
after          0
poc            0
dtype: int64

In [6]:
df['sentence_id'].nunique(), df.before.nunique(), df['class'].nunique()

(748066, 486442, 16)

In [7]:
df.groupby('class').size().reset_index(name='counts')

Unnamed: 0,class,counts
0,ADDRESS,522
1,CARDINAL,133744
2,DATE,258348
3,DECIMAL,9821
4,DIGIT,5442
5,ELECTRONIC,5162
6,FRACTION,1196
7,LETTERS,152795
8,MEASURE,14783
9,MONEY,6128


In [8]:
X = df.drop(['sentence_id','class','after','token_id'], axis=1)
X.head()


Unnamed: 0,before,poc
0,Brillantaisia,NN
1,is,VBZ
2,a,DT
3,genus,NN
4,of,IN


In [9]:
X.columns

Index(['before', 'poc'], dtype='object')

In [10]:
v = DictVectorizer()

In [11]:
X = v.fit_transform(X.to_dict('records'))


In [12]:
X.shape

(9918441, 486485)

In [13]:
y = df['class'].values

In [14]:
classes = np.unique(y)

In [15]:
classes = classes.tolist()
classes

['ADDRESS',
 'CARDINAL',
 'DATE',
 'DECIMAL',
 'DIGIT',
 'ELECTRONIC',
 'FRACTION',
 'LETTERS',
 'MEASURE',
 'MONEY',
 'ORDINAL',
 'PLAIN',
 'PUNCT',
 'TELEPHONE',
 'TIME',
 'VERBATIM']

In [16]:
X.shape, y.shape

((9918441, 486485), (9918441,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [18]:
X_train.shape, y_train.shape

((6645355, 486485), (6645355,))

In [19]:
X_test.shape, y_test.shape

((3273086, 486485), (3273086,))

In [20]:

new_classes = classes.copy()
new_classes.pop()
new_classes

['ADDRESS',
 'CARDINAL',
 'DATE',
 'DECIMAL',
 'DIGIT',
 'ELECTRONIC',
 'FRACTION',
 'LETTERS',
 'MEASURE',
 'MONEY',
 'ORDINAL',
 'PLAIN',
 'PUNCT',
 'TELEPHONE',
 'TIME']

In [None]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

In [None]:

print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

In [None]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

In [None]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

In [None]:

nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

In [None]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

In [21]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [22]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['before'].values.tolist(), 
                                                           s['poc'].values.tolist(), 
                                                           s['class'].values.tolist())]
        self.grouped = self.data.groupby('sentence_id').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [23]:
getter = SentenceGetter(df)


In [24]:
sent = getter.get_next()
print(sent)

None


In [25]:
sentences = getter.sentences


In [26]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [27]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)


In [29]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [30]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.99666269705738553

In [31]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

             precision    recall  f1-score   support

    ADDRESS       0.97      0.42      0.59       154
   CARDINAL       0.97      0.97      0.97     44148
       DATE       0.98      0.99      0.99     85286
    DECIMAL       0.94      0.92      0.93      3280
      DIGIT       0.77      0.61      0.68      1719
 ELECTRONIC       0.96      0.78      0.86      1623
   FRACTION       0.88      0.68      0.77       410
    LETTERS       0.96      0.93      0.94     50714
    MEASURE       1.00      0.99      0.99      4902
      MONEY       0.99      0.91      0.95      2114
    ORDINAL       0.99      0.97      0.98      4220
      PLAIN       1.00      1.00      1.00   2426119
      PUNCT       1.00      1.00      1.00    620326
  TELEPHONE       0.94      0.84      0.88      1377
       TIME       0.96      0.88      0.92       465

avg / total       1.00      1.00      1.00   3246857



In [None]:
X_train[0]

In [None]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=50,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=25,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

In [32]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
TIME   -> DECIMAL 2.537920
MEASURE -> MEASURE 2.089429
DECIMAL -> DECIMAL 1.992695
DIGIT  -> LETTERS 1.890921
DATE   -> ELECTRONIC 1.837079
PLAIN  -> PLAIN   1.689222
VERBATIM -> VERBATIM 1.644348
ELECTRONIC -> DATE    1.492422
TIME   -> CARDINAL 1.414490
DATE   -> TIME    1.414211
ELECTRONIC -> ELECTRONIC 1.370165
ADDRESS -> CARDINAL 1.323073
DIGIT  -> DECIMAL 1.281697
CARDINAL -> CARDINAL 1.267240
LETTERS -> DIGIT   1.138752
DATE   -> DIGIT   1.127022
FRACTION -> DATE    1.093341
FRACTION -> CARDINAL 0.965752
PLAIN  -> DATE    0.910143
PLAIN  -> MONEY   0.900885

Top unlikely transitions:
ORDINAL -> LETTERS -1.181543
MONEY  -> LETTERS -1.205115
CARDINAL -> TIME    -1.260073
VERBATIM -> ELECTRONIC -1.292514
VERBATIM -> LETTERS -1.343782
ORDINAL -> ELECTRONIC -1.345779
VERBATIM -> ADDRESS -1.348680
ADDRESS -> ELECTRONIC -1.393938
LETTERS -> DECIMAL -1.439153
ADDRESS -> TELEPHONE -1.585387
VERBATIM -> CARDINAL -1.587668
ORDINAL -> CARDINAL -1.608915
ADDRESS -> DI

In [34]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
12.905875 DATE     word[-2:]:0s
10.714175 LETTERS  word.lower():eds
9.847006 PLAIN    word.lower():one
9.245423 PLAIN    word.lower():three
8.599327 DIGIT    word.lower():000
8.077553 LETTERS  word.lower():als
8.070118 LETTERS  word.lower():mes
8.013081 PLAIN    word.lower():million
7.829573 PLAIN    word.lower():two
7.786055 PLAIN    word.lower():four
7.640103 ORDINAL  word[-3:]:0th
7.618228 LETTERS  word.lower():métis
7.512676 MEASURE  word[-2:]:0%
7.507092 PLAIN    word.lower():mid
7.403130 DECIMAL  word[-2:]:.9
7.357601 ORDINAL  word[-3:]:7th
7.292916 TIME     word[-3:]::00
7.216838 DATE     word[-3:]:0's
7.206331 LETTERS  word.lower():iasi
7.146557 LETTERS  word.lower():ses
7.120744 ELECTRONIC word[-3:]:.uk
7.099762 MEASURE  word[-2:]:5%
7.093452 DIGIT    -1:word.lower():mid
7.076363 LETTERS  word.lower():tra
7.062820 MEASURE  word[-2:]:7%
7.000630 MEASURE  word[-2:]:9%
6.990388 ORDINAL  word[-3:]:5th
6.954561 MEASURE  word[-2:]:8%
6.946697 MEASURE  word[-2:]: m
6.92

In [40]:
import eli5

eli5.show_weights(crf, top=50)

From \ To,ADDRESS,CARDINAL,DATE,DECIMAL,DIGIT,ELECTRONIC,FRACTION,LETTERS,MEASURE,MONEY,ORDINAL,PLAIN,PUNCT,TELEPHONE,TIME,VERBATIM
ADDRESS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.289,-0.128,0.0,0.0,0.0
CARDINAL,0.0,0.129,-0.118,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.0,0.229,0.194,0.0,0.0,0.168
DATE,0.0,-0.061,-0.237,0.0,0.0,0.0,0.0,-0.127,0.0,0.0,0.0,0.199,0.282,0.0,0.0,-0.111
DECIMAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.204,0.0,0.0,0.0,0.0
DIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.723,0.0,0.0,0.0,-0.232,-0.026,0.0,0.0,0.0
ELECTRONIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.082,0.306,0.0,0.0,0.0
FRACTION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.256,-0.039,0.0,0.0,0.0
LETTERS,0.0,0.174,-0.265,-0.003,0.711,0.0,0.0,-0.704,-0.016,0.0,-0.0,0.5,-0.093,0.731,0.0,-1.05
MEASURE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.268,0.0,0.0,-0.002
MONEY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013,0.115,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15
+0.049,word.isupper(),,,,,,,,,,,,,,
-0.005,word.isdigit(),,,,,,,,,,,,,,
-0.005,-1:postag[:2]:IN,,,,,,,,,,,,,,
-0.005,-1:postag:IN,,,,,,,,,,,,,,
-0.012,postag:NN,,,,,,,,,,,,,,
-0.013,BOS,,,,,,,,,,,,,,
-0.015,+1:word.lower():.,,,,,,,,,,,,,,
-0.016,word.istitle(),,,,,,,,,,,,,,
-0.016,+1:postag[:2]:.,,,,,,,,,,,,,,
-0.016,+1:postag:.,,,,,,,,,,,,,,

Weight?,Feature
0.049,word.isupper()
-0.005,word.isdigit()
-0.005,-1:postag[:2]:IN
-0.005,-1:postag:IN
-0.012,postag:NN
-0.013,BOS
-0.015,+1:word.lower():.
-0.016,word.istitle()
-0.016,+1:postag[:2]:.
-0.016,+1:postag:.

Weight?,Feature
+2.173,word.isdigit()
+1.509,postag[:2]:CD
+1.509,postag:CD
+1.024,word[-2:]:00
+0.951,word[-2:]:II
+0.854,word.isupper()
+0.819,word.lower():1
+0.819,word[-3:]:1
+0.819,word[-2:]:1
+0.785,+1:postag:NNS

Weight?,Feature
+2.121,word.istitle()
+1.970,-1:word.lower():on
+1.920,word.isdigit()
+1.364,postag[:2]:CD
+1.364,postag:CD
+1.326,-1:word.lower():retrieved
+1.317,postag:NNP
+1.168,word[-3:]:010
+1.118,word[-3:]:011
+1.084,word[-2:]:09

Weight?,Feature
+1.124,postag[:2]:CD
+1.124,postag:CD
+0.572,-1:postag:VBD
+0.409,+1:postag:NNS
+0.397,-1:word.lower():was
+0.262,-1:postag[:2]:VB
+0.184,-1:word.lower():p
+0.156,word[-2:]:on
+0.155,word[-3:]:ion
+0.143,word[-2:]:.5

Weight?,Feature
1.188,word.isdigit()
0.868,postag[:2]:CD
0.868,postag:CD
0.803,-1:word.isupper()
0.718,+1:word.isupper()
0.614,-1:postag[:2]:NN
0.397,-1:word.istitle()
0.33,-1:postag:NN
0.306,+1:postag:NN
0.223,+1:word.istitle()

Weight?,Feature
0.737,postag:NN
0.684,postag[:2]:NN
0.683,word[-3:]:com
0.676,word[-2:]:om
0.095,BOS
0.081,-1:postag:IN
0.081,-1:postag[:2]:IN
0.066,"+1:word.lower():"""
0.066,+1:postag:``
0.066,+1:postag[:2]:``

Weight?,Feature
0.065,postag[:2]:CD
0.065,postag:CD
-0.0,+1:postag[:2]:CD
-0.0,+1:postag:CD
-0.005,BOS
-0.008,+1:word.lower():.
-0.008,+1:postag[:2]:.
-0.008,+1:postag:.
-0.011,word.isdigit()
-0.015,+1:word.istitle()

Weight?,Feature
+3.618,word.isupper()
+1.193,postag[:2]:NN
+0.899,postag:NNP
+0.770,word[-2:]:A.
+0.629,word[-2:]:S.
+0.542,+1:postag[:2]:CD
+0.542,+1:postag:CD
+0.529,word.lower():a.
+0.529,word[-3:]:A.
+0.509,postag:NN

Weight?,Feature
+1.640,postag[:2]:CD
+1.640,postag:CD
+0.492,+1:word.lower():of
+0.416,-1:word.lower():(
+0.416,-1:postag:(
+0.416,-1:postag[:2]:(
+0.403,+1:postag[:2]:)
+0.403,+1:postag:)
+0.403,+1:word.lower():)
+0.356,word[-2:]:km

Weight?,Feature
1.467,postag:$
1.467,postag[:2]:$
0.427,-1:postag[:2]:VB
0.392,word[-3:]:ion
0.389,word[-2:]:00
0.318,-1:postag:IN
0.318,-1:postag[:2]:IN
0.315,word[-2:]:on
0.279,word[-3:]:000
0.272,bias

Weight?,Feature
+2.367,word[-2:]:th
+0.585,postag[:2]:CD
+0.585,postag:CD
+0.505,+1:word.lower():century
+0.490,-1:word.lower():the
+0.487,postag:NNS
+0.474,+1:postag:NN
+0.414,+1:postag[:2]:NN
+0.339,-1:postag[:2]:DT
+0.339,-1:postag:DT

Weight?,Feature
+4.499,postag[:2]:VB
+2.946,postag:IN
+2.946,postag[:2]:IN
+2.418,word.lower():and
+2.186,word[-2:]:ed
+2.143,word[-2:]:ne
+2.053,postag[:2]:RB
+2.022,word[-2:]:er
+2.002,word.istitle()
+1.973,bias

Weight?,Feature
+2.822,word.lower():—
+2.822,word[-3:]:—
+2.822,word[-2:]:—
+2.728,postag:.
+2.728,postag[:2]:.
+2.317,word[-2:]:/
+2.317,word.lower():/
+2.317,word[-3:]:/
+1.815,word.lower():)
+1.815,word[-3:]:)

Weight?,Feature
0.831,-1:word.lower():isbn
0.813,-1:word.isupper()
0.383,-1:postag:NN
0.309,-1:postag[:2]:NN
0.197,postag[:2]:JJ
0.197,postag:JJ
0.186,+1:word.istitle()
0.176,postag:CD
0.176,postag[:2]:CD
0.084,+1:postag:NN

Weight?,Feature
0.186,postag:CD
0.186,postag[:2]:CD
0.127,-1:postag:IN
0.127,-1:postag[:2]:IN
0.049,-1:word.lower():at
-0.002,-1:postag[:2]:CD
-0.002,-1:postag:CD
-0.011,BOS
-0.016,word.isdigit()
-0.023,+1:word.istitle()

Weight?,Feature
+2.485,word[-3:]:&
+2.485,word[-2:]:&
+2.485,word.lower():&
+1.124,word[-2:]:pp
+1.119,word.lower():pp
+1.117,word[-3:]:pp
+1.060,postag:CC
+1.060,postag[:2]:CC
+0.841,word[-2:]:-
+0.841,word[-3:]:-


In [38]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

From \ To,ADDRESS,CARDINAL,DATE,DECIMAL,DIGIT,ELECTRONIC,FRACTION,LETTERS,MEASURE,MONEY,ORDINAL,PLAIN,PUNCT,TELEPHONE,TIME,VERBATIM
ADDRESS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.289,-0.128,0.0,0.0,0.0
CARDINAL,0.0,0.129,-0.118,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.0,0.229,0.194,0.0,0.0,0.168
DATE,0.0,-0.061,-0.237,0.0,0.0,0.0,0.0,-0.127,0.0,0.0,0.0,0.199,0.282,0.0,0.0,-0.111
DECIMAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.204,0.0,0.0,0.0,0.0
DIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.723,0.0,0.0,0.0,-0.232,-0.026,0.0,0.0,0.0
ELECTRONIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.082,0.306,0.0,0.0,0.0
FRACTION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.256,-0.039,0.0,0.0,0.0
LETTERS,0.0,0.174,-0.265,-0.003,0.711,0.0,0.0,-0.704,-0.016,0.0,-0.0,0.5,-0.093,0.731,0.0,-1.05
MEASURE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.268,0.0,0.0,-0.002
MONEY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013,0.115,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15
+0.049,word.isupper(),,,,,,,,,,,,,,
… 10 more negative …,… 10 more negative …,,,,,,,,,,,,,,
-0.038,+1:postag:NN,,,,,,,,,,,,,,
-0.044,-1:postag:NN,,,,,,,,,,,,,,
-0.046,-1:word.istitle(),,,,,,,,,,,,,,
-0.056,postag:CD,,,,,,,,,,,,,,
-0.056,postag[:2]:CD,,,,,,,,,,,,,,
-0.135,+1:postag[:2]:NN,,,,,,,,,,,,,,
-0.145,-1:postag[:2]:NN,,,,,,,,,,,,,,
-0.159,postag[:2]:NN,,,,,,,,,,,,,,

Weight?,Feature
+0.049,word.isupper()
… 10 more negative …,… 10 more negative …
-0.038,+1:postag:NN
-0.044,-1:postag:NN
-0.046,-1:word.istitle()
-0.056,postag:CD
-0.056,postag[:2]:CD
-0.135,+1:postag[:2]:NN
-0.145,-1:postag[:2]:NN
-0.159,postag[:2]:NN

Weight?,Feature
+2.173,word.isdigit()
+1.509,postag:CD
+1.509,postag[:2]:CD
+1.024,word[-2:]:00
+0.951,word[-2:]:II
+0.854,word.isupper()
+0.819,word[-2:]:1
+0.819,word.lower():1
+0.819,word[-3:]:1
+0.785,+1:postag:NNS

Weight?,Feature
+2.121,word.istitle()
+1.970,-1:word.lower():on
+1.920,word.isdigit()
+1.364,postag[:2]:CD
+1.364,postag:CD
+1.326,-1:word.lower():retrieved
+1.317,postag:NNP
… 231 more positive …,… 231 more positive …
… 80 more negative …,… 80 more negative …
-1.341,word.isupper()

Weight?,Feature
+1.124,postag:CD
+1.124,postag[:2]:CD
+0.572,-1:postag:VBD
+0.409,+1:postag:NNS
+0.397,-1:word.lower():was
+0.262,-1:postag[:2]:VB
+0.184,-1:word.lower():p
… 30 more positive …,… 30 more positive …
… 25 more negative …,… 25 more negative …
-0.199,bias

Weight?,Feature
+1.188,word.isdigit()
+0.868,postag:CD
+0.868,postag[:2]:CD
+0.803,-1:word.isupper()
+0.718,+1:word.isupper()
+0.614,-1:postag[:2]:NN
+0.397,-1:word.istitle()
+0.330,-1:postag:NN
… 11 more positive …,… 11 more positive …
… 13 more negative …,… 13 more negative …

Weight?,Feature
+0.737,postag:NN
+0.684,postag[:2]:NN
+0.683,word[-3:]:com
+0.676,word[-2:]:om
+0.095,BOS
+0.081,-1:postag:IN
+0.081,-1:postag[:2]:IN
… 19 more positive …,… 19 more positive …
… 6 more negative …,… 6 more negative …
-0.134,postag:CD

Weight?,Feature
+0.065,postag:CD
+0.065,postag[:2]:CD
… 9 more negative …,… 9 more negative …
-0.025,-1:postag:NN
-0.033,-1:word.istitle()
-0.056,+1:postag[:2]:NN
-0.065,-1:postag[:2]:NN
-0.117,postag:NN
-0.126,word.istitle()
-0.551,postag[:2]:NN

Weight?,Feature
+3.618,word.isupper()
+1.193,postag[:2]:NN
+0.899,postag:NNP
+0.770,word[-2:]:A.
+0.629,word[-2:]:S.
+0.542,+1:postag:CD
… 120 more positive …,… 120 more positive …
… 110 more negative …,… 110 more negative …
-0.555,word[-2:]:II
-0.753,word.isdigit()

Weight?,Feature
+1.640,postag[:2]:CD
+1.640,postag:CD
+0.492,+1:word.lower():of
+0.416,-1:word.lower():(
+0.416,-1:postag[:2]:(
+0.416,-1:postag:(
… 45 more positive …,… 45 more positive …
… 33 more negative …,… 33 more negative …
-0.509,postag:NN
-0.526,word.istitle()

Weight?,Feature
+1.467,postag[:2]:$
+1.467,postag:$
+0.427,-1:postag[:2]:VB
+0.392,word[-3:]:ion
+0.389,word[-2:]:00
+0.318,-1:postag[:2]:IN
+0.318,-1:postag:IN
+0.315,word[-2:]:on
+0.279,word[-3:]:000
+0.272,bias

Weight?,Feature
+2.367,word[-2:]:th
+0.585,postag:CD
+0.585,postag[:2]:CD
+0.505,+1:word.lower():century
+0.490,-1:word.lower():the
+0.487,postag:NNS
+0.474,+1:postag:NN
+0.414,+1:postag[:2]:NN
+0.339,-1:postag:DT
… 43 more positive …,… 43 more positive …

Weight?,Feature
+4.499,postag[:2]:VB
+2.946,postag[:2]:IN
+2.946,postag:IN
+2.418,word.lower():and
+2.186,word[-2:]:ed
+2.143,word[-2:]:ne
+2.053,postag[:2]:RB
+2.022,word[-2:]:er
+2.002,word.istitle()
… 493 more positive …,… 493 more positive …

Weight?,Feature
+2.822,word[-3:]:—
+2.822,word.lower():—
+2.822,word[-2:]:—
+2.728,postag[:2]:.
+2.728,postag:.
+2.317,word[-2:]:/
+2.317,word.lower():/
+2.317,word[-3:]:/
… 79 more positive …,… 79 more positive …
… 46 more negative …,… 46 more negative …

Weight?,Feature
+0.831,-1:word.lower():isbn
+0.813,-1:word.isupper()
+0.383,-1:postag:NN
+0.309,-1:postag[:2]:NN
+0.197,postag[:2]:JJ
+0.197,postag:JJ
+0.186,+1:word.istitle()
+0.176,postag:CD
+0.176,postag[:2]:CD
… 8 more positive …,… 8 more positive …

Weight?,Feature
+0.186,postag:CD
+0.186,postag[:2]:CD
+0.127,-1:postag[:2]:IN
+0.127,-1:postag:IN
… 1 more positive …,… 1 more positive …
… 8 more negative …,… 8 more negative …
-0.126,word.istitle()
-0.134,+1:postag[:2]:NN
-0.134,postag:NN
-0.147,-1:postag[:2]:NN

Weight?,Feature
+2.485,word[-3:]:&
+2.485,word.lower():&
+2.485,word[-2:]:&
+1.124,word[-2:]:pp
+1.119,word.lower():pp
+1.117,word[-3:]:pp
+1.060,postag[:2]:CC
+1.060,postag:CC
… 104 more positive …,… 104 more positive …
… 60 more negative …,… 60 more negative …


In [41]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=True, show=['targets'])

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15
+0.049,word.isupper(),,,,,,,,,,,,,,
-0.005,word.isdigit(),,,,,,,,,,,,,,
-0.016,word.istitle(),,,,,,,,,,,,,,
+2.173,word.isdigit(),,,,,,,,,,,,,,
+0.854,word.isupper(),,,,,,,,,,,,,,
-0.672,word.istitle(),,,,,,,,,,,,,,
+2.121,word.istitle(),,,,,,,,,,,,,,
+1.920,word.isdigit(),,,,,,,,,,,,,,
-1.341,word.isupper(),,,,,,,,,,,,,,
-0.037,word.isupper(),,,,,,,,,,,,,,

Weight?,Feature
0.049,word.isupper()
-0.005,word.isdigit()
-0.016,word.istitle()

Weight?,Feature
2.173,word.isdigit()
0.854,word.isupper()
-0.672,word.istitle()

Weight?,Feature
2.121,word.istitle()
1.92,word.isdigit()
-1.341,word.isupper()

Weight?,Feature
-0.037,word.isupper()
-0.303,word.isdigit()
-0.453,word.istitle()

Weight?,Feature
1.188,word.isdigit()
-0.008,word.isupper()
-0.177,word.istitle()

Weight?,Feature
-0.021,word.isupper()
-0.038,word.isdigit()
-0.22,word.istitle()

Weight?,Feature
-0.011,word.isdigit()
-0.126,word.istitle()

Weight?,Feature
3.618,word.isupper()
0.295,word.istitle()
-0.753,word.isdigit()

Weight?,Feature
-0.013,word.isupper()
-0.526,word.istitle()
-0.865,word.isdigit()

Weight?,Feature
-0.052,word.isdigit()
-0.076,word.istitle()

Weight?,Feature
0.206,word.isupper()
-0.241,word.istitle()
-0.426,word.isdigit()

Weight?,Feature
2.002,word.istitle()
-1.999,word.isupper()
-3.725,word.isdigit()

Weight?,Feature
-0.943,word.isupper()
-2.417,word.isdigit()
-2.511,word.istitle()

Weight?,Feature
-0.161,word.istitle()

Weight?,Feature
-0.016,word.isdigit()
-0.126,word.istitle()

Weight?,Feature
-0.402,word.isdigit()
-0.408,word.istitle()
-1.051,word.isupper()


In [43]:
len(crf.state_features_)

2187