In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Reference
# https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

path = r'/home/rserver/Data_Mining/personal_workspace/yz/Lab/CkipTagger/'

In [63]:
# 新增工作資料夾
path_resource = path + '/Resource'
path_function = path + '/Function'
path_temp = path + '/Temp'
path_export = path + '/Export'


In [9]:
df = pd.read_csv(path_resource + '/ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:100000]
df.head()
df.isnull().sum()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


## Data Preprocessing

In [10]:
df = df.fillna(method='ffill')
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(4544, 10922, 17)

In [11]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,75
1,B-eve,53
2,B-geo,3303
3,B-gpe,1740
4,B-nat,30
5,B-org,1876
6,B-per,1668
7,B-tim,1823
8,I-art,43
9,I-eve,47


In [12]:
X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((67000, 15507), (67000,))

## Out-of-core Algorithms
### Perceptron

In [13]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1-- Epoch 1
-- Epoch 1-- Epoch 1


-- Epoch 1-- Epoch 1

Norm: 8.43, NNZs: 57, Bias: -3.000000, T: 67000, Avg. loss: 0.000567
Total training time: 2.19 seconds.
-- Epoch 1Norm: 49.90, NNZs: 1337, Bias: -4.000000, T: 67000, Avg. loss: 0.015328
Total training time: 2.28 seconds.

Norm: 13.42, NNZs: 162, Bias: -4.000000, T: 67000, Avg. loss: 0.001642
Total training time: 2.22 seconds.
-- Epoch 1
-- Epoch 1
Norm: 11.53, NNZs: 113, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.35 seconds.
-- Epoch 1
Norm: 56.87, NNZs: 2044, Bias: -4.000000, T: 67000, Avg. loss: 0.034970
Total training time: 2.35 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   4 out of  17 | elapsed:    2.5s remaining:    8.0s


Norm: 68.07, NNZs: 2642, Bias: -4.000000, T: 67000, Avg. loss: 0.041776
Total training time: 2.54 seconds.
-- Epoch 1
Norm: 44.41, NNZs: 1127, Bias: -4.000000, T: 67000, Avg. loss: 0.017164
Total training time: 2.50 seconds.
-- Epoch 1
Norm: 48.83, NNZs: 1578, Bias: -4.000000, T: 67000, Avg. loss: 0.022328
Total training time: 2.49 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   6 out of  17 | elapsed:    2.7s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done   8 out of  17 | elapsed:    2.8s remaining:    3.1s


Norm: 10.44, NNZs: 106, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.34 seconds.
Norm: 11.45, NNZs: 96, Bias: -3.000000, T: 67000, Avg. loss: 0.000776
Total training time: 2.33 seconds.
-- Epoch 1
Norm: 6.24, NNZs: 31, Bias: -3.000000, T: 67000, Avg. loss: 0.000209
Total training time: 2.33 seconds.
Norm: 11.00, NNZs: 102, Bias: -3.000000, T: 67000, Avg. loss: 0.001209
Total training time: 2.42 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  17 | elapsed:    4.8s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    4.9s remaining:    2.0s


Norm: 35.13, NNZs: 803, Bias: -4.000000, T: 67000, Avg. loss: 0.011149
Total training time: 2.55 seconds.
Norm: 30.53, NNZs: 672, Bias: -4.000000, T: 67000, Avg. loss: 0.012030
Total training time: 2.21 seconds.
Norm: 53.57, NNZs: 1703, Bias: -4.000000, T: 67000, Avg. loss: 0.026224
Total training time: 2.39 seconds.
Norm: 60.35, NNZs: 2091, Bias: -6.000000, T: 67000, Avg. loss: 0.026940
Total training time: 2.38 seconds.


[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:    5.0s remaining:    1.1s


Norm: 73.89, NNZs: 2851, Bias: 4.000000, T: 67000, Avg. loss: 0.048866
Total training time: 2.22 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:    7.0s finished


Perceptron(max_iter=5, n_jobs=-1, verbose=10)

In [14]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [15]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       0.11      0.05      0.07        19
       B-geo       0.56      0.81      0.66      1085
       B-gpe       0.92      0.78      0.84       556
       B-nat       1.00      0.17      0.29        12
       B-org       0.39      0.52      0.44       589
       B-per       0.70      0.46      0.56       564
       B-tim       0.91      0.63      0.75       611
       I-art       0.00      0.00      0.00        12
       I-eve       0.67      0.22      0.33        18
       I-geo       0.75      0.42      0.54       230
       I-gpe       1.00      0.07      0.13        14
       I-nat       0.50      0.50      0.50         2
       I-org       0.48      0.50      0.49       445
       I-per       0.83      0.13      0.22       591
       I-tim       0.36      0.18      0.24       194

   micro avg       0.61      0.54      0.58      4966
   macro avg       0.57   

  _warn_prf(average, modifier, msg_start, len(result))


### Linear classifiers with SGD training

In [16]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier()

In [17]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       1.00      0.04      0.08        24
       B-eve       0.00      0.00      0.00        19
       B-geo       0.67      0.81      0.73      1085
       B-gpe       0.93      0.64      0.76       556
       B-nat       0.00      0.00      0.00        12
       B-org       0.61      0.37      0.46       589
       B-per       0.70      0.48      0.57       564
       B-tim       0.93      0.62      0.74       611
       I-art       0.00      0.00      0.00        12
       I-eve       0.43      0.17      0.24        18
       I-geo       0.79      0.50      0.61       230
       I-gpe       0.00      0.00      0.00        14
       I-nat       0.00      0.00      0.00         2
       I-org       0.29      0.70      0.41       445
       I-per       0.73      0.45      0.56       591
       I-tim       0.33      0.01      0.01       194

   micro avg       0.63      0.56      0.59      4966
   macro avg       0.46   

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
### Naive Bayes classifier for multinomial models

In [19]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01)

In [20]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.06      0.17      0.09        24
       B-eve       0.33      0.37      0.35        19
       B-geo       0.70      0.63      0.66      1085
       B-gpe       0.70      0.83      0.76       556
       B-nat       0.35      0.50      0.41        12
       B-org       0.41      0.44      0.43       589
       B-per       0.44      0.47      0.46       564
       B-tim       0.56      0.61      0.59       611
       I-art       0.07      0.08      0.08        12
       I-eve       0.46      0.33      0.39        18
       I-geo       0.40      0.52      0.46       230
       I-gpe       0.13      0.14      0.14        14
       I-nat       0.00      0.00      0.00         2
       I-org       0.50      0.51      0.51       445
       I-per       0.53      0.50      0.51       591
       I-tim       0.17      0.27      0.21       194

   micro avg       0.52      0.56      0.54      4966
   macro avg       0.36   

### Passive Aggressive Classifier

In [21]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

PassiveAggressiveClassifier()

In [22]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       0.50      0.05      0.10        19
       B-geo       0.69      0.82      0.75      1085
       B-gpe       0.96      0.76      0.85       556
       B-nat       0.42      0.42      0.42        12
       B-org       0.69      0.31      0.43       589
       B-per       0.65      0.53      0.58       564
       B-tim       0.82      0.68      0.74       611
       I-art       0.00      0.00      0.00        12
       I-eve       1.00      0.17      0.29        18
       I-geo       0.81      0.35      0.49       230
       I-gpe       0.71      0.36      0.48        14
       I-nat       0.25      0.50      0.33         2
       I-org       0.34      0.73      0.46       445
       I-per       0.62      0.62      0.62       591
       I-tim       0.31      0.22      0.26       194

   micro avg       0.63      0.61      0.62      4966
   macro avg       0.55   

  _warn_prf(average, modifier, msg_start, len(result))


## Conditional Random Fields (CRFs)
### sklearn-crfsuite

In [27]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [28]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

### Feature Extraction

In [30]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
        
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]


### Split train and test sets

In [31]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

### Train a CRF model

In [95]:
X_train[0:5]

[[{'bias': 1.0,
   'word.lower()': 'a',
   'word[-3:]': 'A',
   'word[-2:]': 'A',
   'word.isupper()': True,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'DT',
   'postag[:2]': 'DT',
   'BOS': True,
   '+1:word.lower()': 'second',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'JJ',
   '+1:postag[:2]': 'JJ'},
  {'bias': 1.0,
   'word.lower()': 'second',
   'word[-3:]': 'ond',
   'word[-2:]': 'nd',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'JJ',
   'postag[:2]': 'JJ',
   '-1:word.lower()': 'a',
   '-1:word.istitle()': True,
   '-1:word.isupper()': True,
   '-1:postag': 'DT',
   '-1:postag[:2]': 'DT',
   '+1:word.lower()': 'nominee',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NN',
   '+1:postag[:2]': 'NN'},
  {'bias': 1.0,
   'word.lower()': 'nominee',
   'word[-3:]': 'nee',
   'word[-2:]': 'ee',
   'word.isupper()': False,
   'word.istitle()': 

In [49]:
# y_train[0:10]

In [96]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [97]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [98]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [99]:
trainer.train(path_export + '/conll2002-esp.crfsuite')

In [100]:
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 6485.580681,
 'feature_norm': 52.650711,
 'error_norm': 108.470127,
 'active_features': 3938,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.21}

In [101]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

50 {'num': 50, 'scores': {}, 'loss': 6485.580681, 'feature_norm': 52.650711, 'error_norm': 108.470127, 'active_features': 3938, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.21}


## Make predictions

To use the trained model, create pycrfsuite.Tagger, open the model and use "tag" method:

In [65]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x7faf00902550>

Let's tag a sentence to see how it works:

In [None]:
# X = [sent2features(s) for s in sentences]
# y = [sent2labels(s) for s in sentences]

In [84]:
sent2features(example_sent)

AttributeError: 'dict' object has no attribute 'lower'

In [117]:
example_sent = X_test[0]
print("Predicted:", ' '.join(tagger.tag(example_sent)))

Predicted: O O O O O O O O B-tim O O O O O O O O O O O O O O O O B-tim O


## Evaluate the model

In [107]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

Predict entity labels for all sentences in our testing set ('testb' Spanish data):

In [108]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 627 ms, sys: 3.9 ms, total: 631 ms
Wall time: 630 ms


..and check the result. Note this report is not comparable to results in CONLL2002 papers because here we check per-token results (not per-entity). Per-entity numbers will be worse.  

In [112]:
X_test[:5]

[[{'bias': 1.0,
   'word.lower()': 'following',
   'word[-3:]': 'ing',
   'word[-2:]': 'ng',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'VBG',
   'postag[:2]': 'VB',
   'BOS': True,
   '+1:word.lower()': 'a',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'DT',
   '+1:postag[:2]': 'DT'},
  {'bias': 1.0,
   'word.lower()': 'a',
   'word[-3:]': 'a',
   'word[-2:]': 'a',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'DT',
   'postag[:2]': 'DT',
   '-1:word.lower()': 'following',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:postag': 'VBG',
   '-1:postag[:2]': 'VB',
   '+1:word.lower()': 'disastrous',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'JJ',
   '+1:postag[:2]': 'JJ'},
  {'bias': 1.0,
   'word.lower()': 'disastrous',
   'word[-3:]': 'ous',
   'word[-2:]': 'us',
   'word.isupper()': False,
   'w

In [110]:
# print(bio_classification_report(y_test, y_pred))

NameError: name 'LabelBinarizer' is not defined

## Let's check what classifier learned

In [18]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
B-ORG  -> I-ORG   8.631963
I-ORG  -> I-ORG   7.833706
B-PER  -> I-PER   6.998706
B-LOC  -> I-LOC   6.913675
I-MISC -> I-MISC  6.129735
B-MISC -> I-MISC  5.538291
I-LOC  -> I-LOC   4.983567
I-PER  -> I-PER   3.748358
B-ORG  -> B-LOC   1.727090
B-PER  -> B-LOC   1.388267
B-LOC  -> B-LOC   1.240278
O      -> O       1.197929
O      -> B-ORG   1.097062
I-PER  -> B-LOC   1.083332
O      -> B-MISC  1.046113

Top unlikely transitions:
I-PER  -> B-ORG   -2.056130
I-LOC  -> I-ORG   -2.143940
B-ORG  -> I-MISC  -2.167501
I-PER  -> I-ORG   -2.369380
B-ORG  -> I-PER   -2.378110
I-MISC -> I-PER   -2.458788
B-LOC  -> I-PER   -2.516414
I-ORG  -> I-MISC  -2.571973
I-LOC  -> B-PER   -2.697791
I-LOC  -> I-PER   -3.065950
I-ORG  -> I-PER   -3.364434
O      -> I-PER   -7.322841
O      -> I-MISC  -7.648246
O      -> I-ORG   -8.024126
O      -> I-LOC   -8.333815


We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. Also note I-PER -> B-LOC transition: a positive weight means that model thinks that a person name is often followed by a location.

Check the state features:

In [19]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
8.886516 B-ORG  word.lower=efe-cantabria
8.743642 B-ORG  word.lower=psoe-progresistas
5.769032 B-LOC  -1:word.lower=cantabria
5.195429 I-LOC  -1:word.lower=calle
5.116821 O      word.lower=mayo
4.990871 O      -1:word.lower=día
4.910915 I-ORG  -1:word.lower=l
4.721572 B-MISC word.lower=diversia
4.676259 B-ORG  word.lower=telefónica
4.334354 B-ORG  word[-2:]=-e
4.149862 B-ORG  word.lower=amena
4.141370 B-ORG  word.lower=terra
3.942852 O      word.istitle=False
3.926397 B-ORG  word.lower=continente
3.924672 B-ORG  word.lower=acesa
3.888706 O      word.lower=euro
3.856445 B-PER  -1:word.lower=según
3.812373 B-MISC word.lower=exteriores
3.807582 I-MISC -1:word.lower=1.9
3.807098 B-MISC word.lower=sanidad

Top negative:
-1.965379 O      word.lower=fundación
-1.981541 O      -1:word.lower=británica
-2.118347 O      word.lower=061
-2.190653 B-PER  word[-3:]=nes
-2.226373 B-ORG  postag=SP
-2.226373 B-ORG  postag[:2]=SP
-2.260972 O      word[-3:]=uia
-2.384920 O      -1:word.lower