In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Reference
# https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

# Machine selector: each supported host id maps to that machine's project root.
host = 0
# host = 4

# NOTE(review): `path` stays undefined for any host value other than 0 or 4,
# and both roots are absolute, machine-specific paths.
if host == 0:
    path = '/Users/aron/Documents/GitHub/Perfume/2_NLP'
elif host == 4:
    path = r'/home/rserver/Data_Mining/personal_workspace/yz/Lab/CkipTagger/'

In [3]:
# Working folder paths under the project root (only the path strings are
# built here; nothing is created on disk).
path_resource, path_function, path_temp, path_export = (
    path + suffix for suffix in ('/Resource', '/Function', '/Temp', '/Export')
)

In [None]:
# Read only the first 100,000 rows instead of loading the whole file and
# slicing it afterwards; the resulting frame is the same.
df = pd.read_csv(path_resource + '/ner_dataset.csv', encoding="ISO-8859-1", nrows=100000)
# NOTE(review): only the last expression of a cell is rendered, so this
# preview is not shown; move it to its own cell if it is needed.
df.head()
# Missing values per column.
df.isnull().sum()

## Data Preprocessing

In [None]:
# Forward-fill missing values down each column. `fillna(method='ffill')` is
# deprecated in recent pandas releases; `ffill()` is the equivalent call.
df = df.ffill()
# Number of distinct sentences, words and tags.
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

In [None]:
# Number of rows (tokens) per tag, as a two-column frame: Tag, counts.
df.groupby('Tag').size().reset_index(name='counts')

In [None]:
# Turn every column except the target into a dense feature matrix.
v = DictVectorizer(sparse=False)
X = v.fit_transform(df.drop(columns='Tag').to_dict('records'))
# Target labels and the sorted list of distinct labels.
y = df.Tag.values
classes = np.unique(y).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.33)
X_train.shape, y_train.shape

## Out-of-core Algorithms
### Perceptron

In [None]:
# Incremental fit on the training split; the full label list is passed up front.
per = Perceptron(max_iter=5, n_jobs=-1, verbose=10)
per.partial_fit(X_train, y_train, classes=classes)

In [None]:
# Labels to include in the reports: every class except the outside tag 'O'.
# Filtering by value avoids relying on 'O' being the last element of `classes`.
new_classes = [label for label in classes if label != 'O']
new_classes

In [None]:
# Per-label scores of the perceptron on the test split, restricted to new_classes.
print(classification_report(y_true=y_test, y_pred=per.predict(X_test), labels=new_classes))

### Linear classifiers with SGD training

In [None]:
# Incremental fit of a default SGD classifier on the training split.
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes=classes)

In [None]:
# Per-label scores of the SGD classifier on the test split, restricted to new_classes.
print(classification_report(y_true=y_test, y_pred=sgd.predict(X_test), labels=new_classes))

In [None]:
### Naive Bayes classifier for multinomial models

In [None]:
# Incremental fit of multinomial naive Bayes with smoothing alpha=0.01.
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes=classes)

In [None]:
# Per-label scores of naive Bayes on the test split, restricted to new_classes.
print(classification_report(y_true=y_test, y_pred=nb.predict(X_test), labels=new_classes))

### Passive Aggressive Classifier

In [None]:
# Incremental fit of a default passive-aggressive classifier on the training split.
pa = PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes=classes)

In [None]:
# Per-label scores of the passive-aggressive classifier on the test split, restricted to new_classes.
print(classification_report(y_true=y_test, y_pred=pa.predict(X_test), labels=new_classes))

## Conditional Random Fields (CRFs)
### sklearn-crfsuite

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    ``data`` must provide the columns 'Sentence #', 'Word', 'POS' and 'Tag'.
    Every sentence is represented as a list of ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: 1-based number of the sentence that get_next() returns next.
        data: the DataFrame passed to the constructor.
        empty: always False here; kept for backward compatibility.
        grouped: Series mapping each 'Sentence #' value to its tuple list.
        sentences: list of all tuple lists, in the order of ``grouped``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def to_triples(group):
            # One (word, pos, tag) tuple per row of the group.
            return list(zip(group['Word'].values.tolist(),
                            group['POS'].values.tolist(),
                            group['Tag'].values.tolist()))

        # Select the value columns explicitly so apply() never operates on the
        # grouping column (newer pandas warns about that).
        self.grouped = self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(to_triples)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence keyed 'Sentence: <n_sent>' and advance the counter.

        Returns None, without advancing, when no sentence has that key.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s
# Group the token table into sentences; each sentence is a list of
# (word, pos, tag) tuples.
getter = SentenceGetter(df)
sentences = getter.sentences

### Feature Extraction

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and
    its POS tag. The result holds features of the token itself plus features
    of the previous and next token; 'BOS' / 'EOS' mark a missing neighbour.
    """
    token, pos = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # (neighbour exists?, neighbour index, key prefix, boundary marker)
    neighbours = (
        (i > 0, i - 1, '-1:', 'BOS'),
        (i < len(sent) - 1, i + 1, '+1:', 'EOS'),
    )
    for present, j, prefix, marker in neighbours:
        if not present:
            feats[marker] = True
            continue
        near_word, near_pos = sent[j][0], sent[j][1]
        feats[prefix + 'word.lower()'] = near_word.lower()
        feats[prefix + 'word.istitle()'] = near_word.istitle()
        feats[prefix + 'word.isupper()'] = near_word.isupper()
        feats[prefix + 'postag'] = near_pos
        feats[prefix + 'postag[:2]'] = near_pos[:2]

    return feats

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_seq = []
    for position in range(len(sent)):
        feature_seq.append(word2features(sent, position))
    return feature_seq
def sent2labels(sent):
    """Return the label (third item) of every (token, postag, label) triple."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels
def sent2tokens(sent):
    """Return the token (first item) of every (token, postag, label) triple."""
    tokens = []
    for word, _pos, _tag in sent:
        tokens.append(word)
    return tokens


### Split train and test sets

In [None]:
# Per-sentence feature sequences and the matching label sequences.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.33)

### Train a CRF model

In [None]:
# Preview the feature sequences of the first five training sentences.
X_train[:5]

In [None]:
# y_train[0:10]

In [None]:
import pycrfsuite
# Create a quiet trainer and feed it one (features, labels) pair per sentence.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# Training hyper-parameters for the CRF.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
# Inspect the trainer's parameters.
trainer.params()

In [None]:
# Train the CRF; the model file is written into the export folder.
trainer.train(path_export + '/conll2002-esp.crfsuite')

In [None]:
# Last entry recorded by the trainer's log parser
# (presumably the final training iteration; verify against python-crfsuite).
trainer.logparser.last_iteration

In [None]:
# Number of logged iterations, followed by the last logged entry.
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

## Make predictions

To use the trained model, create a `pycrfsuite.Tagger`, open the model file, and call its `tag` method:

In [None]:
tagger = pycrfsuite.Tagger()
# Open the model from the same location trainer.train() wrote it to; a bare
# file name would be resolved against the current working directory instead.
tagger.open(path_export + '/conll2002-esp.crfsuite')

Let's tag a sentence to see how it works:

In [None]:
# X = [sent2features(s) for s in sentences]
# y = [sent2labels(s) for s in sentences]

In [None]:
# Features of the first test sentence. X_test already holds feature dicts, so
# it must not be passed through sent2features again; defining example_sent
# here also keeps the cell runnable on a fresh kernel.
example_sent = X_test[0]
example_sent

In [None]:
# Tag the first test sentence (already a feature sequence) and print the
# predicted labels on one line.
example_sent = X_test[0]
print("Predicted:", ' '.join(tagger.tag(example_sent)))

## Evaluate the model

In [None]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Parameters:
        y_true: list of label sequences (one list of tags per sentence).
        y_pred: list of predicted label sequences, aligned with ``y_true``.

    Returns:
        The text report produced by ``classification_report``.
    """
    # Imported here because the notebook's import cell provides neither name;
    # without them the function fails with NameError.
    from itertools import chain
    from sklearn.preprocessing import LabelBinarizer

    # Flatten the sentences into one token stream and one-hot encode the tags.
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Report every tag except 'O', ordered by entity type and then by B-/I- prefix.
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

Predict entity labels for all sentences in our test set (the 33% hold-out produced by `train_test_split` above):

In [None]:
%%time
# Predicted label sequence for every test sentence.
y_pred = list(map(tagger.tag, X_test))

...and check the result. Note that this report is not comparable to the results in the CoNLL-2002 papers, because here we check per-token results (not per-entity). Per-entity numbers will be worse.

In [None]:
# Preview the feature sequences of the first five test sentences.
X_test[:5]

In [None]:
# print(bio_classification_report(y_test, y_pred))

## Let's check what the classifier learned

In [None]:
from collections import Counter
# Model internals; the transition and state-feature weights are read from it below.
info = tagger.info()

def print_transitions(trans_features):
    """Print one aligned "from -> to weight" line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(row_template % (label_from, label_to, weight))

# The 15 transitions with the largest weights.
print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

# The 15 transitions with the smallest weights (tail of the descending ranking).
print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. Also note I-PER -> B-LOC transition: a positive weight means that model thinks that a person name is often followed by a location.

Check the state features:

In [None]:
def print_state_features(state_features):
    """Print one "weight label attribute" line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)``.
    """
    row_template = "%0.6f %-6s %s"
    for feature, weight in state_features:
        attr, label = feature
        print(row_template % (weight, label, attr))

# The 20 state features with the largest weights.
print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

# The 20 state features with the smallest weights (tail of the descending ranking).
print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])