In [1]:
# imports

%matplotlib inline
import matplotlib.pyplot as plt

import nltk
import spacy
import sklearn
import scipy.stats
import pandas as pd

from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
# from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# matplot
plt.style.use('ggplot')

In [15]:
# file read

def conll_read(path: str):
    words = []
    bio_tags = []
    i = 0
    with open(path) as f:
        for line in f:
            i += 1
            splitted = line.strip().split('\t')
            if len(splitted) == 1:
                if splitted[0] == '':
                    words.append('\n')
                    bio_tags.append('\n')
                else:
                    # special case for last line of dev set
                    # maybe we just skip it altogether
                    words.append(splitted[0])
                    bio_tags.append('O')
            else:
                words.append(splitted[0])
                bio_tags.append(splitted[1])
    return words, bio_tags

In [3]:
# calls nltk 
def add_pos_tags(tokens: list):
    tagged_train = nltk.pos_tag(tokens)
    return zip(*tagged_train)

In [4]:
# create the sentences with their features
def create_sents(seqs):
    seperators = [i for i, item in enumerate(seqs) if item[0] == '\n']
    sents = []
    for idx, pos in enumerate(seperators):
        start = seperators[idx - 1] + 1
        end = seperators[idx]
        
        if idx == 0:
            start = 0
            end = pos
    
        sequence = seqs[start: end]
        sents.append(sequence)
    return sents

In [5]:
# train set
words, bio_tags = conll_read('W-NUT_data/wnut17train.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
train_sequences = create_sents(complete)

In [6]:
# dev set
words, bio_tags = conll_read('W-NUT_data/emerging.dev.conll')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
dev_sequences = create_sents(complete)

In [7]:
# test set
words, bio_tags = conll_read('W-NUT_data/emerging.test.annotated')
words, pos_tags = add_pos_tags(words)
complete = list(zip(words, pos_tags, bio_tags))
test_sequences = create_sents(complete)

In [8]:
# slightly modified functions from the tutorial

def preword2feat(sent, i, extended):
    features = word2features(sent, i)
    if extended:
        features = extended_features(features=features, sent=sent, i=i)
    return features

def extended_features(features, sent, i):
    features = word2features(sent, i)

    word = sent[i][0]
    postag = sent[i][1]

    if i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    return features

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent, extended):
    return [preword2feat(sent, i, extended=extended) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# Initial Features

In [13]:
# create X and y for train, dev, test

X_train = [sent2features(s, False) for s in train_sequences]
y_train = [sent2labels(s) for s in train_sequences]

X_dev = [sent2features(s, False) for s in dev_sequences]
y_dev = [sent2labels(s) for s in dev_sequences]

X_test = [sent2features(s, False) for s in test_sequences]
y_test = [sent2labels(s) for s in test_sequences]

## Train

In [10]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [11]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-location',
 'I-location',
 'B-group',
 'B-corporation',
 'B-person',
 'B-creative-work',
 'B-product',
 'I-person',
 'I-creative-work',
 'I-corporation',
 'I-group',
 'I-product']

In [12]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.13829694021844366

## Hyper-parameter Optimization

In [14]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(X_dev, y_dev)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   25.1s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_...
                                        'c2': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x29bc3d900>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(flat_f1_score, average=wei

In [19]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best_c1 = rs.best_params_["c1"]
best_c2 = rs.best_params_["c1"]

best params: {'c1': 0.010935853729272638, 'c2': 0.046412902217484155}
best CV score: 0.3511095623206673
model size: 0.60M


In [18]:
y_pred = rs.best_estimator_.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.19469972772618196

# Extended Features

In [23]:
X_train_ext = [sent2features(s, True) for s in train_sequences]
y_train_ext = [sent2labels(s) for s in train_sequences]

X_dev_ext = [sent2features(s, True) for s in dev_sequences]
y_dev_ext = [sent2labels(s) for s in dev_sequences]

X_test_ext = [sent2features(s, True) for s in test_sequences]
y_test_ext = [sent2labels(s) for s in test_sequences]

## Train

In [24]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=best_c1,
    c2=best_c2,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_dev_ext, y_dev_ext)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.010935853729272638, c2=0.010935853729272638,
    calibration_candidates=None, calibration_eta=None,
    calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [25]:
y_pred = crf.predict(X_test_ext)
metrics.flat_f1_score(y_test_ext, y_pred,
                      average='weighted', labels=labels)

0.19148315998168136