In [9]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [10]:
def read_data(file_path):
    tokens = []
    tags = []
    
    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else:
            token, tag = line.split()
            # Replace all urls with <URL> token
            # Replace all users with <USR> token

            ######################################
            ######### YOUR CODE HERE #############
            ######################################
            if token.startswith("http://") or token.startswith("https://"): token = "<URL>"
            elif token.startswith("@"): token = "<USR>"
            tweet_tokens.append(token)
            tweet_tags.append(tag)
            
    return tokens, tags


In [2]:
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')

In [19]:
def build_sentence(tokens, tags):
    pos_tags = [item[-1] for item in nltk.pos_tag(tokens)]
    return list(zip(tokens, pos_tags, tags))

def build_sentences(tokens_set, tags_set):
    return [build_sentence(tokens, tags) for tokens, tags in zip(tokens_set, tags_set)]

train_sents = build_sentences(train_tokens, train_tags)
validation_sents = build_sentences(validation_tokens, validation_tags)
test_sents = build_sentences(test_tokens, test_tags)

In [21]:

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [22]:
X_train[0]

[{'+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': False,
  '+1:word.isupper()': True,
  '+1:word.lower()': '<usr>',
  'BOS': True,
  'bias': 1.0,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': True,
  'word.lower()': 'rt',
  'word[-2:]': 'RT',
  'word[-3:]': 'RT'},
 {'+1:postag': ':',
  '+1:postag[:2]': ':',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': ':',
  '-1:postag': 'NNP',
  '-1:postag[:2]': 'NN',
  '-1:word.istitle()': False,
  '-1:word.isupper()': True,
  '-1:word.lower()': 'rt',
  'bias': 1.0,
  'postag': 'NN',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': True,
  'word.lower()': '<usr>',
  'word[-2:]': 'R>',
  'word[-3:]': 'SR>'},
 {'+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'online',
  '-1:postag': 'NN',
  '-1:postag[:2]': 'N

In [12]:
train_sents[0]

[('De', 'Art', 'O'),
 ('tekst', 'N', 'O'),
 ('van', 'Prep', 'O'),
 ('het', 'Art', 'O'),
 ('arrest', 'N', 'O'),
 ('is', 'V', 'O'),
 ('nog', 'Adv', 'O'),
 ('niet', 'Adv', 'O'),
 ('schriftelijk', 'Adj', 'O'),
 ('beschikbaar', 'Adj', 'O'),
 ('maar', 'Conj', 'O'),
 ('het', 'Art', 'O'),
 ('bericht', 'N', 'O'),
 ('werd', 'V', 'O'),
 ('alvast', 'Adv', 'O'),
 ('bekendgemaakt', 'V', 'O'),
 ('door', 'Prep', 'O'),
 ('een', 'Art', 'O'),
 ('communicatiebureau', 'N', 'O'),
 ('dat', 'Conj', 'O'),
 ('Floralux', 'N', 'B-ORG'),
 ('inhuurde', 'V', 'O'),
 ('.', 'Punc', 'O')]

In [23]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.12275911459748372,
    c2=0.017524121896519664,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.12275911459748372, c2=0.017524121896519664,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [25]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-musicartist',
 'I-musicartist',
 'B-product',
 'I-product',
 'B-company',
 'B-person',
 'B-other',
 'I-other',
 'B-facility',
 'I-facility',
 'B-sportsteam',
 'B-geo-loc',
 'I-geo-loc',
 'I-company',
 'I-person',
 'B-movie',
 'I-movie',
 'B-tvshow',
 'I-tvshow',
 'I-sportsteam']

In [26]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.52816699164266145

In [27]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

               precision    recall  f1-score   support

    B-company      0.831     0.583     0.685        84
    I-company      0.821     0.575     0.676        40
   B-facility      0.750     0.511     0.608        47
   I-facility      0.792     0.623     0.697        61
    B-geo-loc      0.843     0.618     0.713       165
    I-geo-loc      0.710     0.423     0.530        52
      B-movie      0.000     0.000     0.000         8
      I-movie      0.000     0.000     0.000        10
B-musicartist      0.250     0.074     0.114        27
I-musicartist      0.400     0.083     0.138        24
      B-other      0.581     0.417     0.486       103
      I-other      0.366     0.366     0.366        93
     B-person      0.683     0.538     0.602       104
     I-person      0.643     0.545     0.590        66
    B-product      0.800     0.143     0.242        28
    I-product      0.818     0.300     0.439        60
 B-sportsteam      0.727     0.258     0.381        31
 I-sports

In [28]:
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

# uncomment next line to effectively perform the search
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.2min
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.12275911459748372, c2=0.017524121896519664,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11cb07438>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11cfeef28>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [29]:
rs.best_params_

{'c1': 0.23234333238712476, 'c2': 0.019320154604152026}

In [30]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.23234333238712476,
    c2=0.019320154604152026,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.23234333238712476, c2=0.019320154604152026,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [31]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.5073321975812326

In [None]:
from sklearn.externals import joblib
joblib.dump(crf, 'Dutch.pkl')