Adapted from: https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

In [None]:
from itertools import chain
import numpy as np
import pandas as pd
import nltk

import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras import Model, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (LSTM, Embedding, Dense, TimeDistributed,
                                     Dropout, Bidirectional)

from matplotlib import pyplot as plt
%matplotlib inline

# Let's use CoNLL 2002 data to build an NER system

The CoNLL 2002 corpus is available in NLTK. We use the Spanish data.

In [None]:
# List the file ids exposed by NLTK's CoNLL 2002 corpus reader.
nltk.corpus.conll2002.fileids()

In [None]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

Data format:

In [None]:
# Display one training sentence to show the raw per-token tuples.
train_sents[10]

## Prepare data for Keras

In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding
# layer as input_dim.
VOCAB_SIZE = 5000
# Width of the learned word vectors (Embedding output_dim).
EMBEDDING_DIM = 32
# Number of units in the LSTM that is wrapped in Bidirectional.
LSTM_HIDDEN_LAYER_SIZE = 64
# Fixed length that every word and tag sequence is padded/truncated to.
MAXLEN = 100

In [None]:
# Field 0 of every token tuple is the word form; keep the sentences as
# pre-tokenised lists so the Tokenizer does not re-split them.
sentences_train = [[token[0] for token in sent] for sent in train_sents]
sentences_test = [[token[0] for token in sent] for sent in test_sents]

# An explicit OOV token is essential for sequence labelling: without it,
# texts_to_sequences() silently DROPS every word that is unseen or outside the
# VOCAB_SIZE most frequent ones. That shortens the id sequences and shifts the
# remaining words out of alignment with their per-token tags. With oov_token
# set, such words map to a reserved OOV id, so each token keeps its position.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Guard the invariant the rest of the notebook relies on: one id per token.
assert all(len(ids) == len(sent) for ids, sent in zip(X_train, sentences_train))
assert all(len(ids) == len(sent) for ids, sent in zip(X_test, sentences_test))

In [None]:
# Bring every id sequence to exactly MAXLEN entries, padding at the end
# with the default value 0.
X_train, X_test = (
    pad_sequences(seqs, maxlen=MAXLEN, padding='post')
    for seqs in (X_train, X_test)
)

In [None]:
# The example training sentence as a padded sequence of word ids.
X_train[10]

In [None]:
# Shape of a single padded example; expected to be (MAXLEN,).
X_train[10].shape

In [None]:
# Field 2 of every token tuple is the tag used as the prediction target.
targets_train = [[token[2] for token in sent] for sent in train_sents]
targets_test = [[token[2] for token in sent] for sent in test_sents]

# Tag inventory over both splits, sorted so the tag -> id mapping is stable.
tags = sorted({tag for sent in chain(targets_train, targets_test) for tag in sent})
tag2idx = {tag: idx for idx, tag in enumerate(tags)}

# Replace tag strings with their integer ids.
targets_train = [[tag2idx[tag] for tag in sent] for sent in targets_train]
targets_test = [[tag2idx[tag] for tag in sent] for sent in targets_test]

# Pad the id sequences to MAXLEN at the end, labelling padded positions "O".
y_train = pad_sequences(targets_train, maxlen=MAXLEN, padding="post", value=tag2idx["O"])
y_test = pad_sequences(targets_test, maxlen=MAXLEN, padding="post", value=tag2idx["O"])

In [None]:
# Tag ids for the example sentence; padded positions carry the id of "O".
y_train[10]

In [None]:
# One-hot encode each padded tag-id sequence so the targets match the
# per-token softmax output trained with categorical cross-entropy.
# NOTE: this cell overwrites y_train / y_test, so run it only once per kernel.
y_train = [
    to_categorical(tag_ids, num_classes=len(tags))
    for tag_ids in y_train
]
y_test = [
    to_categorical(tag_ids, num_classes=len(tags))
    for tag_ids in y_test
]

## Create Keras model

In [None]:
# Bidirectional LSTM tagger: word ids -> embeddings -> BiLSTM -> per-token
# softmax over the tag inventory.
#
# `inputs` (not `input`) avoids shadowing the Python builtin, and the
# intermediate tensors get their own name so that `model` only ever refers to
# the final Keras Model.
inputs = Input(shape=(MAXLEN,))

# The sequence length is already fixed by the Input layer, so the
# `input_length` argument (deprecated in newer Keras releases) is not needed.
x = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM)(inputs)
x = Dropout(0.1)(x)
# return_sequences=True keeps one output vector per time step, which the
# per-token classifier below requires.
x = Bidirectional(LSTM(units=LSTM_HIDDEN_LAYER_SIZE, return_sequences=True,
                       recurrent_dropout=0.1))(x)
out = TimeDistributed(Dense(len(tags), activation="softmax"))(x)

model = Model(inputs, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])


In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

## Train model

In [None]:
%%time
epochs = 5
# epochs = 50
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=epochs, 
                    validation_split=0.1, verbose=1)

In [None]:
# Per-epoch metrics recorded by model.fit, one row per epoch.
hist = pd.DataFrame(history.history)

# Label both curves and the axes so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(hist["accuracy"], label="training")
ax.plot(hist["val_accuracy"], label="validation")
ax.set_xlabel("epoch")
ax.set_ylabel("accuracy")
ax.set_title("Training vs. validation accuracy")
ax.legend()
plt.show()

## Evaluate model

In [None]:
# Report accuracy on both splits. Targets are padded to MAXLEN with "O", so
# this figure is computed over padded positions as well as real tokens.
for template, features, onehot in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, np.array(onehot), verbose=False)
    print(template.format(accuracy))

In [None]:
# Softmax scores for every padded position of every test sentence.
predictions = model.predict(X_test, verbose=False)

In [None]:
# Pick the highest-scoring tag at each position and keep at most as many
# predictions as the sentence has tokens, dropping the padded tail.
y_pred = [
    [tags[tag_id] for tag_id in np.argmax(scores, axis=1)[:len(sent)]]
    for scores, sent in zip(predictions, test_sents)
]

In [None]:
# Gold tags aligned with y_pred.
#
# pad_sequences() truncates over-long sequences at the FRONT by default, so
# for a sentence longer than MAXLEN the predictions cover its LAST MAXLEN
# tokens. The gold tags must therefore come from the end of the sentence;
# taking the first len(y_pred[j]) tags would compare each prediction with the
# wrong token. For sentences that fit within MAXLEN the slice starts at 0 and
# keeps every tag. Tokens cut off by truncation are not evaluated.
y_test_gold = [
    [token[2] for token in sent[len(sent) - len(pred_tags):]]
    for sent, pred_tags in zip(test_sents, y_pred)
]

In [None]:
def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for BIO-encoded tag sequences.

    Both arguments are iterables of tag sequences (one sequence of tag
    strings per sentence). The sequences are flattened, binarised and passed
    to scikit-learn's ``classification_report``. The "O" label is left out of
    the reported rows, and the remaining tags are ordered by entity type and
    then by prefix, so the B-/I- tags of one entity type sit next to each other.

    The binarizer is fitted on the union of gold and predicted labels.
    Fitting on the gold labels alone would encode a predicted tag that never
    occurs in ``y_true`` as an all-zero row, silently removing those wrong
    predictions from the report.

    Note that it requires scikit-learn 0.15+ to calculate averages properly.

    Returns the report as the string produced by ``classification_report``.
    """
    true_flat = list(chain.from_iterable(y_true))
    pred_flat = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    # Learn every label seen on either side so none is dropped in transform().
    lb.fit(true_flat + pred_flat)
    y_true_combined = lb.transform(true_flat)
    y_pred_combined = lb.transform(pred_flat)

    tagset = set(lb.classes_) - {'O'}
    # Sort key is [entity_type, prefix], e.g. "B-LOC" -> ["LOC", "B"].
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )

In [None]:
# The report is a preformatted string, so print() keeps its table layout.
print(bio_classification_report(y_test_gold, y_pred))