In [1]:
from collections import namedtuple
# One token row: the surface word followed by its three label columns
# (part-of-speech, chunk and entity), in that positional order.
OneWord = namedtuple(
    "OneWord",
    "word pos_label chunk_label entity_label",
)


In [2]:
def read_conll2003_ner(f_name):
    """Yield the sentences of a CoNLL-2003 style file, one list per sentence.

    Every token line must hold exactly four whitespace-separated columns,
    which are wrapped in a ``OneWord``. A blank line ends the current
    sentence. A line starting with ``-DOCSTART-`` is a document marker: it is
    never emitted as a token, and it also closes any sentence that is still
    being collected so that tokens on either side of the marker cannot end up
    in the same sentence.

    Args:
        f_name: Path of the file to read. The file is decoded as UTF-8.

    Yields:
        list[OneWord]: The tokens of one sentence, in file order. Empty
        sentences are never yielded.

    Raises:
        AssertionError: If a token line does not have exactly four columns.
    """
    current_sentence = []
    # Explicit encoding: without it the decoding depends on the platform locale.
    with open(f_name, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            # Blank lines and document markers are both sentence boundaries.
            if not line or line.startswith("-DOCSTART-"):
                if current_sentence:
                    yield current_sentence
                    current_sentence = []
                continue
            columns = line.split()
            assert len(columns) == 4, "%s:%d: expected 4 columns, got %d" % (
                f_name,
                line_no,
                len(columns),
            )
            current_sentence.append(OneWord(*columns))
    # The file may end without a trailing blank line; flush what is left.
    if current_sentence:
        yield current_sentence


In [3]:
def generate_ner_features(sent):
    """Build one feature dict per token of ``sent``.

    Each dict contains:
      * ``"word_<w>"`` and ``"lower_word_<w.lower()>"`` set to 1 for the
        token's own surface form,
      * ``"is_capitalized"`` set to ``str.istitle()`` of the token (so an
        all-caps token such as ``"USA"`` gets ``False``),
      * ``"prev_word_<w>"`` / ``"next_word_<w>"`` set to 1 for the neighbouring
        tokens, when such a neighbour exists.

    Args:
        sent: Indexable sequence of objects exposing a ``word`` attribute.

    Returns:
        A list of dicts, parallel to ``sent``.
    """
    per_token = []
    for position, token in enumerate(sent):
        surface = token.word
        # Features describing the token itself.
        feats = {
            "word_" + surface: 1,
            "lower_word_" + surface.lower(): 1,
            "is_capitalized": surface.istitle(),
        }
        # Context features: surface forms of the immediate neighbours.
        if position > 0:
            feats["prev_word_" + sent[position - 1].word] = 1
        if position + 1 != len(sent):
            feats["next_word_" + sent[position + 1].word] = 1
        per_token.append(feats)
    return per_token


In [4]:
def prep_data(sentences):
    """Flatten sentences into parallel per-word label and feature lists.

    Args:
        sentences: Iterable of sentences, each a sized sequence of tokens
            exposing ``entity_label`` (and whatever ``generate_ner_features``
            reads).

    Returns:
        A ``(labels, features)`` tuple. ``labels[i]`` is the ``entity_label``
        of the i-th word over all sentences and ``features[i]`` is the feature
        dict generated for that same word.
    """
    labels, features = [], []
    for sent in sentences:
        per_word = generate_ner_features(sent)
        # The feature generator must return exactly one dict per token.
        assert len(per_word) == len(sent)
        labels.extend(token.entity_label for token in sent)
        features.extend(per_word)
    return labels, features


In [5]:
from sklearn.feature_extraction import DictVectorizer


# Read both splits fully into memory as lists of sentences.
sentences_train_ner = list(read_conll2003_ner("dataset/train.txt"))
sentences_dev_ner = list(read_conll2003_ner("dataset/valid.txt"))

# Flatten to one label and one feature dict per word.
train_labels_ner, train_features_ner = prep_data(sentences_train_ner)
dev_labels_ner, dev_features_ner = prep_data(sentences_dev_ner)

# The vectorizer is fitted on the training features only; the dev features are
# transformed with that same fitted vectorizer below.
vectorizer_ner = DictVectorizer()
vectorizer_ner.fit(train_features_ner)

# Turn the feature dicts of both splits into vectors for the classifier.
feature_vectors_train_ner = vectorizer_ner.transform(train_features_ner)
feature_vectors_dev_ner = vectorizer_ner.transform(dev_features_ner)


In [6]:
from sklearn.svm import LinearSVC

# Linear SVM trained on one feature vector per word, with the word's entity
# label as target. random_state is pinned so that re-running the notebook from
# a fresh kernel trains the same model (the solver may shuffle data internally).
ner_classifier = LinearSVC(C=1, verbose=1, random_state=0)
ner_classifier.fit(feature_vectors_train_ner, train_labels_ner)


[LibLinear]

In [7]:
# Score the trained classifier on the dev split. Vectors and labels are per
# word, so the figure is computed over individual words, not whole entities.
# NOTE(review): presumably most words carry the "O" label, which would inflate
# this number — confirm with a per-class / entity-level report.
ner_accuracy = ner_classifier.score(feature_vectors_dev_ner, dev_labels_ner)
print("NER Model Accuracy:", ner_accuracy)


NER Model Accuracy: 0.9694326544916475


In [8]:
def predict_named_entities(sentence, classifier, vectorizer):
    """Predict an entity label for every token of ``sentence``.

    Args:
        sentence: Iterable of token strings (already tokenised). It is
            materialised once, so one-shot iterables such as generators work.
        classifier: Fitted model; its ``predict`` is called on the vectors.
        vectorizer: Fitted vectorizer; its ``transform`` is called on the
            per-token feature dicts.

    Returns:
        A list of ``(token, predicted_label)`` pairs in sentence order, or an
        empty list when ``sentence`` has no tokens.
    """
    tokens = list(sentence)
    if not tokens:
        # Nothing to classify; vectorising/predicting zero samples would raise.
        return []
    # The label columns are unknown at prediction time, so they are left as
    # None; the labels returned by prep_data are discarded below.
    sentence_data = [OneWord(token, None, None, None) for token in tokens]
    _, sentence_features = prep_data([sentence_data])
    sentence_vectors = vectorizer.transform(sentence_features)
    predictions = classifier.predict(sentence_vectors)
    return list(zip(tokens, predictions))


In [9]:
# Quick manual check on one sentence. It is tokenised by splitting on
# whitespace, so the final period is written as a separate token.
sentence = "Barack Obama was born in Hawaii .".split()
ner_predictions = predict_named_entities(sentence, ner_classifier, vectorizer_ner)
print(ner_predictions)


[('Barack', 'B-PER'), ('Obama', 'B-PER'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'B-LOC'), ('.', 'O')]
