# Named Entity Recognition using Conditional Random Fields.

# Features have been chosen and multiple models have been created incrementally to test the accuracy.
## 1. Pos tag of the current, previous and next word.
## 2. Pos tag and Chunk tag of the current, previous and next word.
## 3. Pos tag and Case check of the first character (bool), Digit check (bool) for each word- current, previous and next word.

## Note : Other features can be chosen to test the model but this currently I have restricted the model to these three set of features.


# Requires Sklearn CRF Suite, NLTK and Sklearn.
# Created using Pycharm. Add package sklearn-crfsuite.

# Create new models by specifying new_model name and running the train_model() method.

In [1]:
import sys
import sklearn_crfsuite
import nltk
from sklearn_crfsuite import metrics
from sklearn.externals import joblib

In [2]:
# Provide model_name to save
# model_name = 'ner_crf_model_pos'
# model_name = 'ner_crf_model_pos_chunk'
model_name = 'ner_crf_model_pos_chunk_case_digit'

In [3]:
# Provide chunking model name
chunking_model_name = 'chunking_crf_model_pos_next_previous_word_case_start_word'

In [4]:
# New model name
new_model = 'ner_crf_model'

In [5]:
# Get sentence count.
def get_sentence_count(filepath):
    count = 0
    with open(filepath) as chunking_data:
        for line in chunking_data.readlines():
            if 0 == len(line.strip()):
                count += 1
    print ('Total number of sentences : ', count)
    return count

In [6]:
# Get test and train split given filepath.
def get_sentences_from_file(filepath, tt_division=0.7):
    sentence = []
    train = []
    test = []
    count = 0
    sentence_count = get_sentence_count(filepath)
    sentence_count = sentence_count * tt_division

    with open(filepath) as chunking_data:
        for line in chunking_data.readlines():
            if 'DOCSTART' in line:
                continue
            if 0 == len(line.strip()):
                if count <= sentence_count:
                    train.append(sentence)
                else:
                    test.append(sentence)
                count += 1
                sentence = []
                continue
            else:
                parts = line.split(' ')
                temp = (parts[0].strip(), parts[1].strip(), parts[2].strip(), parts[3].strip())

            sentence.append(temp)
    return train, test

In [7]:
# Features : POS TAG and CHUNK TAG of the current word
def word2features_pos_chunk(sentence, i):
    pos = sentence[i][1]
    chunk = sentence[i][2]
    features = {
        'pos': pos,
        'chunk': chunk,
    }

    return features

In [8]:
# Features - POS
# Previous and Next Word
# Previous word Next Word POS tag
def word2features_pos(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'postag': postag,
    }

    # Get previous word and pos tag
    if i > 0:
        previous_word = sent[i-1][0]
        previous_postag = sent[i-1][1]
        features.update({
            'previous_postag': previous_postag,
        })
    else:
        features['Start'] = True

    # Get next word and pos tag
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_postag = sent[i+1][1]
        features.update({
            'next_postag': next_postag,
        })
    else:
        features['End'] = True

    return features

In [9]:
# Features - POS
# Previous and Next Word
# Previous word Next Word POS tag
# Upper case bool for first letter of token
# Is number/digit bool
def word2features_pos_chunk_case_digit(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    chunktag = sent[i][2]
    features = {
        'isupper': word[0].isupper(),
        'isdigit': word.isdigit(),
        'postag': postag,
        'chunktag': chunktag,
    }
    if i > 0:
        previous_word = sent[i-1][0]
        previous_tag = sent[i-1][1]
        previous_chunktag = sent[i - 1][2]
        features.update({
            'previous_isupper': previous_word[0].isupper(),
            'previous_isdigit': previous_word.isdigit(),
            'previous_postag': previous_tag,
            'previous_chunktag': previous_chunktag,
        })
    else:
        features['Start'] = True

    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_postag = sent[i+1][1]
        next_chunktag = sent[i + 1][2]
        features.update({
            'next_isupper()': next_word[0].isupper(),
            'next_isdigit': next_word.isdigit(),
            'next_postag': next_postag,
            'next_chunktag': next_chunktag,
        })
    else:
        features['End'] = True

    return features

In [10]:
############################# Start Chunking Methods #############################
# Features - POS
# Previous word Next Word POS tag
# Previous and Next Word Case
def chunking_word2features_pos_next_previous_word_case(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'isupper': word[0].isupper(),
        'postag': postag,
    }

    # Get previous word and pos tag
    if i > 0:
        previous_word = sent[i-1][0]
        previous_postag = sent[i-1][1]
        features.update({
            'isupper': previous_word[0].isupper(),
            'previous_postag': previous_postag,
        })
    else:
        features['Start'] = True

    # Get next word and pos tag
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_postag = sent[i+1][1]
        features.update({
            'isupper': next_word[0].isupper(),
            'next_postag': next_postag,
        })
    else:
        features['End'] = True

    return features

# Get features for each word in the sentence.
def getSentencefeatures_for_chunking(sentence):
    return [chunking_word2features_pos_next_previous_word_case(sentence, i) for i in range(len(sentence))]

# Predicts on list of text.
# Returns Chunks from the sentence and and Predicted labels.
def generate_chunks(text_list):

    # Generate POS Tag for text list
    test_data = nltk.pos_tag(text_list)

    # Get Test sentences with features for sentence.
    # X_test = [getSentencefeatures(s) for s in test_data]
    X_test = [getSentencefeatures_for_chunking(test_data)]

    try:
        clf = joblib.load(chunking_model_name)
        y_pred = clf.predict(X_test)

        chunks = []
        chunk = ''
        tag_index = 0
        for i in range(len(y_pred[0])):
            if tag_index >= len(y_pred[0]):
                break
            if y_pred[0][tag_index].startswith('O'):
                chunks[len(chunks)-1] = chunks[len(chunks)-1] + text_list[tag_index]
                break
            if y_pred[0][tag_index].startswith('B-'):
                chunk += text_list[tag_index] + ' '
                next_tag_index = tag_index + 1
                for j in range(len(y_pred[0])):
                    if next_tag_index >= len(y_pred[0]):
                        chunks.append(chunk)
                        chunk = ''
                        tag_index = next_tag_index
                        break
                    if y_pred[0][next_tag_index].startswith('I-'):
                        chunk += text_list[next_tag_index] + ' '
                        next_tag_index = next_tag_index + 1
                    else:
                        chunks.append(chunk)
                        chunk = ''
                        tag_index = next_tag_index
                        break

        return chunks, test_data, y_pred
    except 'AttributeError':
        print('There was an exception!', sys.exc_info()[0])

# Merges the text list with the chunk tags
def merge_with_chunk_tags(text_list, chunk_tags):
    for i in range(len(text_list)):
        text_list[i] = (text_list[i][0], text_list[i][1], chunk_tags[0][i])

    return text_list

############################# End Chunking Methods #############################

In [11]:
# Get features for each word in the sentence.
def getSentencefeatures(sentence):
    return [word2features_pos_chunk_case_digit(sentence, i) for i in range(len(sentence))]

# Get lables for each word in the sentence
def getlabels(sentence):
    return [label for token, postag, chunktag, label in sentence]

# Get tokens from the sentence
def sent2tokens(sentence):
    return [token for token, postag, chunktag, label in sentence]

In [12]:
# True Positives
# B-* => B-*
# I-* => I-*
#
# True Negatives
# O => O
#
# False Negatives
# B-* => O
# I-* => O
#
# False Positives
# O => B-*
# O => I-*
def compute_results(y_pred, y_test):
    tp = 0
    tn = 0
    fp = 0
    fn = 0

    for i in range(len(y_pred)):
        for j in range(len(y_test[i])):
            if y_test[i][j].startswith(('B-', 'I-')) and y_pred[i][j].startswith(('B-', 'I-')):
                tp += 1
            if y_test[i][j].startswith('O') and y_pred[i][j].startswith('O'):
                tn += 1
            if y_test[i][j].startswith(('B-', 'I-')) and y_pred[i][j].startswith('O') :
                fn += 1
            if y_test[i][j].startswith('O') and y_pred[i][j].startswith(('B-', 'I-')):
                fp += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    return precision, recall, accuracy

In [13]:
# Predicts on input test file based on a trained model.
# Specify model file name
def predict_on_test_set(filepath):
    sentence = []
    test_data = []
    count = 0
    with open(filepath) as chunking_data:
        for line in chunking_data.readlines():

            if 0 == len(line.strip()):
                test_data.append(sentence)
                sentence = []
                continue
            else:
                parts = line.split(' ')
                temp = (parts[0].strip(), parts[1].strip(), parts[2].strip(), parts[3].strip())

            sentence.append(temp)
            temp = ()

    # Get Test sentences with features.
    X_test = [getSentencefeatures(s) for s in test_data]

    # Get Test lables.
    y_test = [getlabels(s) for s in test_data]

    try:
        clf = joblib.load(model_name)
        y_pred = clf.predict(X_test)
        precision, recall, accuracy = compute_results(y_pred, y_test)
        print ('\n Model Name :', model_name.title(), '\nPrecision:', precision , '\nRecall: ', recall, '\nAccuracy:', accuracy)
        labels = list(clf.classes_)
        metrics.flat_f1_score(y_test, y_pred,
                              average='weighted', labels=labels)

        # Sort and Group labels
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )

        print('\nResults from sklearn_crfsuite metrics \n')
        print(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
    except:
        print('There was an exception!',  sys.exc_info()[0])

In [14]:
# Train a new model based on training file or if there is a change in the feature set
def train_model():
    # Provide input file name
    trainfile = 'ner_dataset.txt'

    # Use 95% of the data for training (can be more or less)
    train_sentences, test_sentences = get_sentences_from_file(trainfile, 0.95)

    print('Number of training sentences : ', len(train_sentences))
    print('Number of test sentences : ', len(test_sentences))

    # Get Training sentence with features.
    X_train = [getSentencefeatures(s) for s in train_sentences]

    # Get Training lables
    y_train = [getlabels(s) for s in train_sentences]

    # Get Test sentences with features.
    X_test = [getSentencefeatures(s) for s in test_sentences]

    # Get Test lables
    y_test = [getlabels(s) for s in test_sentences]

    crf = sklearn_crfsuite.CRF(
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    crf.fit(X_train, y_train)

    joblib.dump(crf, new_model)
    
    print('\nModel created : ', new_model)
    
    labels = list(crf.classes_)

    y_pred = crf.predict(X_test)

    precision, recall, accuracy = compute_results(y_pred, y_test)

    print ('\nPrecision :', precision, '\nRecall : ', recall, '\nAccuracy : ', accuracy)

    metrics.flat_f1_score(y_test, y_pred,
                          average='weighted', labels=labels)

    # group B and I results
    sorted_labels = sorted(
        labels,
        key=lambda name: (name[1:], name[0])
    )

    print('\nResults from sklearn_crfsuite metrics \n')

    print(metrics.flat_classification_report(
        y_test, y_pred, labels=sorted_labels, digits=3
    ))

In [15]:
# Predicts on list of text.
# Returns Chunks from the sentence and and Predicted labels.
def predicts_on_text(text_list):

    # Generate Chunks based on the chunking crf model
    chunks, test_data, chunk_tags = generate_chunks(text_list)

    test_data = merge_with_chunk_tags(test_data, chunk_tags)

    # Get Test sentences with features for sentence.
    # X_test = [getSentencefeatures(s) for s in test_data]
    X_test = [getSentencefeatures(test_data)]

    try:
        clf = joblib.load(model_name)
        y_pred = clf.predict(X_test)

        print ('Chunks: \n')
        chunks = []
        chunk = ''
        tag_index = 0
        for i in range(len(y_pred[0])):
            if tag_index >= len(y_pred[0]):
                break
            # if y_pred[0][tag_index].startswith('O'):
            #     chunks[len(chunks)-1] = chunks[len(chunks)-1] + text_list[tag_index]
            #     break
            if y_pred[0][tag_index].startswith('B-'):
                chunk += text_list[tag_index] + ' '
                next_tag_index = tag_index + 1
                for j in range(len(y_pred[0])):
                    if next_tag_index >= len(y_pred[0]):
                        chunks.append(chunk.strip())
                        chunk = ''
                        tag_index = next_tag_index
                        break
                    if y_pred[0][next_tag_index].startswith('I-'):
                        chunk += text_list[next_tag_index] + ' '
                        next_tag_index = next_tag_index + 1
                    else:
                        chunks.append(chunk.strip())
                        chunk = ''
                        tag_index = next_tag_index
                        break
            else:
                tag_index += 1

        for c in chunks:
            print(c)

        return chunks, y_pred
    except 'AttributeError':
        print('There was an exception!', sys.exc_info()[0])

In [16]:
train_model()

Total number of sentences :  18453
Number of training sentences :  17531
Number of test sentences :  922

Model created :  ner_crf_model

Precision : 0.8971261974177426 
Recall :  0.8647129666800482 
Accuracy :  0.9626932413440654

Results from sklearn_crfsuite metrics 

              precision    recall  f1-score   support

           O      0.975     0.981     0.978     13163
       B-LOC      0.505     0.437     0.468       561
       I-LOC      0.295     0.210     0.245        62
      B-MISC      0.733     0.611     0.667       373
      I-MISC      0.457     0.372     0.410       129
       B-ORG      0.474     0.424     0.448       323
       I-ORG      0.317     0.526     0.395       196
       B-PER      0.653     0.665     0.659       493
       I-PER      0.759     0.729     0.744       354

   micro avg      0.912     0.912     0.912     15654
   macro avg      0.574     0.551     0.557     15654
weighted avg      0.911     0.912     0.911     15654



In [17]:
predict_on_test_set('ner_test_data.txt')


 Model Name : Ner_Crf_Model_Pos_Chunk_Case_Digit 
Precision: 0.886762360446571 
Recall:  0.8348348348348348 
Accuracy: 0.9632860040567951

Results from sklearn_crfsuite metrics 

              precision    recall  f1-score   support

           O      0.974     0.983     0.979      4264
       B-LOC      0.536     0.462     0.496       210
       I-LOC      0.267     0.211     0.235        19
      B-MISC      0.771     0.711     0.740       114
      I-MISC      0.588     0.417     0.488        24
       B-ORG      0.323     0.247     0.280        81
       I-ORG      0.400     0.494     0.442        77
       B-PER      0.535     0.552     0.544        96
       I-PER      0.604     0.711     0.653        45

   micro avg      0.918     0.918     0.918      4930
   macro avg      0.555     0.532     0.540      4930
weighted avg      0.915     0.918     0.916      4930



In [18]:
text_list = ['India', 'is','playing','in','the','world','cup','.']
predicts_on_text(text_list)

Chunks: 

India


(['India'], [['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

In [19]:
text_list = ['England', 'is','playing','in','the','world','cup','.']
predicts_on_text(text_list)

Chunks: 

England


(['England'], [['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

In [20]:
text_list = ['United', 'Kingdom', 'is','playing','in','the','world','cup','.']
predicts_on_text(text_list)

Chunks: 

United Kingdom


(['United Kingdom'], [['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])

In [21]:
text_list = ['India', 'is','playing','in','the','world','cup','.']
predicts_on_text(text_list)

Chunks: 

India


(['India'], [['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']])