# NER Tagger
## subsection of _Semantic Analysis_

* Building an NER Tagger from Scratch
* Building an End-to-End NER Tagger with Our Trained NER Model
* Analyzing Semantic Representations
    1. Propositional Logic
    2. First-Order Logic

In [None]:
import pandas as pd

# Location of the gzip-compressed NER dataset CSV.
# NOTE(review): the original comment said "need to import" — presumably the file
# has to be uploaded/made available at this path before running; confirm.
dataset_path = '/data/ner_dataset.csv.gz'

df = pd.read_csv(dataset_path, compression='gzip', encoding='ISO-8859-1')
# Forward-fill missing values from the previous row. `DataFrame.ffill()` replaces
# `fillna(method='ffill')`, whose `method` argument is deprecated in pandas 2.x.
df = df.ffill()
df.info()

In [None]:
# Show the frame transposed (columns become rows).
df.transpose()

In [None]:
# Number of distinct sentences, words, POS tags and NER tags, in that order.
tuple(df[column].nunique() for column in ('Sentence #', 'Word', 'POS', 'Tag'))

In [None]:
# Frequency of each NER tag in the dataset.
df['Tag'].value_counts()

In [None]:
def word2features(sent, i):
    """Return the feature dict describing the token at position ``i`` of ``sent``.

    Each entry of ``sent`` is indexed as ``entry[0]`` (the word) and
    ``entry[1]`` (its POS tag). The result holds cues for the token itself
    (lower-cased form, suffixes, casing/digit flags, POS tag and its two-letter
    prefix) plus casing and POS cues for the previous and next tokens when they
    exist. A missing previous token is flagged with 'BOS', a missing next token
    with 'EOS'.
    """
    token, pos = sent[i][0], sent[i][1]

    # Cues taken from the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context, or the beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context, or the end-of-sequence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

# convert input sentence into features
def sent2features(sent):
    """Return the ``word2features`` dict for every position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# get corresponding outcome NER tag label for input sentence
def sent2labels(sent):
    """Return the third item (the NER tag) of every 3-item entry in ``sent``."""
    ner_tags = []
    for _word, _pos, ner_tag in sent:
        ner_tags.append(ner_tag)
    return ner_tags

In [None]:
def agg_func(s):
    """Turn a frame with 'Word', 'POS' and 'Tag' columns into (word, POS, tag) tuples.

    The rows are read in order, so the returned list has one tuple per row.
    """
    words = s['Word'].values.tolist()
    pos_tags = s['POS'].values.tolist()
    ner_tags = s['Tag'].values.tolist()
    return list(zip(words, pos_tags, ner_tags))

In [None]:
# One entry per 'Sentence #' group: the result of agg_func for that group's rows.
grouped_df = df.groupby('Sentence #').apply(agg_func)

# Materialize the per-sentence results as a plain Python list.
sentences = list(grouped_df)

In [None]:
# View the first annotated sentence from our dataset.
sentences[0]

In [None]:
# Show the feature dicts that sent2features produces for a two-token slice
# (positions 5 and 6) of the first sentence.
sent2features(sentences[0][5:7])

In [None]:
# NER tag labels for positions 5 and 6 of the first sentence.
sent2labels(sentences[0][5:7])

In [None]:
# Prepare train and test datasets: engineer features for every input sentence
# and collect the corresponding NER tag labels.
from sklearn.model_selection import train_test_split
import numpy as np

# Each element is a per-sentence list, and sentences generally differ in length,
# so the nested lists are ragged. dtype=object keeps one list per array element;
# without it NumPy >= 1.24 raises ValueError for the inhomogeneous shape.
X = np.array([sent2features(s) for s in sentences], dtype=object)
y = np.array([sent2labels(s) for s in sentences], dtype=object)
# Hold out 25% of the sentences for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

In [None]:
import sklearn_crfsuite

# CRF sequence tagger trained with L-BFGS; c1 and c2 are the L1 and L2
# regularization coefficients.
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', 
                           c1=0.1, 
                           c2=0.1, 
                           max_iterations=100, 
                           all_possible_transitions=True, 
                           verbose=True)
# Train the model. Later cells save `crf` and call predict / classes_ on it,
# which needs a fitted model, so the fit call must not stay commented out
# unless a previously saved model is loaded instead.
crf.fit(X_train, y_train)

In [None]:
# Save the model to disk.
# `sklearn.externals.joblib` was removed from scikit-learn (0.23+); the
# standalone `joblib` package that scikit-learn depends on is the replacement.
import joblib
joblib.dump(crf, 'ner_model.pkl')

# to load
# crf = joblib.load('ner_model.pkl')

In [None]:
# evaluate model performance for NER tagging on test data
# show sample prediction and actual labels
y_pred = crf.predict(X_test)
# Predicted tag sequence for the first test sentence.
print(y_pred[0])

In [None]:
# Actual tag sequence for the first test sentence, to compare with y_pred[0].
print(y_test[0])

In [None]:
# evaluate model performance on entire test dataset
# get key classification model performance metrics
from sklearn_crfsuite import metrics as crf_metrics

# Report on every tag the model knows except 'O', so the metrics cover only
# the entity tags. list.remove raises ValueError if 'O' is not among the classes.
labels = list(crf.classes_)
labels.remove('O')
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels))

# Building an End-to-End NER Tagger with Our Trained NER Model

In [None]:
# tokenize our text and perform POS tagging
# NOTE(review): `text` is not defined in any cell visible here — it must be
# bound to the input document string before this cell runs; confirm where it is set.
import nltk

text_tokens = nltk.word_tokenize(text)
text_pos = nltk.pos_tag(text_tokens)
# Preview the first ten tagged tokens.
text_pos[:10]

In [None]:
# extract features from POS tagged text document
# The whole document is passed to sent2features as a single sequence, so
# `features` is a one-element list holding the per-token feature dicts.
features = [sent2features(text_pos)]
# Inspect the feature dict of the first token.
features[0][0]

In [None]:
# use CRF model just trained to predict features we engineered from sample doc
# NOTE: this rebinds `labels`, which the evaluation cell used for the CRF class list.
labels = crf.predict(features)
# `features` holds a single sequence, so take the first (only) prediction.
doc_labels = labels[0]
# Preview the predicted tags at positions 10 through 19.
doc_labels[10:20]

In [None]:
# Pair each text token with its predicted NER tag; the named entities are
# retrieved from these pairs afterwards.
text_ner = list(zip(text_tokens, doc_labels))
print(text_ner)

In [None]:
# extract and display all named entities
def extract_named_entities(tagged_tokens):
    """Group consecutive tagged tokens into ``(entity_text, tag)`` pairs.

    ``tagged_tokens`` is an iterable of ``(term, tag)`` pairs where the tag 'O'
    marks a token outside any entity. Consecutive non-'O' terms are joined with
    single spaces into one entity; the tag stored with the entity is the tag of
    its last token. A tag starting with 'B-' begins a new entity, so two
    entities that directly follow each other are kept separate, and an entity
    that runs up to the final token is included in the result.
    """
    entities = []
    entity_name = ''
    entity = None
    for term, tag in tagged_tokens:
        # Both an 'O' tag and a fresh 'B-' tag close the entity in progress.
        if entity is not None and (tag == 'O' or tag.startswith('B-')):
            entities.append(entity)
            entity_name = ''
            entity = None
        if tag != 'O':
            entity_name = ' '.join([entity_name, term]).strip()
            entity = (entity_name, tag)
    # Flush an entity that is still open when the tokens run out.
    if entity is not None:
        entities.append(entity)
    return entities


named_entities = extract_named_entities(text_ner)

import pandas as pd
pd.DataFrame(named_entities, columns=['Entity', 'Tag'])