# Acquisitor and Cleaner

Download the dataset, store in marvin_initial_dataset.

In [1]:
import nltk
import os

In [2]:
nltk.download('conll2002')
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

marvin_initial_dataset = {
    'train_sents': train_sents,
    'test_sents': test_sents
}

[nltk_data] Downloading package conll2002 to /home/zhang/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [3]:
print(train_sents[0:2])

[[(u'Melbourne', u'NP', u'B-LOC'), (u'(', u'Fpa', u'O'), (u'Australia', u'NP', u'B-LOC'), (u')', u'Fpt', u'O'), (u',', u'Fc', u'O'), (u'25', u'Z', u'O'), (u'may', u'NC', u'O'), (u'(', u'Fpa', u'O'), (u'EFE', u'NC', u'B-ORG'), (u')', u'Fpt', u'O'), (u'.', u'Fp', u'O')], [(u'-', u'Fg', u'O')]]


# Training Preparator

Feature engineering, the initial datasets are splitted into feature datasets and label datasets.

In [4]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

X_train = [sent2features(s) for s in marvin_initial_dataset['train_sents']]
y_train = [sent2labels(s) for s in marvin_initial_dataset['train_sents']]

X_test = [sent2features(s) for s in marvin_initial_dataset['test_sents']]
y_test = [sent2labels(s) for s in marvin_initial_dataset['test_sents']]

marvin_dataset = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

# Trainer

Model training.

In [5]:
import sklearn_crfsuite

In [6]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.10789964607864502, 
    c2=0.082422264927260847, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])

marvin_model = {
    'crf': crf
}

# Metrics Evaluator

Creating evaluation metrics for trained model.

In [7]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [8]:
labels = list(marvin_model['crf'].classes_)
labels.remove('O')
y_pred = marvin_model['crf'].predict(marvin_dataset['X_test'])

score = metrics.flat_f1_score(marvin_dataset['y_test'], y_pred, average='weighted', labels=labels)

sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
report = metrics.flat_classification_report(
    marvin_dataset['y_test'], y_pred, labels=sorted_labels, digits=3
)

marvin_metrics = {
    'score': score,
    'report': report
}

print('Balanced F-score: ' + str(score))
print('\nClassification Report: \n' + str(report))

Balanced F-score: 0.797607625209

Classification Report: 
             precision    recall  f1-score   support

      B-LOC      0.806     0.784     0.795      1084
      I-LOC      0.697     0.631     0.662       325
     B-MISC      0.749     0.555     0.637       339
     I-MISC      0.743     0.582     0.653       557
      B-ORG      0.807     0.835     0.821      1400
      I-ORG      0.841     0.800     0.820      1104
      B-PER      0.845     0.887     0.865       735
      I-PER      0.894     0.940     0.916       634

avg / total      0.812     0.788     0.798      6178



# Prediction Preparator

Applying feature engineering method in input_message, preparing it for prediction.

In [9]:
input_message = [(u'Melbourne', u'NP', u'B-LOC'),
 (u'(', u'Fpa', u'O'),
 (u'Australia', u'NP', u'B-LOC'),
 (u')', u'Fpt', u'O'),
 (u',', u'Fc', u'O'),
 (u'25', u'Z', u'O'),
 (u'may', u'NC', u'O'),
 (u'(', u'Fpa', u'O'),
 (u'EFE', u'NC', u'B-ORG'),
 (u')', u'Fpt', u'O'),
 (u'.', u'Fp', u'O')]

In [10]:
input_label = sent2labels(input_message)
print(input_label)

[u'B-LOC', u'O', u'B-LOC', u'O', u'O', u'O', u'O', u'O', u'B-ORG', u'O', u'O']


Feature engineering methods is implemented again here in Prediction Preparator, because in Marvin code structure, each action is separated.

In [11]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

input_message = sent2features(input_message)
print(input_message[0])

{'+1:word.isupper()': False, 'word.isupper()': False, 'BOS': True, 'word[-2:]': u'ne', '+1:postag': u'Fpa', 'word.isdigit()': False, 'postag': u'NP', 'bias': 1.0, 'postag[:2]': u'NP', '+1:word.lower()': u'(', '+1:word.istitle()': False, 'word.istitle()': True, 'word.lower()': u'melbourne', 'word[-3:]': u'rne', '+1:postag[:2]': u'Fp'}


In [12]:
sentence = []
entities = {}
        
for i, token in enumerate(input_message):
    word = token["word.lower()"]
    sentence.append(word)
            
    label = marvin_model['crf'].predict(input_message)[0][i]
    if label != "O":
        if label in entities:
            entities[label].append(word)
        else:
            entities[label] = [word]
example_of_prediction = {}
example_of_prediction["sentence"] = ' '.join(sentence)
example_of_prediction["entities_found"] = {}
for k, v in entities.items():
    example_of_prediction["entities_found"][k] = ' '.join(v)

print(example_of_prediction)

{'entities_found': {'B-ORG': u'melbourne', 'I-ORG': u'( australia ) , 25 may ( efe ) .'}, 'sentence': u'melbourne ( australia ) , 25 may ( efe ) .'}


# Predictor

Do prediction.

In [13]:
final_prediction = marvin_model['crf'].predict(input_message)

In [14]:
print(final_prediction)

[['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'