In [1]:
#Simple feature map to feed arrays into the classifier. 
import numpy as np
import json
import pickle

In [2]:
def word2features(sent, i):
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        prev_word = sent[i-1]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        next_word = sent[i+1]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent, tags):
    res = []
    for i in range(len(sent)):    
        if tags[i] != "O":
            features = word2features(sent,i)
            for x in range(len(tags[i])):
                res.append(features) 
        else:
            res.append(word2features(sent,i))
        
    return res

In [3]:
TAGS =  pickle.load(open( "tags.pickle", "rb" ))
TAGS.remove("O") 
NUM_TAGS = len(TAGS)

In [4]:
tag2id = {}
for id,label in enumerate(TAGS):
    tag2id[label] = id 

def label2id(labels):
    ret = []
    prev_label = ""
    for label in labels:
        if label == "O":
            ret.append(str(2*NUM_TAGS))
        elif label == prev_label:
            l = [tag2id[t]+ NUM_TAGS for t in label]
            for x in l:
                ret.append(str(x))
        else:
            l =[tag2id[t] for t in label]
            for x in l:
                ret.append(str(x))
        prev_label = label
    return ret 


In [5]:
def get_label(label_id):
    if label_id == (2*NUM_TAGS):
        return "O"
    elif label_id >= NUM_TAGS:
        return [TAGS[label_id-NUM_TAGS]]
    else:
        return TAGS[label_id]


In [6]:
def id2label(labels):
    ret = []
    for label in labels:
        l = [get_label(x) for x in label]
        if len(l) == 1 and l[0] == "O":
            l = "O"
        ret.append(l)
    return ret 

In [7]:
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import hamming_loss

In [8]:
# load train data
with open('../../data/dev.json') as f:
    train_data = json.load(f)
    
# transform y_id to boolean vector 
def to_bool_vec(y_id):
    y_bool = np.zeros((2*NUM_TAGS+1))
    for id in y_id:
        for l in label:
            y_bool[id] = 1
    return y_bool
    
X_train = [sent2features(sent['sent'], sent['tags']) for sent in train_data]
y_train = [label2id(sent['tags']) for sent in train_data]

print([x["word.lower()"] for x in X_train[0]])
print("-----------------------")
print(y_train[0])

['he', 'taught', 'at', 'the', 'universities', 'of', 'aberdeen', ',', 'liverpool', ',', 'sheffield', 'and', 'manchester', 'manchester', 'manchester', ',', 'and', 'entered', 'the', 'indian', 'education', 'service', 'in', '1912', '.']
-----------------------
['226', '226', '226', '226', '226', '226', '226', '226', '226', '226', '226', '226', '53', '4', '79', '226', '226', '226', '226', '226', '226', '226', '226', '226', '226']


In [9]:
def hamming(y_true, y_pred):
    """
    Computes the average Hamming loss between two binary vectors.
    """
    to_bool_vec = lambda y: np.array([1 if i in y else 0 for i in range(2*NUM_TAGS+1)])
    y_t = to_bool_vec(y_true)
    y_p = to_bool_vec(y_pred)
    
    return hamming_loss(y_true, y_pred)

def total_hamming_loss(y_true, y_pred):
    loss = 0
    for i in range(len(y_true)):
        loss += hamming(y_true[i], y_pred[i])
    return -loss / len(y_true)

In [11]:
# Train Model
#crf model using hamming loss defined above
crf_model = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=1, all_possible_transitions=False)
crf_model.fit(X_train, y_train, X_dev=None, y_dev=None)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [None]:
# # Make predictions on test data
# y_pred = crf.predict(X_test)

# # Print classification report
# labels = list(set(tag for sent in y_test for tag in sent))
# print(flat_classification_report(y_test, y_pred, labels=labels))