In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# conditionalRandomFields1.ipynb
#
# using only BERT's token features
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [2]:
import os
import pickle

import numpy as np
import scipy.io as sio
from seqeval.metrics import classification_report
from sklearn_crfsuite import CRF
from tqdm import tqdm

In [3]:
def readFile(filepath):
    """Read a tab-separated, one-token-per-line corpus file into sentences.

    Every non-blank line must hold exactly four tab-separated fields. A line
    whose first field is ``<S>`` acts as a sentence boundary: the tokens
    collected so far are emitted as one sentence and the marker line itself
    is discarded.

    Args:
        filepath: Path of the corpus file to read.

    Returns:
        A list of sentences. Each sentence is a list holding the stripped raw
        line (all four fields, still tab-joined) of each of its tokens.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields.
    """
    text     = []
    sentence = []

    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(filepath, 'r', encoding='utf-8') as infile:
        for lineNo, rawLine in enumerate(infile, start=1):
            line = rawLine.strip()

            # Blank lines (e.g. a trailing newline at EOF) carry no token.
            if not line:
                continue

            fields = line.split('\t')
            if len(fields) != 4:
                raise ValueError(
                    "%s:%d: expected 4 tab-separated fields, got %d"
                    % (filepath, lineNo, len(fields)))

            if fields[0] == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line)

    # Tokens that follow the last boundary marker still form a sentence;
    # without this flush they would be silently lost.
    if sentence:
        text.append(sentence)

    return text

In [4]:
# Read the three corpus splits; each result is a list of sentences as
# returned by readFile.
trainText, validText, testText = (
    readFile('data/train.txt'),
    readFile('data/valid.txt'),
    readFile('data/test.txt'),
)

In [5]:
# Load the precomputed token-level feature matrices (per the file header, BERT
# token features). The cells below read the keys 'trainTokenFeatures',
# 'validTokenFeatures' and 'testTokenFeatures' and index them as [row, :].
features  = sio.loadmat('data/features.mat')

In [None]:
# Build the CRF inputs for the training split:
#   trainFeatures[s][t] -> {feature-index-as-str: value} for token t of sentence s
#   trainLabels[s][t]   -> label string (4th tab-separated field) of that token
trainFeatures = []
trainLabels   = []

# Hoisted out of the loops: the matrix lookup does not change per token.
trainTokenFeatures = features['trainTokenFeatures']

# Running row index into the feature matrix.
# NOTE(review): assumes row i belongs to the i-th token of the training text
# in reading order -- confirm against the feature extraction step.
idx = 0
for sentence in tqdm(trainText):
    # Fresh accumulators per sentence, so nothing leaks between sentences.
    feats, labels = [], []
    for token in sentence:
        _, _, _, label = token.split('\t')
        # Feature names are given to the CRF as strings, hence str(k);
        # built in a single pass instead of via an intermediate dict.
        feats.append({str(k): v for k, v in enumerate(trainTokenFeatures[idx, :])})
        labels.append(label)
        idx += 1

    trainFeatures.append(feats)
    trainLabels.append(labels)

In [None]:
# Build the CRF inputs for the validation split:
#   validFeatures[s][t] -> {feature-index-as-str: value} for token t of sentence s
#   validLabels[s][t]   -> label string (4th tab-separated field) of that token
validFeatures = []
validLabels   = []

# Hoisted out of the loops: the matrix lookup does not change per token.
validTokenFeatures = features['validTokenFeatures']

# Running row index into the feature matrix.
# NOTE(review): assumes row i belongs to the i-th token of the validation text
# in reading order -- confirm against the feature extraction step.
idx = 0
for sentence in tqdm(validText):
    # Fresh accumulators per sentence, so nothing leaks between sentences.
    feats, labels = [], []
    for token in sentence:
        _, _, _, label = token.split('\t')
        # Feature names are given to the CRF as strings, hence str(k);
        # built in a single pass instead of via an intermediate dict.
        feats.append({str(k): v for k, v in enumerate(validTokenFeatures[idx, :])})
        labels.append(label)
        idx += 1

    validFeatures.append(feats)
    validLabels.append(labels)

In [None]:
# Build the CRF inputs for the test split:
#   testFeatures[s][t] -> {feature-index-as-str: value} for token t of sentence s
#   testLabels[s][t]   -> label string (4th tab-separated field) of that token
testFeatures = []
testLabels   = []

# Hoisted out of the loops: the matrix lookup does not change per token.
testTokenFeatures = features['testTokenFeatures']

# Running row index into the feature matrix.
# NOTE(review): assumes row i belongs to the i-th token of the test text
# in reading order -- confirm against the feature extraction step.
idx = 0
for sentence in tqdm(testText):
    # Fresh accumulators per sentence, so nothing leaks between sentences.
    feats, labels = [], []
    for token in sentence:
        _, _, _, label = token.split('\t')
        # Feature names are given to the CRF as strings, hence str(k);
        # built in a single pass instead of via an intermediate dict.
        feats.append({str(k): v for k, v in enumerate(testTokenFeatures[idx, :])})
        labels.append(label)
        idx += 1

    testFeatures.append(feats)
    testLabels.append(labels)

In [16]:
# Concatenate the train and validation sequences: the CRF below is fit on
# both splits, so the validation report further down is not a held-out score.
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels   + validLabels

In [4]:
# Configure the CRF model (sklearn-crfsuite estimator).
crf = CRF(
    # L-BFGS training algorithm.
    algorithm='lbfgs',
    # c1 / c2: L1 and L2 regularization coefficients (per sklearn-crfsuite docs).
    c1=0.1,
    c2=0.1,
    # Upper bound on optimizer iterations.
    max_iterations=100,
    # Also generate transition features for label pairs that never occur
    # in the training data (per sklearn-crfsuite docs).
    all_possible_transitions=True)

In [None]:
# Fit the CRF on the combined train + validation sequences.
crf.fit(trainvalFeatures, trainvalLabels)

In [None]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
# Create the output directory first, so that a missing 'model/' folder cannot
# make the dump fail after the fit has already been run.
os.makedirs('model', exist_ok=True)
with open('model/conditional_random_fields1.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
# Report success only once the file has been written and closed.
print("Conditional Random Fields model is saved.")

In [13]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
# Rebinds `crf` to the model stored in the pickle file, replacing whatever
# the cells above left in that name.
# NOTE(review): pickle.load can execute arbitrary code; only load a pickle
# that this notebook (or another trusted source) produced.
with open('model/conditional_random_fields1.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [None]:
# Predict label sequences for the training sentences and print the seqeval
# classification report. The fit cell above trains on these sequences, so
# this is a training-set score.
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabels))

In [None]:
# Predict label sequences for the validation sentences and print the seqeval
# classification report. The fit cell above trains on train + validation,
# so these sequences were seen during fitting.
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabels))

In [14]:
# Predict label sequences for the test sentences and print the seqeval
# classification report. The test split is not part of the sequences passed
# to crf.fit above.
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

ORGANIZATION       0.58      0.29      0.39       862
    LOCATION       0.74      0.54      0.62      1091
        DATE       0.79      0.63      0.70       364
      PERSON       0.61      0.34      0.43      1594
     PERCENT       1.00      0.94      0.97       107
        TIME       0.87      0.87      0.87        23
       MONEY       0.86      0.66      0.75       113

   micro avg       0.69      0.43      0.53      4154
   macro avg       0.67      0.43      0.52      4154

