In [24]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# conditionalRandomFields1.ipynb
#
# using only BERT token features
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [14]:
from seqeval.metrics import classification_report
from sklearn_crfsuite import CRF
import scipy.io as sio
from tqdm import tqdm
import numpy as np
import pickle

In [3]:
def readFile(filepath):
    
        text     = []
        sentence = []

        with open(filepath, 'r') as infile:
            for line in infile:
                word, _, _, _ = line.strip().split('\t')

                if word == '<S>':
                    text.append(sentence)
                    sentence = []
                    continue

                sentence.append(line.strip())

        return text

In [4]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt') 

In [5]:
features  = sio.loadmat('data/features.mat')

In [6]:
trainFeatures = []
trainLabels   = []

feats         = []
labels        = []
idx = 0
for sentence in tqdm(trainText):
    for token in sentence:
        _, _, _, label = token.split('\t')
        d = dict(enumerate(features['trainTokenFeatures'][idx, :]))
        d = {str(k): v for k, v in d.items()}
        feats.append(d)
        idx += 1
        labels.append(label)
        
    trainFeatures.append(feats)
    trainLabels.append(labels)
    feats, labels = [], []

100%|██████████| 25736/25736 [04:40<00:00, 91.72it/s] 


In [7]:
validFeatures = []
validLabels   = []

feats         = []
labels        = []
idx = 0
for sentence in tqdm(validText):
    for token in sentence:
        _, _, _, label = token.split('\t')
        d = dict(enumerate(features['validTokenFeatures'][idx, :]))
        d = {str(k): v for k, v in d.items()}
        feats.append(d)
        idx += 1
        labels.append(label)
        
    validFeatures.append(feats)
    validLabels.append(labels)
    feats, labels = [], []

100%|██████████| 6435/6435 [01:51<00:00, 57.57it/s]


In [8]:
testFeatures = []
testLabels   = []

feats         = []
labels        = []
idx = 0
for sentence in tqdm(testText):
    for token in sentence:
        _, _, _, label = token.split('\t')
        d = dict(enumerate(features['testTokenFeatures'][idx, :]))
        d = {str(k): v for k, v in d.items()}
        feats.append(d)
        idx += 1
        labels.append(label)
        
    testFeatures.append(feats)
    testLabels.append(labels)
    feats, labels = [], []

100%|██████████| 3328/3328 [00:38<00:00, 85.69it/s] 


In [16]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels   + validLabels

In [17]:
crf.fit(trainvalFeatures, trainvalLabels)

loading training data to CRFsuite: 100%|██████████| 32171/32171 [11:37<00:00, 46.11it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5720
Seconds required: 57.927

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=42.28 loss=482055.99 active=5720  feature_norm=1.00
Iter 2   time=22.55 loss=410786.36 active=5504  feature_norm=0.88
Iter 3   time=95.79 loss=300130.29 active=4453  feature_norm=0.52
Iter 4   time=23.49 loss=298583.42 active=5517  feature_norm=0.71
Iter 5   time=25.11 loss=269510.08 active=5541  feature_norm=0.65
Iter 6   time=25.37 loss=263547.12 active=5065  feature_norm=0.65
Iter 7   time=24.53 loss=257666.52 active=5574  feature_norm=0.67
Iter 8   time=23.92 loss=253458.70 active=5480  feature_norm=0.72
Iter 9   time=23.56 loss=239491.47 active=5479  feature_norm=0.96
Iter 

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [18]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
with open('model/conditional_random_fields1.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
    print("Conditional Random Fields model is saved.")

Conditional Random Fields model is saved.


In [19]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
with open('model/conditional_random_fields1.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [20]:
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainPredLabels, trainLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

        DATE       0.70      0.83      0.76      2145
      PERSON       0.31      0.57      0.40      5929
    LOCATION       0.54      0.73      0.62      5789
       MONEY       0.73      0.87      0.79       405
ORGANIZATION       0.34      0.61      0.44      3924
        TIME       0.97      0.98      0.97       154
     PERCENT       0.99      0.98      0.98       527

   micro avg       0.43      0.68      0.53     18873
   macro avg       0.46      0.68      0.55     18873



In [21]:
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validPredLabels, validLabels))

### VAL CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

ORGANIZATION       0.31      0.61      0.42      1005
        DATE       0.71      0.81      0.76       480
    LOCATION       0.57      0.69      0.63      1360
      PERSON       0.38      0.66      0.48      1958
     PERCENT       0.98      1.00      0.99        94
       MONEY       0.79      0.88      0.83        99
        TIME       0.95      1.00      0.97        18

   micro avg       0.44      0.69      0.54      5014
   macro avg       0.47      0.69      0.55      5014



In [22]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testPredLabels, testLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.34      0.61      0.43       874
ORGANIZATION       0.29      0.58      0.39       424
    LOCATION       0.54      0.74      0.62       789
        DATE       0.63      0.79      0.70       290
     PERCENT       0.94      1.00      0.97       101
       MONEY       0.66      0.86      0.75        87
        TIME       0.87      0.87      0.87        23

   micro avg       0.43      0.69      0.53      2588
   macro avg       0.46      0.69      0.55      2588

