In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# conditionalRandomFields2.ipynb
#
# using BERT token and sentence features
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [2]:
import os
import pickle

import numpy as np
import scipy.io as sio
from seqeval.metrics import classification_report
from sklearn_crfsuite import CRF
from tqdm import tqdm

In [3]:
def readFile(filepath):
    """Read a tab-separated token file and group its lines into sentences.

    Each line is stripped of surrounding whitespace and must then consist of
    exactly four tab-separated fields; any other field count raises
    ValueError from the tuple unpacking.

    A line whose first field is '<S>' acts as a sentence terminator: the
    lines collected so far are stored as one sentence and a new, empty
    sentence is started. The terminator line itself is not stored.

    NOTE(review): lines that follow the last '<S>' marker are never appended
    to the result, so a file that does not end with '<S>' loses its final
    sentence -- confirm the data files always end with the marker.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One inner list per sentence; each element is the stripped,
        still tab-joined line of one token.
    """
    sentences = []
    current   = []

    with open(filepath, 'r') as infile:
        for rawLine in infile:
            record = rawLine.strip()
            # Unpacking into four names enforces the four-column layout.
            firstField, _, _, _ = record.split('\t')

            if firstField != '<S>':
                current.append(record)
            else:
                # Sentence boundary: store what was collected and start over.
                sentences.append(current)
                current = []

    return sentences

In [4]:
# Read the three dataset splits (in train, valid, test order) into lists of
# sentences, where every sentence is a list of tab-separated token lines.
trainText, validText, testText = map(
    readFile,
    ('data/train.txt', 'data/valid.txt', 'data/test.txt'),
)

In [5]:
# Load the precomputed feature arrays from the MATLAB file. The cells below
# read the keys '<split>TokenFeatures' and '<split>SentFeatures' for the
# train, valid and test splits.
# NOTE(review): presumably these are the BERT token/sentence features named in
# the notebook header -- the file contents are not shown here; confirm.
features  = sio.loadmat('data/features.mat')

In [None]:
# Build the CRF inputs for the training split: for every sentence, one list
# of per-token feature dicts and one parallel list of labels.
# NOTE(review): assumes row i of 'trainTokenFeatures' belongs to the i-th token
# in reading order and row j of 'trainSentFeatures' to the j-th sentence of
# trainText -- confirm against the code that produced features.mat.
trainTokenMatrix = features['trainTokenFeatures']
trainSentMatrix  = features['trainSentFeatures']

trainFeatures = []
trainLabels   = []

# Token rows are consumed continuously across sentence boundaries, so this
# cursor is never reset inside the loop.
tokenRow = 0

for sentRow, sentence in enumerate(tqdm(trainText)):

    sentFeats  = []
    sentLabels = []

    for token in sentence:
        # The label is the fourth tab-separated field of the stored line.
        _, _, _, label = token.split('\t')

        # Token vector followed by the vector of the enclosing sentence.
        vector = np.concatenate((trainTokenMatrix[tokenRow], trainSentMatrix[sentRow]))
        tokenRow += 1

        # Feature names are the stringified positions '0', '1', ... of the vector.
        sentFeats.append({str(pos): value for pos, value in enumerate(vector)})
        sentLabels.append(label)

    trainFeatures.append(sentFeats)
    trainLabels.append(sentLabels)

 93%|█████████▎| 23983/25736 [11:18<00:49, 35.32it/s]

In [None]:
# Build the CRF inputs for the validation split: for every sentence, one list
# of per-token feature dicts and one parallel list of labels.
# NOTE(review): assumes row i of 'validTokenFeatures' belongs to the i-th token
# in reading order and row j of 'validSentFeatures' to the j-th sentence of
# validText -- confirm against the code that produced features.mat.
validTokenMatrix = features['validTokenFeatures']
validSentMatrix  = features['validSentFeatures']

validFeatures = []
validLabels   = []

# Token rows are consumed continuously across sentence boundaries, so this
# cursor is never reset inside the loop.
tokenRow = 0

for sentRow, sentence in enumerate(tqdm(validText)):

    sentFeats  = []
    sentLabels = []

    for token in sentence:
        # The label is the fourth tab-separated field of the stored line.
        _, _, _, label = token.split('\t')

        # Token vector followed by the vector of the enclosing sentence.
        vector = np.concatenate((validTokenMatrix[tokenRow], validSentMatrix[sentRow]))
        tokenRow += 1

        # Feature names are the stringified positions '0', '1', ... of the vector.
        sentFeats.append({str(pos): value for pos, value in enumerate(vector)})
        sentLabels.append(label)

    validFeatures.append(sentFeats)
    validLabels.append(sentLabels)

In [None]:
# Build the CRF inputs for the test split: for every sentence, one list of
# per-token feature dicts and one parallel list of labels.
# NOTE(review): assumes row i of 'testTokenFeatures' belongs to the i-th token
# in reading order and row j of 'testSentFeatures' to the j-th sentence of
# testText -- confirm against the code that produced features.mat.
testTokenMatrix = features['testTokenFeatures']
testSentMatrix  = features['testSentFeatures']

testFeatures = []
testLabels   = []

# Token rows are consumed continuously across sentence boundaries, so this
# cursor is never reset inside the loop.
tokenRow = 0

for sentRow, sentence in enumerate(tqdm(testText)):

    sentFeats  = []
    sentLabels = []

    for token in sentence:
        # The label is the fourth tab-separated field of the stored line.
        _, _, _, label = token.split('\t')

        # Token vector followed by the vector of the enclosing sentence.
        vector = np.concatenate((testTokenMatrix[tokenRow], testSentMatrix[sentRow]))
        tokenRow += 1

        # Feature names are the stringified positions '0', '1', ... of the vector.
        sentFeats.append({str(pos): value for pos, value in enumerate(vector)})
        sentLabels.append(label)

    testFeatures.append(sentFeats)
    testLabels.append(sentLabels)

In [None]:
# Merge the training and validation splits (training sentences first) into
# new lists; the per-split lists themselves are left untouched.
trainvalFeatures = [*trainFeatures, *validFeatures]
trainvalLabels   = [*trainLabels, *validLabels]

In [None]:
# Create the model before fitting: no earlier cell in this notebook binds
# `crf`, so running the notebook top to bottom on a fresh kernel would fail
# here with NameError.
# NOTE(review): the hyperparameters originally used are not recorded in this
# notebook, so the library defaults are used -- confirm or tune them
# (e.g. algorithm, c1, c2, max_iterations).
crf = CRF()

# Fit on the merged train + validation sentences.
crf.fit(trainvalFeatures, trainvalLabels)

In [None]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
# Make sure the target directory exists first; otherwise open() raises
# FileNotFoundError and the freshly trained model is not persisted.
os.makedirs('model', exist_ok=True)

with open('model/conditional_random_fields2.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)

# Report success only once the file has been written and closed.
print("Conditional Random Fields model is saved.")

In [None]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
# Rebinds `crf` to the object stored in the pickle file written by the save
# cell, so the evaluation cells below can run without re-training.
# NOTE: unpickling can execute arbitrary code -- only load a model file that
# comes from a trusted source.
with open('model/conditional_random_fields2.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [None]:
# Predict one label sequence per training sentence.
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
# seqeval expects (y_true, y_pred): gold labels first, predictions second.
# With the arguments reversed, precision and recall are swapped and the
# support column counts predicted instead of gold entities.
# NOTE: the fit cell trains on train + valid, so this is an in-sample score.
print(classification_report(trainLabels, trainPredLabels))

In [None]:
# Predict one label sequence per validation sentence.
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
# seqeval expects (y_true, y_pred): gold labels first, predictions second.
# With the arguments reversed, precision and recall are swapped and the
# support column counts predicted instead of gold entities.
# NOTE: the fit cell trains on train + valid, so this is not a held-out score.
print(classification_report(validLabels, validPredLabels))

In [None]:
# Predict one label sequence per test sentence.
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
# seqeval expects (y_true, y_pred): gold labels first, predictions second.
# With the arguments reversed, precision and recall are swapped and the
# support column counts predicted instead of gold entities.
print(classification_report(testLabels, testPredLabels))