In [3]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import numpy as np
import sys
import xml.etree.ElementTree as ET
import pycrfsuite

In [4]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [6]:
print(bio_classification_report([[1],[0]],[[1],[1]]))

AttributeError: 'numpy.int32' object has no attribute 'split'

In [3]:
def get_labels(xml_path):
    gt = {}
    tree = ET.parse(xml_path)
    root = tree.getroot()
    for table in root:
        for region in table:
            for cell in region:
                for child in cell:
                    if child.tag == 'content':
                        if child.text not in gt:
                            gt[child.text] = 0
                        gt[child.text] += 1
    return gt                   

In [4]:
def get_features_and_labels(gt_xml,xml_path):
    gt = get_labels(gt_xml)

    id2text = []
    id2pos = []
    features = []
    labels = []

    root = ET.parse(xml_path).getroot()
    for page in root:
        for text in page:
            id2text.append(text.text)
            id2pos.append(text.attrib)
            sentence = text.text.strip()
            if sentence in gt:
                labels.append("1")
            else:
                labels.append("0")

    for idx in range(len(id2text)):
        dist_from_up = 0.0
        if idx > 0:
            dist_from_up = int(id2pos[idx]["top"])-int(id2pos[idx-1]["top"])+int(id2pos[idx-1]["width"])

        dist_from_down = 0.0
        if idx < len(id2text)-1:
            dist_from_down = int(id2pos[idx+1]["top"])-int(id2pos[idx]["top"])+int(id2pos[idx]["width"])

        features_idx = [
            'bias',
            '-1:distance=%s' % dist_from_up,
            '+1:distance=%s' % dist_from_down, 
        ]

        features.append(features_idx)
    
    return features, labels


def train_test(train_xml,train_pyxml,test_xml,test_pyxml):
    features, labels = get_features_and_labels(train_xml,train_pyxml)
    #print(len(features),len(labels))
    for i in range(len(features)):
        features[i] = [features[i]]
    for j in range(len(labels)):

        labels[j] = [labels[j]]
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(features, labels):
        trainer.append(xseq, yseq)
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.params()
    trainer.train('exago-model.crfsuite')
    print(trainer.logparser.last_iteration)
    test_features,test_labels = get_features_and_labels(test_xml,test_pyxml)
    for i in range(len(test_features)):
        test_features[i] = [test_features[i]]
    for j in range(len(test_labels)):
        test_labels[j] = [test_labels[j]]
    test_tagger = pycrfsuite.Tagger()
    test_tagger.open('exago-model.crfsuite')
    y_pred = [test_tagger.tag(xseq) for xseq in test_features]
    print(bio_classification_report(test_labels, y_pred))

    
    
    
    
    
    

In [5]:
train_test('eu-001-str.xml','pypdf2_eu-002.xml','eu-006-str.xml','pypdf2_eu-006.xml')


None
              precision    recall  f1-score   support

           0       0.37      1.00      0.54        61
           1       0.00      0.00      0.00       103

   micro avg       0.37      0.37      0.37       164
   macro avg       0.19      0.50      0.27       164
weighted avg       0.14      0.37      0.20       164



  'precision', 'predicted', average, warn_for)


In [6]:
temp_feat, temp_labels = get_features_and_labels('eu-001-str.xml','pypdf2_eu-001.xml')

In [7]:
for i in range(len(temp_feat)):
    temp_feat[i] = [temp_feat[i]]

In [8]:
temp_feat[:1]

[[['bias', '-1:distance=0.0', '+1:distance=318']]]

In [9]:
for i in range(len(temp_labels)):
    temp_labels[i] = [temp_labels[i]]

In [10]:
temp_labels[:1]

[['0']]

In [11]:
temp_trainer = pycrfsuite.Trainer(verbose=False)


In [12]:
for xseq, yseq in zip(temp_feat, temp_labels):
        temp_trainer.append(xseq, yseq)


In [13]:
temp_trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [14]:
temp_trainer.train('test.crfsuite')

In [15]:
print(len(temp_trainer.logparser.iterations), temp_trainer.logparser.iterations[-1])

39 {'num': 39, 'scores': {}, 'loss': 160.656843, 'feature_norm': 9.943755, 'error_norm': 0.00894, 'active_features': 61, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.0}


In [16]:
temp_tagger = pycrfsuite.Tagger()
temp_tagger.open('test.crfsuite')

<contextlib.closing at 0x26cf2c19b70>

In [17]:
test_feat, test_labels = get_features_and_labels('eu-006-str.xml','pypdf2_eu-006.xml')

In [18]:
for i in range(len(test_feat)):
    test_feat[i] = [test_feat[i]]

In [19]:
for i in range(len(test_labels)):
    test_labels[i] = [test_labels[i]]

In [20]:
y_pred = [temp_tagger.tag(xseq) for xseq in test_feat]
accu=0
for i in range(len(y_pred)):
    if(y_pred[i] == test_labels[i]):
        accu+=1
print(accu/len(y_pred))

0.5548780487804879


In [21]:
y_pred = [temp_tagger.tag(xseq) for xseq in test_feat]
test_labels,'-------------\n',y_pred

([['0'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['1'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['1'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
  ['0'],
 

In [22]:
print(bio_classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.60      0.88      0.71       103

   micro avg       0.55      0.55      0.55       164
   macro avg       0.30      0.44      0.36       164
weighted avg       0.38      0.55      0.45       164



In [23]:
['0','0'] == ['0','0']

True

In [24]:
gt_xml_list = []
pypdf_xml_list = []
for i in range(1,28):
    if(i<10):
        strname = "eu-00%d-str.xml"%(i)
        gt_xml_list.append(strname)
        strname = "pypdf2_eu-00%d.xml"%(i)
        pypdf_xml_list.append(strname)
    else:
        strname = "eu-0%d-str.xml"%(i)
        gt_xml_list.append(strname)
        strname = "pypdf2_eu-0%d.xml"%(i)
        pypdf_xml_list.append(strname)
gt_xml_list[8] = "eu-009a-str.xml"
pypdf_xml_list[8] ="pypdf2_eu-009a.xml"
gt_xml_list_us = []
pypdf_xml_list_us = []
for i in range(1,41):
    if(i<10):
        strname = "us-00%d-str.xml"%(i)
        gt_xml_list_us.append(strname)
        strname = "pypdf2_us-00%d.xml"%(i)
        pypdf_xml_list_us.append(strname)
    else:
        strname = "us-0%d-str.xml"%(i)
        gt_xml_list_us.append(strname)
        strname = "pypdf2_us-0%d.xml"%(i)
        pypdf_xml_list_us.append(strname)
gt_xml_list_us[10],pypdf_xml_list_us[10] = "us-011a-str.xml","pypdf2_us-011a.xml"
gt_xml_list_us[30],pypdf_xml_list_us[30] = "us-031a-str.xml","pypdf2_us-031a.xml"
gt_xml_list_us[34],pypdf_xml_list_us[34] = "us-035a-str.xml","pypdf2_us-035a.xml"

gt_xml_list= gt_xml_list+gt_xml_list_us
pypdf_xml_list = pypdf_xml_list+pypdf_xml_list_us
gt_xml_list,pypdf_xml_list 

(['eu-001-str.xml',
  'eu-002-str.xml',
  'eu-003-str.xml',
  'eu-004-str.xml',
  'eu-005-str.xml',
  'eu-006-str.xml',
  'eu-007-str.xml',
  'eu-008-str.xml',
  'eu-009a-str.xml',
  'eu-010-str.xml',
  'eu-011-str.xml',
  'eu-012-str.xml',
  'eu-013-str.xml',
  'eu-014-str.xml',
  'eu-015-str.xml',
  'eu-016-str.xml',
  'eu-017-str.xml',
  'eu-018-str.xml',
  'eu-019-str.xml',
  'eu-020-str.xml',
  'eu-021-str.xml',
  'eu-022-str.xml',
  'eu-023-str.xml',
  'eu-024-str.xml',
  'eu-025-str.xml',
  'eu-026-str.xml',
  'eu-027-str.xml',
  'us-001-str.xml',
  'us-002-str.xml',
  'us-003-str.xml',
  'us-004-str.xml',
  'us-005-str.xml',
  'us-006-str.xml',
  'us-007-str.xml',
  'us-008-str.xml',
  'us-009-str.xml',
  'us-010-str.xml',
  'us-011a-str.xml',
  'us-012-str.xml',
  'us-013-str.xml',
  'us-014-str.xml',
  'us-015-str.xml',
  'us-016-str.xml',
  'us-017-str.xml',
  'us-018-str.xml',
  'us-019-str.xml',
  'us-020-str.xml',
  'us-021-str.xml',
  'us-022-str.xml',
  'us-023-str.xml'

In [25]:
def get_features_and_labels(gt_xml,xml_path):
    gt = get_labels(gt_xml)

    id2text = []
    id2pos = []
    features = []
    labels = []

    root = ET.parse(xml_path).getroot()
    for page in root:
        for text in page:
            id2text.append(text.text)
            id2pos.append(text.attrib)
            sentence = text.text.strip()
            if sentence in gt:
                labels.append("1")
            else:
                labels.append("0")

    for idx in range(len(id2text)):
        dist_from_up = 0.0
        if idx > 0:
            dist_from_up = int(id2pos[idx]["top"])-int(id2pos[idx-1]["top"])+int(id2pos[idx-1]["width"])

        dist_from_down = 0.0
        if idx < len(id2text)-1:
            dist_from_down = int(id2pos[idx+1]["top"])-int(id2pos[idx]["top"])+int(id2pos[idx]["width"])

        features_idx = [
            'bias',
            '-1:distance=%s' % dist_from_up,
            '+1:distance=%s' % dist_from_down, 
        ]

        features.append(features_idx)
    
    return features, labels


def train_test(train_xml_list,train_pyxml_list,test_xml_list,test_pyxml_list):
    trainer = pycrfsuite.Trainer(verbose=False)
    for i in range(len(train_xml_list)):
        
        features, labels = get_features_and_labels(train_xml_list[i],train_pyxml_list[i])
        
    #print(len(features),len(labels))
        for k in range(len(features)):
            features[k] = [features[k]]
        for j in range(len(labels)):
            labels[j] = [labels[j]]
            
        for xseq, yseq in zip(features, labels):
            trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.params()

    trainer.train('exago-model.crfsuite')
    
    print(trainer.logparser.last_iteration)
    
    
    
    for i in range(len(test_xml_list)):
        test_features,test_labels = get_features_and_labels(test_xml_list[i],test_pyxml_list[i])
        for k in range(len(test_features)):
            test_features[k] = [test_features[k]]
        for j in range(len(test_labels)):
            test_labels[j] = [test_labels[j]]
        test_tagger = pycrfsuite.Tagger()
        test_tagger.open('exago-model.crfsuite')
        y_pred = [test_tagger.tag(xseq) for xseq in test_features]
        print(test_xml_list[i]," ","################################")
        print(bio_classification_report(test_labels, y_pred))

    
    
    
    
    
    

In [26]:
train_test(gt_xml_list[8:26],pypdf_xml_list[8:26],gt_xml_list[:8],pypdf_xml_list[:8])

{'num': 50, 'scores': {}, 'loss': 1791.690141, 'feature_norm': 14.758593, 'error_norm': 0.485855, 'active_features': 333, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.0}
eu-001-str.xml   ################################
              precision    recall  f1-score   support

           0       0.25      0.52      0.34       130
           1       0.71      0.43      0.54       357

   micro avg       0.46      0.46      0.46       487
   macro avg       0.48      0.47      0.44       487
weighted avg       0.59      0.46      0.48       487

eu-002-str.xml   ################################
              precision    recall  f1-score   support

           0       0.93      0.83      0.88        30
           1       0.86      0.94      0.90        33

   micro avg       0.89      0.89      0.89        63
   macro avg       0.89      0.89      0.89        63
weighted avg       0.89      0.89      0.89        63

eu-003-str.xml   ################################
             

In [71]:
train_test(gt_xml_list[36:63],pypdf_xml_list[36:63],gt_xml_list[35:36],pypdf_xml_list[35:36])

{'num': 50, 'scores': {}, 'loss': 2831.835652, 'feature_norm': 24.605278, 'error_norm': 4.98475, 'active_features': 471, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.004}
              precision    recall  f1-score   support

           0       0.72      0.71      0.72       140
           1       0.23      0.24      0.24        50

   micro avg       0.59      0.59      0.59       190
   macro avg       0.48      0.48      0.48       190
weighted avg       0.59      0.59      0.59       190



In [27]:
train_test(gt_xml_list[35:63],pypdf_xml_list[35:63],gt_xml_list[27:34],pypdf_xml_list[27:34])

{'num': 50, 'scores': {}, 'loss': 2966.847498, 'feature_norm': 24.267463, 'error_norm': 6.571807, 'active_features': 467, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.002}
us-001-str.xml   ################################
              precision    recall  f1-score   support

           0       0.80      0.81      0.80       322
           1       0.83      0.83      0.83       374

   micro avg       0.82      0.82      0.82       696
   macro avg       0.82      0.82      0.82       696
weighted avg       0.82      0.82      0.82       696

us-002-str.xml   ################################
              precision    recall  f1-score   support

           0       0.54      0.61      0.57       155
           1       0.84      0.80      0.82       398

   micro avg       0.75      0.75      0.75       553
   macro avg       0.69      0.70      0.69       553
weighted avg       0.76      0.75      0.75       553

us-003-str.xml   ################################
           