<a href="https://colab.research.google.com/github/agiagoulas/page-stream-segmentation/blob/master/model_training/BERT_Model_SinglePage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

In [None]:
from keras.models import load_model, Model
from sklearn import metrics as sklm
import numpy as np
import keras.backend as K
import tensorflow as tf
import requests

In [None]:
def read_csv_data(csvfile):
    """Load page texts, binary first-page labels and document names from a CSV file.

    The file is read as UTF-8, semicolon-delimited, with '"' as quote
    character. The first row is treated as a header and skipped. From every
    following row, column 1 is taken as the text, column 2 as the class name
    and column 3 as the document name. Completely blank lines are ignored.

    Args:
        csvfile: Path of the CSV file to read.

    Returns:
        A tuple ``(texts, labels, doc_names)`` of three equally long lists.
        A label is 1 when the class column equals "FirstPage", otherwise 0.
        An empty file yields three empty lists.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
    """
    texts = []
    labels = []
    doc_names = []
    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself, so newlines inside quoted fields survive intact.
    with open(csvfile, 'r', encoding='UTF-8', newline='') as f:
        datareader = csv.reader(f, delimiter=';', quotechar='"')
        next(datareader, None)  # skip the header row; tolerate an empty file
        for csv_row in datareader:
            if not csv_row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            texts.append(csv_row[1])
            labels.append(1 if csv_row[2] == "FirstPage" else 0)
            doc_names.append(csv_row[3])
    return texts, labels, doc_names

In [None]:
# Base directory for the data splits, images and saved models. Keep the
# trailing "/": later cells append file names by plain string concatenation.
working_dir = "/sample_dir/"

In [None]:
# Load the train and test splits. working_dir already ends with "/", so the
# file names are appended without a leading slash (avoids "//" in the path).
train_texts, train_labels, train_names = read_csv_data(working_dir + "tobacco800.train")
test_texts, test_labels, test_names = read_csv_data(working_dir + "tobacco800.test")

# BERT Training

In [None]:
class Tobacco800Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    ``encodings`` is a mapping from feature name to a per-sample sequence
    (indexable by sample position); ``labels`` is a sequence with one label
    per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every encoding field as a tensor,
        # then attach the label under the 'labels' key.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
def compute_metrics(pred):
    """Return the accuracy of a prediction object as ``{'accuracy': value}``.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis to get
    the predicted class per sample, which is compared to ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize each split in one call: over-long texts are truncated and the
# remaining sequences are padded to a common length within that split.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
# Wrap encodings and labels so the Trainer can index individual samples.
train_dataset = Tobacco800Dataset(train_encodings, train_labels)
test_dataset = Tobacco800Dataset(test_encodings, test_labels)

In [None]:
# Hyper-parameters for the Hugging Face Trainer used in the next cell.
training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    num_train_epochs=20,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size per device during evaluation
    warmup_steps=500,                # warm-up steps of the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for training logs
    logging_steps=10,                # log every 10 steps
)

In [None]:
# Start from the pre-trained checkpoint; the classification head is newly
# initialised and learned during fine-tuning.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# No eval_dataset is registered here; evaluation is run explicitly on the
# test split in a later cell via trainer.evaluate(eval_dataset=...).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Persist the fine-tuned model under working_dir; the prediction section
# reloads it from this same "bert-model/" directory.
model.save_pretrained(working_dir + "bert-model/")

In [None]:
# Evaluate on the held-out test split; accuracy comes from compute_metrics.
trainer.evaluate(eval_dataset=test_dataset)

{'epoch': 20.0,
 'eval_accuracy': 0.915057915057915,
 'eval_loss': 0.5680058598518372,
 'eval_mem_cpu_alloc_delta': 104665,
 'eval_mem_cpu_peaked_delta': 18274,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 2316179968,
 'eval_runtime': 2.5079,
 'eval_samples_per_second': 103.273}

# BERT Predictions

In [None]:
# Reload the fine-tuned weights written by save_pretrained above, together
# with the stock tokenizer of the base checkpoint.
loaded_model = BertForSequenceClassification.from_pretrained(working_dir + "bert-model/")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Classify every test page individually with the reloaded model and keep the
# predicted class index (argmax over the logits) per page.
loaded_model.eval()  # make sure dropout is disabled for inference
y_predict_bert = []
# Inference only: without no_grad() each forward pass would record an
# autograd graph, wasting memory and time.
with torch.no_grad():
    for item in test_texts:
        inputs = tokenizer(item, padding=True, truncation=True, return_tensors="pt")
        outputs = loaded_model(**inputs)
        y_predict_bert.append(outputs.logits.argmax(-1).item())

In [None]:
# Show the raw predicted class index for every test page.
print(y_predict_bert)

# Multi-Modal Predictions
 Late Fusion Approach

In [None]:
# Fetch the image-model helper module from the project repository and import it.
# NOTE(security): this executes code downloaded from the moving "master"
# branch; consider pinning the URL to a specific commit hash.
model_img_request = requests.get(
    "https://raw.githubusercontent.com/agiagoulas/page-stream-segmentation/master/app/pss/model_img.py",
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on HTTP errors instead of writing an error page to model_img.py
# and hitting a confusing SyntaxError on import.
model_img_request.raise_for_status()
with open("model_img.py", "w", encoding="utf-8") as f:
    f.write(model_img_request.text)
import model_img

In [None]:
# Image size handed to the image feature generator in the next cell.
img_dim = (224,224)
# Point the downloaded helper module at the page images; "%s" is presumably
# replaced with a page/document name inside model_img — TODO confirm.
model_img.img_path_template = working_dir + "Tobacco800_Small/%s.png"
# Note: this is model_img's own CSV reader, not the read_csv_data defined in
# this notebook.
data_image_test = model_img.read_csv_data(working_dir + "tobacco800.test")
# Pre-trained Keras image model, loaded from an HDF5 file in working_dir.
model_image = load_model(working_dir + "Tobacco800_exp2_prev-page_repeat-07.hdf5")

In [None]:
# Predict with the image model on the test split and round the outputs.
# NOTE(review): train=True is passed although this is test data — confirm in
# model_img.ImageFeatureGenerator that this flag does not shuffle or augment.
prediction_image_test = np.round(model_image.predict(model_img.ImageFeatureGenerator(data_image_test, img_dim, prevpage=True, train=True)))
# Two-column array [1 - p, p]. NOTE(review): because p was rounded above
# (presumably from a sigmoid output in [0, 1] of shape (n, 1) — TODO confirm),
# each row is a hard one-hot vector rather than a real probability distribution.
probability_image_test = np.concatenate([1 - prediction_image_test, prediction_image_test], axis = 1)

In [None]:
# Turn the list of predicted BERT class indices into a two-column array
# [1 - label, label], i.e. one row per test page.
prediction_bert_test = np.array(y_predict_bert)
probability_bert_test = np.concatenate(
    [(1 - prediction_bert_test)[:, None], prediction_bert_test[:, None]],
    axis=1,
)

In [None]:
# Late fusion: weight the image and BERT class scores with exponents i and j,
# multiply them, and take the arg-max class. Every exponent pair from
# 0.1 .. 1.0 is tried; a pair is reported whenever it improves Cohen's kappa.
# NOTE(review): both score arrays are built from hard 0/1 predictions in the
# cells above, so raising them to a positive power leaves them unchanged and
# every (i, j) pair should give the same y_predict — confirm whether soft
# probabilities were intended here.
max_kappa = 0
test_exponents = [x / 10 for x in range(1,11)]
for i in test_exponents:
    # The image term does not depend on j, so compute it once per i.
    image_term = np.power(probability_image_test, i)
    for j in test_exponents:
        fused_scores = image_term * np.power(probability_bert_test, j)
        y_predict = np.argmax(fused_scores, axis=1)
        acc = sklm.accuracy_score(test_labels, y_predict)
        kappa = sklm.cohen_kappa_score(test_labels, y_predict)
        if kappa > max_kappa:
            max_kappa = kappa
            print(f"{i} {j}")
            print(f"Accuracy: {acc}")
            print(f"Kappa: {kappa}")

0.1 0.1
Accuracy: 0.9343629343629344
Kappa: 0.8666686854616479
