<a href="https://colab.research.google.com/github/agiagoulas/page-stream-segmentation/blob/master/Model%20Training/TextModel_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup & Imports

Connect to Google Drive when working in Google Colab

In [19]:
# Mount Google Drive at /content/gdrive. Only needed when running in Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Set the working directory (it must end with `/`, because file names are appended to it directly)

In [38]:
# Base directory for the Tobacco800 data files and the model weight files used below.
# Paths are built by plain string concatenation (working_dir + "file"), so keep the trailing slash.
working_dir = "/sample_directory/" # TODO: Set correct working directory

Imports

In [None]:
# Clone the fastText sources and install the Python package from that local checkout.
!git clone https://github.com/facebookresearch/fastText.git
!pip install fastText/.
import csv, re, math
import sklearn.metrics as sklm
import fasttext
import numpy as np
import requests

from keras.callbacks import ModelCheckpoint
from importlib import reload
from keras.utils import Sequence
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import *
from keras.callbacks import ModelCheckpoint, Callback

Project imports: download `model.py` from the repository and import it

In [5]:
# Fetch the project's model.py from the GitHub repository, store it next to the
# notebook, and import it as the module `model`.
model_request = requests.get(
    "https://raw.githubusercontent.com/agiagoulas/page-stream-segmentation/master/app/pss/model.py",
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail here on HTTP errors (404, 5xx, ...). Without this check the body of the
# error response would be written to model.py and only blow up at `import model`.
model_request.raise_for_status()
# Explicit encoding so the written file does not depend on the platform default.
with open("model.py", "w", encoding="utf-8") as f:
    f.write(model_request.text)
import model

Get fastText word vectors

In [None]:
if 'ft' not in locals():
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
    !unzip wiki.en.zip
    ft = fasttext.load_model("wiki.en.bin")
model.ft = ft

Load Tobacco800 Data

In [66]:
# Read the Tobacco800 train and test files with the reader from the downloaded model.py.
# working_dir is concatenated as-is, so it must already end with a path separator.
data_train = model.read_csv_data(working_dir + "tobacco800.train")
data_test = model.read_csv_data(working_dir + "tobacco800.test")

# Single Page Model Training

Model Training

In [None]:
# Settings for training the single-page text model (generator built with prevpage=False).
optimize_for = 'kappa'
n_repeats = 1
n_epochs = 1
single_page_metric_history = []

for i in range(n_repeats):
    print("Repeat ", i + 1, " of ", n_repeats, sep="")
    print("--------------------")

    # A model is compiled anew for every repeat and gets its own weights file name.
    model_singlepage = model.compile_model_singlepage()
    model_file = working_dir + "tobacco800_text_single-page_%02d.hdf5" % (i)
    print(model_file)

    # NOTE(review): the checkpoint callback is given data_test, the same split the
    # report cell below evaluates on. Presumably it keeps the best weights in
    # model_file according to `metric`; confirm in model.py.
    checkpoint = model.ValidationCheckpoint(
        model_file,
        data_test,
        prev_page_generator=False,
        metric=optimize_for,
    )
    train_generator = model.TextFeatureGenerator(data_train, prevpage=False, train=True)
    model_singlepage.fit(train_generator, epochs=n_epochs, callbacks=[checkpoint])

    # Keep the callback's max_metrics for the summary cell.
    single_page_metric_history.append(checkpoint.max_metrics)

print(single_page_metric_history)

Show metric results from different models

In [None]:
# One space-separated line per repeat: index, kappa, accuracy, micro F1, macro F1, weights file.
for i, run_metrics in enumerate(single_page_metric_history):
    model_file = working_dir + "tobacco800_text_single-page_%02d.hdf5" % (i)
    columns = (
        i,
        run_metrics['kappa'],
        run_metrics['accuracy'],
        run_metrics['f1_micro'],
        run_metrics['f1_macro'],
        model_file,
    )
    print(' '.join(str(value) for value in columns))

Load model and generate prediction

In [None]:
# Rebuild the single-page model and restore the weights stored under repeat index 00.
model_singlepage = model.compile_model_singlepage()
model_singlepage.load_weights(working_dir + "tobacco800_text_single-page_00.hdf5")

# Predict on the test split and round the raw model outputs to hard labels.
test_generator = model.TextFeatureGenerator(data_test, prevpage=False, train=False)
y_predict = np.round(model_singlepage.predict(test_generator))
# Field 3 of every test record is mapped to its class index via LABEL2IDX.
y_true = [model.LABEL2IDX[record[3]] for record in data_test]

# Each metric is computed and printed in turn, in this order.
metric_report = (
    ("Accuracy: ", sklm.accuracy_score, {}),
    ("Kappa: ", sklm.cohen_kappa_score, {}),
    ("F1 Micro ", sklm.f1_score, {'average': 'micro'}),
    ("F1 Macro ", sklm.f1_score, {'average': 'macro'}),
)
for metric_label, metric_fn, metric_kwargs in metric_report:
    print(metric_label + str(metric_fn(y_true, y_predict, **metric_kwargs)))

# Current & Prev Page Model Training

Model Training

In [None]:
# Settings for training the current-and-previous-page text model (generator built with prevpage=True).
optimize_for = 'kappa'
n_repeats = 1  # original note: 10 (presumably the value for a full experiment; confirm)
n_epochs = 1  # original note: 20 (presumably the value for a full experiment; confirm)
prev_page_metric_history = []

for i in range(n_repeats):
    print("Repeat ", i + 1, " of ", n_repeats, sep="")
    print("--------------------")

    # A model is compiled anew for every repeat and gets its own weights file name.
    model_prevpage = model.compile_model_prevpage()
    model_file = working_dir + "tobacco800_text_prev-page_%02d.hdf5" % (i)
    print(model_file)

    # NOTE(review): the checkpoint callback is given data_test, the same split the
    # report cell below evaluates on. Presumably it keeps the best weights in
    # model_file according to `metric`; confirm in model.py.
    checkpoint = model.ValidationCheckpoint(
        model_file,
        data_test,
        prev_page_generator=True,
        metric=optimize_for,
    )
    train_generator = model.TextFeatureGenerator(data_train, prevpage=True, train=True)
    model_prevpage.fit(train_generator, epochs=n_epochs, callbacks=[checkpoint])

    # Keep the callback's max_metrics for the summary cell.
    prev_page_metric_history.append(checkpoint.max_metrics)

print(prev_page_metric_history)

Show metric results from different models

In [None]:
# One space-separated line per repeat: index, kappa, accuracy, micro F1, macro F1, weights file.
for i, run_metrics in enumerate(prev_page_metric_history):
    model_file = working_dir + "tobacco800_text_prev-page_%02d.hdf5" % (i)
    columns = (
        i,
        run_metrics['kappa'],
        run_metrics['accuracy'],
        run_metrics['f1_micro'],
        run_metrics['f1_macro'],
        model_file,
    )
    print(' '.join(str(value) for value in columns))

Load model and generate prediction

In [None]:
# Rebuild the prev-page model and restore the weights stored under repeat index 00.
model_prevpage = model.compile_model_prevpage()
model_prevpage.load_weights(working_dir + "tobacco800_text_prev-page_00.hdf5")

# Predict on the test split and round the raw model outputs to hard labels.
test_generator = model.TextFeatureGenerator(data_test, prevpage=True, train=False)
y_predict = np.round(model_prevpage.predict(test_generator))
# Field 3 of every test record is mapped to its class index via LABEL2IDX.
y_true = [model.LABEL2IDX[record[3]] for record in data_test]

# Each metric is computed and printed in turn, in this order.
metric_report = (
    ("Accuracy: ", sklm.accuracy_score, {}),
    ("Kappa: ", sklm.cohen_kappa_score, {}),
    ("F1 Micro ", sklm.f1_score, {'average': 'micro'}),
    ("F1 Macro ", sklm.f1_score, {'average': 'macro'}),
)
for metric_label, metric_fn, metric_kwargs in metric_report:
    print(metric_label + str(metric_fn(y_true, y_predict, **metric_kwargs)))