This notebook contains the data preprocessing for the CROHME dataset, the mathematical expression recognition dataset that we will use.

The CROHME dataset contains many .inkml files, which must be converted to ordinary image files so that the model can work with them as pictures. First, we will convert the .inkml files to images (the conversion code below saves them as .png, not .jpeg) and visualize a few of them; that completes stage 1.

In [22]:
# Importing the necessary modules

import os
import requests
import shutil
import glob
import zipfile
from pathlib import Path
from datetime import datetime

import torch
import torch.nn as nn
import cv2
import numpy
import pandas
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

from skimage.draw import line
from skimage.morphology import thin
from PIL import Image
from io import StringIO

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable from run to run. Only NumPy is seeded by this line.
numpy.random.seed(0)

In [17]:
# Download the CROHME dataset archive next to the notebook.
# The download is skipped when the archive is already on disk.

data_path = Path.cwd()
image_path = data_path / "dataset"
zip_path = data_path / "CROHME.zip"

if image_path.is_dir():
    print(f"{image_path} directory exists.")
else:
    print(f"{image_path} does not exist. Creating one...")
    image_path.mkdir(parents=True, exist_ok=True)

# Decide based on the archive itself rather than on the extraction directory:
# an existing (possibly empty) "dataset" folder does not prove that the zip
# was ever downloaded, and the extraction cell needs the zip to be present.
if not zip_path.is_file():
    # Write to a temporary name first so that an interrupted download never
    # leaves a truncated file under the final archive name.
    partial_path = data_path / "CROHME.zip.part"
    with requests.get("http://www.iapr-tc11.org/dataset/CROHME/CROHME_full_v2.zip",
                      stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as a zip.
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            # Stream in 1 MiB chunks so the archive is never held in memory whole.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    partial_path.replace(zip_path)

/content/dataset directory exists.


In [18]:
# We will now extract our zip file.
# The archive is looked up where the download cell wrote it
# (data_path / "CROHME.zip") instead of a hard-coded absolute path, so this
# cell also works when the notebook is not running from /content.

with zipfile.ZipFile(data_path / "CROHME.zip", "r") as zip_ref:
    print("Unzipping...")
    zip_ref.extractall(image_path)

Unzipping...


In [26]:
# The following inkml conversion and extraction has been extracted from vndee
# github repo: https://github.com/vndee/offline-crohme/tree/master
# Cells 3-9 have been extracted from the repo

def get_label(inkml_file_abs_path):
    """Return the ground-truth label stored in an InkML file.

    Parameters
    ----------
    inkml_file_abs_path : str, path-like or file object
        Source handed directly to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    str or None
        Text of the first direct child ``<annotation>`` of the root element
        whose ``type`` attribute equals ``"truth"``; ``None`` when the file
        has no such annotation.
    """
    root = ET.parse(inkml_file_abs_path).getroot()
    annotation_tag = "{http://www.w3.org/2003/InkML}" + 'annotation'

    for child in root:
        # Check only the ``type`` attribute: comparing the whole attribute
        # dict would reject a truth annotation that carries any extra attribute.
        if child.tag == annotation_tag and child.get('type') == 'truth':
            return child.text
    return None

In [20]:
def _parse_axis_value(axis_coord):
    """Convert one textual coordinate component to an int.

    Whole-number values are rounded as they are; any other value is multiplied
    by 10000 before rounding.
    NOTE(review): a trace mixing whole and fractional values therefore ends up
    on two different scales -- kept as in the original code, confirm intent.
    """
    value = float(axis_coord)
    return round(value) if value.is_integer() else round(value * 10000)


def get_traces_data(inkml_file_abs_path):
    """Extract the stroke coordinates of an InkML file, grouped per symbol.

    Parameters
    ----------
    inkml_file_abs_path : str, path-like or file object
        Source handed directly to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    list of dict
        * If the root has a ``<traceGroup>`` child: one dict per nested
          ``<traceGroup>``, ``{'label': <annotation text or None>,
          'trace_group': [<stroke>, ...]}``.
        * Otherwise: one dict per ``<trace>`` (ordered by integer id),
          ``{'trace_group': [<stroke>]}``.

        A stroke is a list of points and a point is a list of ints produced
        by ``_parse_axis_value``.

    Raises
    ------
    KeyError
        If a ``traceDataRef`` points at a trace id that is not in the file.
    ValueError
        If a trace id or a coordinate component is not numeric.
    """
    root = ET.parse(inkml_file_abs_path).getroot()
    doc_namespace = "{http://www.w3.org/2003/InkML}"

    traces_all = []
    for trace_tag in root.findall(doc_namespace + 'trace'):
        # Points are comma separated; components inside a point are separated
        # by whitespace. ``split()`` tolerates leading/repeated blanks, and
        # empty segments (e.g. after a trailing comma) are skipped.
        raw_points = (trace_tag.text or '').replace('\n', '').split(',')
        coords = [[_parse_axis_value(axis_coord) for axis_coord in point.split()]
                  for point in raw_points if point.strip()]
        traces_all.append({'id': trace_tag.get('id'), 'coords': coords})
    traces_all.sort(key=lambda trace_dict: int(trace_dict['id']))

    traceGroupWrapper = root.find(doc_namespace + 'traceGroup')
    if traceGroupWrapper is None:
        # No symbol-level grouping available: every trace is its own group.
        return [{'trace_group': [trace['coords']]} for trace in traces_all]

    # ``traceDataRef`` names a trace *id*, so resolve it through an id lookup
    # rather than a list position (ids need not start at 0 or be contiguous).
    coords_by_id = {int(trace['id']): trace['coords'] for trace in traces_all}

    traces_data = []
    for traceGroup in traceGroupWrapper.findall(doc_namespace + 'traceGroup'):
        annotation = traceGroup.find(doc_namespace + 'annotation')
        label = annotation.text if annotation is not None else None
        traces_curr = [coords_by_id[int(traceView.get('traceDataRef'))]
                       for traceView in traceGroup.findall(doc_namespace + 'traceView')]
        # Append once per symbol, after all of its strokes were collected
        # (appending inside the stroke loop duplicated multi-stroke symbols).
        traces_data.append({'label': label, 'trace_group': traces_curr})
    return traces_data

In [21]:
def inkml2img(input_path, output_path):
    """Render the strokes of an InkML file to an image and save its label.

    Two files are written:

    * ``output_path`` -- the rendered strokes (black lines, no axes, y axis
      inverted), saved through Matplotlib at 100 dpi.
    * a ``.txt`` file in the same directory, named after the part of the
      output file name before its first dot, containing the ground-truth label.

    Parameters
    ----------
    input_path : str
        Path of the ``.inkml`` file to convert.
    output_path : str
        Path of the image file to create.

    Raises
    ------
    ValueError
        If the InkML file has no ground-truth annotation. Raised before any
        file is created.
    """
    label = get_label(input_path)
    if label is None:
        raise ValueError("No ground-truth annotation found in " + str(input_path))

    # Strip the extension from the file name only: splitting the whole path at
    # the first dot breaks as soon as a directory contains one ("./out/a.png").
    out_dir, out_name = os.path.split(output_path)
    label_path = os.path.join(out_dir, out_name.split('.')[0] + '.txt')
    with open(label_path, 'w+') as fout:
        fout.write(label)

    traces = get_traces_data(input_path)

    # Work on one explicit Axes. On current Matplotlib releases every bare
    # ``plt.axes()`` call adds a new Axes, so chaining such calls configures
    # different Axes than the one that ends up being drawn on.
    fig, ax = plt.subplots()
    try:
        ax.invert_yaxis()
        ax.set_aspect('equal', adjustable='box')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

        for elem in traces:
            for subls in elem['trace_group']:
                data = numpy.array(subls)
                # Skip strokes that do not form an (n_points, >=2) array.
                if data.ndim != 2 or data.shape[1] < 2:
                    continue
                # Use the first two components only; points may carry extra
                # values that ``x, y = zip(*data)`` cannot unpack.
                ax.plot(data[:, 0], data[:, 1], linewidth=2, c='black')

        fig.savefig(output_path, bbox_inches='tight', dpi=100)
    finally:
        # Release the figure even on failure; converting many files would
        # otherwise accumulate open figures.
        plt.close(fig)

In [23]:
def writeLog(message):
    """Write ``message`` to the global ``logger`` as one timestamped line.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] <message>`` followed by a
    newline; ``message`` is converted with ``str``.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.write(f"[{timestamp}] {message!s}\n")

def createDirectory(dirPath):
    """Create the directory ``dirPath`` if nothing exists at that path yet.

    Missing parent directories are created as well, and the creation is
    recorded through ``writeLog``. Nothing happens (and nothing is logged)
    when the path already exists.
    """
    if os.path.exists(dirPath):
        return
    # ``makedirs`` handles nested paths; ``exist_ok`` covers the directory
    # appearing between the check above and this call.
    os.makedirs(dirPath, exist_ok=True)
    writeLog("Create " + dirPath)

In [24]:
# Locations used by the conversion cell.
# The raw data lives where the archive was extracted (``image_path`` from the
# download cell), so derive the path from it instead of hard-coding the
# absolute Colab location '/content/dataset'.
dataPath = str(image_path)
dataMergedPath = 'data_merged/'    # flat folder that receives a copy of every .inkml file
targetFolder = 'data_processed/'   # folder that receives the rendered images and label files
logger = open('log.txt', 'w+')     # log file used by writeLog; truncated each time this cell runs

In [25]:
if __name__ == "__main__":
    writeLog("Start processing.")
    # Search the extracted dataset recursively: the archive unpacks into
    # nested folders, and concatenating dataPath + '*/*.inkml' without a path
    # separator only matched files directly inside "<dataPath>*" directories.
    filesPath = glob.glob(os.path.join(dataPath, '**', '*.inkml'), recursive=True)
    writeLog("There are " + str(len(filesPath)) + " files in " + dataPath)
    createDirectory(dataMergedPath)

    # Step 1: flatten every .inkml file into dataMergedPath.
    # NOTE: files that share a base name in different source folders
    # overwrite each other here.
    for cnt, fileName in enumerate(filesPath, start=1):
        print("Copying %d/%d" % (cnt, len(filesPath)))
        shutil.copy2(fileName, dataMergedPath)
        # Log after the copy succeeded, with the real destination path.
        writeLog("Copied " + fileName + " --> " + os.path.join(dataMergedPath, os.path.basename(fileName)))

    createDirectory(targetFolder)

    listFiles = glob.glob(os.path.join(dataMergedPath, '*.inkml'))
    numberOfFile = len(listFiles)
    writeLog("There are " + str(numberOfFile) + " files in " + dataMergedPath)

    # Step 2: render every merged file to "<name>.inkml.png" plus a label file.
    for cnt, fileInkml in enumerate(listFiles, start=1):
        fileName = os.path.basename(fileInkml)
        outputPath = os.path.join(targetFolder, fileName + '.png')
        print("Processing %s [%d/%d]" % (fileName, cnt, numberOfFile))
        writeLog("[" + str(cnt) + "/" + str(numberOfFile) + "]" + "Processed " + fileInkml + " --> " + outputPath)
        try:
            # inkml2img is a function defined in this notebook, not a module,
            # so it is called directly.
            inkml2img(fileInkml, outputPath)
        except Exception as error:
            # Keep going with the next file, but record what went wrong.
            writeLog("Failed! " + repr(error))
            print("An error occured!")
        else:
            # Only report success when the conversion did not raise.
            writeLog("Successful!")

    # The log file stays open for the rest of the session; flush it so that
    # log.txt is complete when inspected.
    logger.flush()