Skip to content

Commit

Permalink
Add minimum column width as an option to reading order (#2)
Browse files Browse the repository at this point in the history
* CHANGE: Improve reading order to be able to take into account a minimum column width (default 5% of page)

* linter compatibility fixes
  • Loading branch information
hexapode authored and Aarohi Johal committed Aug 16, 2019
1 parent 97b20bc commit 95efdd2
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
5 changes: 5 additions & 0 deletions docs/modules/reading-order-detection-module.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ First, the algorithm will try to find possible vertical cuts in the page between

Good

## Options

minColumnWidthInPagePercent : The minimal column width in % of the page width
minVerticalGapWidth : The minimal gap width in points

## Limitations

- It sometimes fails if bounding boxes are too far from each others.
Expand Down
47 changes: 44 additions & 3 deletions server/src/modules/ReadingOrderDetectionModule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,19 @@ import { Module } from './Module';
*/

interface Options {
minWidth?: number;
minColumnWidthInPagePercent?: number;
minVerticalGapWidth?: number;
}

const defaultOptions: Options = {
minWidth: 5,
minVerticalGapWidth: 5,
minColumnWidthInPagePercent: 5,
};

export class ReadingOrderDetectionModule extends Module<Options> {
public static moduleName = 'reading-order-detection';
private order: number = 0;
private currentPageMinColumnWidth: number = 5;

constructor(options?: Options) {
super(options, defaultOptions);
Expand All @@ -47,6 +50,11 @@ export class ReadingOrderDetectionModule extends Module<Options> {
const elements: Element[] = page.elements.filter(Element.hasBoundingBox);

this.order = 0;
// The min width is actually as a % of page width
this.currentPageMinColumnWidth = Math.trunc(
(this.options.minColumnWidthInPagePercent / 100) * page.width,
);

this.process(elements);

elements.sort(utils.sortElementsByOrder);
Expand Down Expand Up @@ -184,7 +192,7 @@ export class ReadingOrderDetectionModule extends Module<Options> {

elementsRest.forEach(e => {
if (
e.left <= rightmost + this.options.minWidth &&
e.left <= rightmost + this.options.minVerticalGapWidth &&
e.left >= startGroup &&
!group.includes(e)
) {
Expand All @@ -199,6 +207,39 @@ export class ReadingOrderDetectionModule extends Module<Options> {
group = [];
}

// Merge to right to small collumn

for (let i = 0; i < elementsGroups.length; ++i) {
// calculate group width
const columnWidth = this.calcualteGroupWidth(elementsGroups[i]);
if (columnWidth < this.currentPageMinColumnWidth) {
if (i < elementsGroups.length - 1) {
// default Right merge
elementsGroups[i + 1] = [...elementsGroups[i], ...elementsGroups[i + 1]];
elementsGroups.splice(i--, 1);
} else if (i > 0) {
// if no rightMerge possible try left merge
elementsGroups[i - 1] = [...elementsGroups[i - 1], ...elementsGroups[i]];
elementsGroups.splice(i--, 1);
}
}
}

return elementsGroups;
}

private calcualteGroupWidth(elements: Element[]) {
let minX = elements[0].left;
let maxX = elements[0].left + elements[0].width;

for (let i = 1; i < elements.length; ++i) {
if (elements[i].left < minX) {
minX = elements[i].left;
}
if (elements[i].left + elements[i].width > maxX) {
maxX = elements[i].left + elements[i].width;
}
}
return maxX - minX;
}
}

0 comments on commit 95efdd2

Please sign in to comment.