Skip to content

Commit

Permalink
[page-orientation] - refactor to avoid running pdf2txt process on some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
marianorodriguez committed Feb 19, 2020
1 parent 8ae0700 commit c00ced4
Show file tree
Hide file tree
Showing 9 changed files with 173,376 additions and 164 deletions.
26 changes: 2 additions & 24 deletions server/src/input/pdfminer/PdfminerExtractor.ts
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/
import * as limit from 'limit-async';
import { Character, Document, Page } from '../../types/DocumentRepresentation';
import { Document } from '../../types/DocumentRepresentation';
import * as CommandExecuter from '../../utils/CommandExecuter';
import logger from '../../utils/Logger';
import { extractImagesAndFonts } from '../extractImagesFonts';
Expand Down Expand Up @@ -120,7 +120,7 @@ export class PdfminerExtractor extends Extractor {
return async (doc: Document): Promise<Document> => {
doc.inputFile = inputFile;
const startTime: number = Date.now();
const pageRotations = doc.pages.map(this.getPageRotation).reduce(this.groupByRotation, {});
const pageRotations = doc.pages.map(p => p.getMainRotationAngle()).reduce(this.groupByRotation, {});
const promises = Object.keys(pageRotations)
.filter(r => r !== '0')
.map(rotation => limiter(this.rotatePages)(doc, pageRotations[rotation], rotation));
Expand All @@ -131,28 +131,6 @@ export class PdfminerExtractor extends Extractor {
};
}

/**
 * Estimates the dominant text rotation of a page, in whole degrees.
 *
 * Every element whose content is an array of at least two items contributes the
 * angle of the segment joining the `left`/`bottom` coordinates of its first and
 * last items; every other element contributes 0. The angle contributed by the
 * largest number of elements is returned.
 *
 * @param page page whose `elements` are inspected
 * @returns the most frequent rotation angle; 0 when the page has no elements
 */
private getPageRotation(page: Page): number {
  const rotations: number[] = page.elements.map((word) => {
    if (Array.isArray(word.content) && word.content.length > 1) {
      const { left: x1, bottom: y1 } = word.content[0] as Character;
      const { left: x2, bottom: y2 } = word.content[word.content.length - 1] as Character;
      const arcTan = Math.round(Math.atan((y1 - y2) / (x1 - x2)) * 180 / Math.PI);
      if (Number.isNaN(arcTan)) {
        // First and last items share the same position (0 / 0) or a coordinate is
        // missing: no direction can be derived, so count the element as unrotated
        // instead of letting NaN become a rotation bucket.
        return 0;
      }
      // atan cannot tell 0 from 180 degrees; the horizontal order of the two items does.
      return arcTan === 0 ? (x1 < x2 ? 0 : 180) : arcTan;
    }
    return 0;
  });

  // Number of elements observed for each angle.
  const elementsPerRotation: { [angle: string]: number } = {};
  for (const angle of rotations) {
    elementsPerRotation[angle] = (elementsPerRotation[angle] || 0) + 1;
  }

  // Pick the first key (in Object.keys order) holding the highest count. With no
  // elements at all the loop never runs and the page is reported as unrotated,
  // rather than parseInt(undefined) === NaN.
  let mainRotation = 0;
  let highestValue = 0;
  for (const key of Object.keys(elementsPerRotation)) {
    if (elementsPerRotation[key] > highestValue) {
      highestValue = elementsPerRotation[key];
      mainRotation = parseInt(key, 10);
    }
  }
  return mainRotation;
}

private groupByRotation(acc, value, index): any {
acc[value] = acc[value] || [];
acc[value].push(index + 1);
Expand Down
27 changes: 26 additions & 1 deletion server/src/types/DocumentRepresentation/Page.ts
@@ -1,5 +1,5 @@
/**
* Copyright 2019 AXA Group Operations S.A.
* Copyright 2020 AXA Group Operations S.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,10 +18,12 @@ import { RotationCorrection } from '../../input/OcrExtractor';
import { findMostCommonFont, isInBox } from '../../utils';
import logger from '../../utils/Logger';
import { BoundingBox } from './BoundingBox';
import { Character } from './Character';
import { Element } from './Element';
import { Font } from './Font';
import { Paragraph } from './Paragraph';
import { Text } from './Text';
import { Word } from './Word';

export type directionType = 'horizontal' | 'vertical';
/**
Expand Down Expand Up @@ -71,6 +73,29 @@ export class Page {
this.computePageOccupancy();
}

/**
 * Estimates the dominant text rotation of this page, in whole degrees.
 *
 * Every word returned by `getElementsOfType<Word>(Word, true)` whose content is an
 * array of at least two items contributes the angle of the segment joining the
 * `left`/`bottom` coordinates of its first and last items; every other word
 * contributes 0. The angle contributed by the largest number of words is returned.
 *
 * @returns the most frequent rotation angle; 0 when no word is found
 */
public getMainRotationAngle(): number {
  // NOTE(review): the `true` flag presumably requests a deep (recursive) search — confirm.
  const rotations: number[] = this.getElementsOfType<Word>(Word, true)
    .map((word) => {
      if (Array.isArray(word.content) && word.content.length > 1) {
        const { left: x1, bottom: y1 } = word.content[0] as Character;
        const { left: x2, bottom: y2 } = word.content[word.content.length - 1] as Character;
        const arcTan = Math.round(Math.atan((y1 - y2) / (x1 - x2)) * 180 / Math.PI);
        if (Number.isNaN(arcTan)) {
          // First and last characters share the same position (0 / 0) or a coordinate
          // is missing: no direction can be derived, so count the word as unrotated
          // instead of letting NaN become a rotation bucket.
          return 0;
        }
        // atan cannot tell 0 from 180 degrees; the horizontal order of the characters does.
        return arcTan === 0 ? (x1 < x2 ? 0 : 180) : arcTan;
      }
      return 0;
    });

  // Number of words observed for each angle.
  const elementsPerRotation: { [angle: string]: number } = {};
  for (const angle of rotations) {
    elementsPerRotation[angle] = (elementsPerRotation[angle] || 0) + 1;
  }

  // Pick the first key (in Object.keys order) holding the highest count. With no
  // words at all the loop never runs and the page is reported as unrotated,
  // rather than parseInt(undefined) === NaN.
  let mainRotation = 0;
  let highestValue = 0;
  for (const key of Object.keys(elementsPerRotation)) {
    if (elementsPerRotation[key] > highestValue) {
      highestValue = elementsPerRotation[key];
      mainRotation = parseInt(key, 10);
    }
  }
  return mainRotation;
}

/**
* Computes all horizontal and vertical page occupancies
*/
Expand Down
191 changes: 115 additions & 76 deletions server/src/utils/json2document.ts
@@ -1,5 +1,5 @@
/**
* Copyright 2019 AXA Group Operations S.A.
* Copyright 2020 AXA Group Operations S.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -185,7 +185,7 @@ function propertiesFromJson(propertiesObj: JsonProperties): Properties {
if (propertiesObj.hasOwnProperty('order')) {
prop.order = propertiesObj.order;
} else {
logger.info(
logger.debug(
`the properties obj inputted does not have the order key: ${prettifyObject(propertiesObj)}`,
);
}
Expand Down Expand Up @@ -325,88 +325,127 @@ function barcodeFromJson(barcodeObj: JsonElement): Barcode {
}

/**
 * Converts a JSON text element into the matching text object.
 *
 * The element's `type` selects the builder: paragraph, heading, line, word or
 * character. Any other type is logged as an error and rejected.
 *
 * @param textObj JSON element to convert
 * @param fonts fonts handed through to the selected builder
 * @returns whatever the selected builder produces
 * @throws Error when `textObj.type` is none of the handled types
 */
function textFromJson(textObj: JsonElement, fonts: Font[]): Text {
  const kind = textObj.type;

  if (kind === 'paragraph') {
    return buildParagraph(textObj, fonts);
  }
  if (kind === 'heading') {
    return buildHeading(textObj, fonts);
  }
  if (kind === 'line') {
    return buildLine(textObj, fonts);
  }
  if (kind === 'word') {
    return buildWord(textObj, fonts);
  }
  if (kind === 'character') {
    return buildCharacter(textObj, fonts);
  }

  // Nothing matched: report the offending type, then fail.
  logger.error('[JsonExtractor] Cannot extract from object of type', textObj.type);
  throw new Error(`Illegal Json: Unknown text block ${textObj.type}`);
}

/**
 * Builds a Paragraph from a JSON element (the 'paragraph' case of textFromJson).
 *
 * When `textObj.content` is an array, each entry is converted with textFromJson and
 * must come back as a Line; otherwise the paragraph is created with no lines. The
 * bounding box is taken from `textObj.box` (l, t, w, h), and `id` / `properties` are
 * copied from the JSON element (properties through propertiesFromJson).
 *
 * @param textObj JSON element describing the paragraph
 * @param fonts fonts passed on to textFromJson for the nested content
 * @returns the populated Paragraph
 * @throws Error when an entry of `textObj.content` does not convert to a Line
 */
function buildParagraph(textObj: JsonElement, fonts: Font[]): Paragraph {
let linesDS: Line[] = [];
if (Array.isArray(textObj.content)) {
linesDS = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Line) {
return obj;
} else {
throw new Error('Illegal Json: paragraphs should only contain lines.');
}
});
}

// NOTE(review): the `if (textObj.type === ...)` block below repeats the conversion
// above and its opening brace is never closed inside this function. It looks like
// removed-side lines of the diff view interleaved with the new body — confirm
// against the committed file.
if (textObj.type === 'paragraph' || textObj.type === 'heading') {
if (Array.isArray(textObj.content)) {
linesDS = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Line) {
return obj;
} else {
throw new Error('Illegal Json: paragraphs should only contain lines.');
}
});
}
const newParagraph = new Paragraph(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
linesDS,
);

newParagraph.id = textObj.id;
newParagraph.properties = propertiesFromJson(textObj.properties);
return newParagraph;
}

/**
 * Builds a Heading from a JSON element (the 'heading' case of textFromJson).
 *
 * When `textObj.content` is an array, each entry is converted with textFromJson and
 * must come back as a Line; otherwise the heading is created with no lines. The
 * Heading receives the bounding box from `textObj.box` (l, t, w, h), the lines and
 * `textObj.level`; `id` / `properties` are copied from the JSON element (properties
 * through propertiesFromJson).
 *
 * @param textObj JSON element describing the heading
 * @param fonts fonts passed on to textFromJson for the nested content
 * @returns the populated Heading
 * @throws Error when an entry of `textObj.content` does not convert to a Line
 */
function buildHeading(textObj: JsonElement, fonts: Font[]): Heading {
let linesDS: Line[] = [];
if (Array.isArray(textObj.content)) {
linesDS = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Line) {
return obj;
} else {
throw new Error('Illegal Json: headings should only contain lines.');
}
});
}

// NOTE(review): the `if (textObj.type === 'paragraph')` block and the unused
// `newParagraph` below leave the braces of this function unbalanced. They look like
// removed-side lines of the diff view interleaved with the new body — confirm
// against the committed file.
if (textObj.type === 'paragraph') {
const newParagraph = new Paragraph(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
linesDS,
);
const newHeading: Heading = new Heading(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
linesDS,
textObj.level,
);
newHeading.id = textObj.id;
newHeading.properties = propertiesFromJson(textObj.properties);
return newHeading;
}

newParagraph.id = textObj.id;
newParagraph.properties = propertiesFromJson(textObj.properties);
return newParagraph;
} else if (textObj.type === 'heading') {
const newHeading: Heading = new Heading(
/**
 * Creates a Line from its JSON description.
 *
 * Array content is converted entry by entry through textFromJson, and every entry
 * has to yield a Word; non-array content produces a line without words. The box,
 * id and properties are taken from the JSON element.
 *
 * @param textObj JSON element describing the line
 * @param fonts fonts passed on to textFromJson for the nested words
 * @returns the populated Line
 * @throws Error when an entry of `textObj.content` does not convert to a Word
 */
function buildLine(textObj: JsonElement, fonts: Font[]): Line {
  const words: Word[] = Array.isArray(textObj.content)
    ? textObj.content.map(child => {
        const parsed: Text = textFromJson(child, fonts);
        if (!(parsed instanceof Word)) {
          throw new Error('Illegal Json: lines should only contain words.');
        }
        return parsed;
      })
    : [];

  const bounds = new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h);
  const line: Line = new Line(bounds, words);
  line.id = textObj.id;
  line.properties = propertiesFromJson(textObj.properties);
  return line;
}

/**
 * Builds a Word from a JSON element (the 'word' case of textFromJson).
 *
 * When `typeof textObj.content === 'object'`, each entry is converted with
 * textFromJson and must come back as a Character; the Word is then created from
 * those characters. Otherwise `textObj.content` is handed to the Word constructor
 * unchanged. In both cases the font is looked up as `fonts[textObj.font]`, the
 * bounding box comes from `textObj.box` (l, t, w, h), and `id` / `properties` are
 * copied from the JSON element (properties through propertiesFromJson).
 *
 * NOTE(review): `typeof null` is also 'object', so a null content would reach
 * `.map` and throw — confirm content is never null.
 *
 * NOTE(review): this body contains lines that reference names never declared here
 * (`linesDS`, `newHeading`, `newLine`, `wordsDS`) plus extra `else if` branches on
 * `textObj.type`. They look like removed-side lines of the diff view interleaved
 * with the new body — confirm against the committed file.
 *
 * @param textObj JSON element describing the word
 * @param fonts font table indexed by `textObj.font`; also passed on to textFromJson
 * @returns the populated Word
 * @throws Error when an entry of `textObj.content` does not convert to a Character
 */
function buildWord(textObj: JsonElement, fonts: Font[]): Word {
if (typeof textObj.content === 'object') {
const charsDS: Character[] = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Character) {
return obj;
} else {
throw new Error('Illegal Json: words should only contain characters.');
}
});
const newWord: Word = new Word(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
linesDS,
textObj.level,
charsDS,
fonts[textObj.font],
);
newHeading.id = textObj.id;
newHeading.properties = propertiesFromJson(textObj.properties);
return newHeading;
} else if (textObj.type === 'line') {
let wordsDS: Word[] = [];
if (Array.isArray(textObj.content)) {
wordsDS = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Word) {
return obj;
} else {
throw new Error('Illegal Json: lines should only contain words.');
}
});
}
const newLine: Line = new Line(
newWord.id = textObj.id;
newWord.properties = propertiesFromJson(textObj.properties);
return newWord;
} else {
const newWord: Word = new Word(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
wordsDS,
textObj.content,
fonts[textObj.font],
);
newLine.id = textObj.id;
newLine.properties = propertiesFromJson(textObj.properties);
return newLine;
} else if (textObj.type === 'word') {
if (typeof textObj.content === 'object') {
const charsDS: Character[] = textObj.content.map(contentObj => {
const obj: Text = textFromJson(contentObj, fonts);
if (obj instanceof Character) {
return obj;
} else {
throw new Error('Illegal Json: words should only contain characters.');
}
});
const newWord: Word = new Word(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
charsDS,
fonts[textObj.font],
);
newWord.id = textObj.id;
newWord.properties = propertiesFromJson(textObj.properties);
return newWord;
} else {
const newWord: Word = new Word(
new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h),
textObj.content,
fonts[textObj.font],
);
newWord.id = textObj.id;
newWord.properties = propertiesFromJson(textObj.properties);
return newWord;
}
} else {
logger.error('[JsonExtractor] Cannot extract from object of type', textObj.type);
throw new Error(`Illegal Json: Unknown text block ${textObj.type}`);
newWord.id = textObj.id;
newWord.properties = propertiesFromJson(textObj.properties);
return newWord;
}
}

/**
 * Creates a Character from its JSON description.
 *
 * The content is stringified with `toString()`, the font is looked up as
 * `fonts[textObj.font]`, and the box, id and properties are taken from the JSON
 * element (properties through propertiesFromJson).
 *
 * @param textObj JSON element describing the character
 * @param fonts font table indexed by `textObj.font`
 * @returns the populated Character
 */
function buildCharacter(textObj: JsonElement, fonts: Font[]): Character {
  const bounds = new BoundingBox(textObj.box.l, textObj.box.t, textObj.box.w, textObj.box.h);
  const glyph = textObj.content.toString();
  const font = fonts[textObj.font];

  const character: Character = new Character(bounds, glyph, font);
  character.id = textObj.id;
  character.properties = propertiesFromJson(textObj.properties);
  return character;
}

0 comments on commit c00ced4

Please sign in to comment.