Skip to content

Commit

Permalink
Merge pull request #325 from axa-group/feature/page-orientation
Browse files Browse the repository at this point in the history
Feature/page orientation
  • Loading branch information
jvalls-axa committed Feb 20, 2020
2 parents 25b6874 + 29e7679 commit c8233ca
Show file tree
Hide file tree
Showing 13 changed files with 173,451 additions and 111 deletions.
33 changes: 15 additions & 18 deletions demo/vue-viewer/src/components/DocumentPreview/Page.vue
Expand Up @@ -3,7 +3,18 @@
<svg
class="Page"
:id="'SVG_' + page.pageNumber"
:style="{ zoom: zoom * zoomToFitPage }"
:style="{
zoom: zoom * zoomToFitPage,
transform:
'translateX(' +
page.rotation.translation.x +
'px) translateY(' +
page.rotation.translation.y +
'px) rotate(' +
page.rotation.degrees +
'deg)',
transformOrigin: page.rotation.origin.x + 'px ' + page.rotation.origin.y + 'px',
}"
:width="page.box.w"
:height="page.box.h"
>
Expand Down Expand Up @@ -41,19 +52,7 @@
style="stroke: #aeaeae"
/>
</svg>
<g
:style="{
transform:
'translateX(' +
page.rotation.translation.x +
'px) translateY(' +
page.rotation.translation.y +
'px) rotate(' +
page.rotation.degrees +
'deg)',
transformOrigin: page.rotation.origin.x + 'px ' + page.rotation.origin.y + 'px',
}"
>
<g>
<imageData
v-for="element in images"
:key="element.id"
Expand Down Expand Up @@ -267,16 +266,14 @@ export default {
<style lang="scss">
.PageContainer {
margin: 0 auto;
display: flex;
align-items: center;
}
.Page {
background-color: white;
border: 1px solid rgb(204, 204, 231);
margin: 0 auto;
position: relative;
top: 50%;
-webkit-transform: translateY(-50%);
-ms-transform: translateY(-50%);
transform: translateY(-50%);
}
.Page text {
Expand Down
1 change: 1 addition & 0 deletions demo/vue-viewer/src/components/Thumbnails.vue
Expand Up @@ -137,6 +137,7 @@ export default {
top: 38px;
left: 0;
right: 0;
zoom: 0.9;
}
.Thumb:last-of-type {
Expand Down
1 change: 0 additions & 1 deletion demo/vue-viewer/src/components/Thumbnails/Thumbnail.vue
Expand Up @@ -54,7 +54,6 @@ export default {
box-sizing: border-box;
}
.Thumb img {
max-width: 80%;
max-height: 100%;
box-shadow: 0 0 2px 1px #ebebf1;
margin: 10px;
Expand Down
54 changes: 50 additions & 4 deletions server/src/input/pdfminer/PdfminerExtractor.ts
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import * as limit from 'limit-async';
import { Document } from '../../types/DocumentRepresentation';
import * as CommandExecuter from '../../utils/CommandExecuter';
import logger from '../../utils/Logger';
Expand Down Expand Up @@ -46,7 +46,7 @@ export class PdfminerExtractor extends Extractor {
const totalSeconds = (Date.now() - startTime) / 1000;
logger.info(
`Total PdfMiner ${
totalPages != null ? '(' + totalPages.toString() + ')' : ''
totalPages != null ? '(' + totalPages.toString() + ')' : ''
} time: ${totalSeconds} sec - ${totalSeconds / 60} min`,
);
return doc;
Expand Down Expand Up @@ -78,8 +78,9 @@ export class PdfminerExtractor extends Extractor {
const extractPages = this.pagesToExtract(pageIndex, maxPages, totalPages);
return pdfminer
.extractPages(inputFile, totalPages != null ? extractPages : null)
.then((xmlOutputFile: string) => pdfminer.xmlParser(xmlOutputFile))
.then((json: any) => pdfminer.jsParser(json))
.then(pdfminer.xmlParser)
.then(pdfminer.jsParser)
.then(this.detectAndFixPageRotation(inputFile))
.then((doc: Document) => {
document.pages = document.pages.concat(doc.pages);
document.pages.forEach((page, index) => (page.pageNumber = index + 1));
Expand All @@ -91,6 +92,51 @@ export class PdfminerExtractor extends Extractor {
});
}

/**
 * Re-extracts the given pages with pdfminer using a rotation angle and
 * replaces the matching entries of `doc.pages` with the re-extracted pages.
 *
 * Every replaced page is tagged with a `pageRotation` whose `degrees` is the
 * opposite of the applied angle and whose origin is the centre of the page.
 *
 * NOTE(review): `pages` is used both as the page list handed to pdfminer and
 * as 1-based positions inside `doc.pages`. If `doc` only holds a batch of the
 * PDF these two numberings may differ — confirm against the caller.
 *
 * @param doc document whose pages are replaced in place; `doc.inputFile` is the file re-read by pdfminer
 * @param pages 1-based page numbers to reprocess
 * @param rotation rotation angle, in degrees, passed to pdfminer
 * @returns the same `doc` instance, after the replacement
 */
private async rotatePages(doc: Document, pages: number[], rotation: number): Promise<Document> {
  logger.info(`pages ${pages.join(', ')} will be reprocessed with a rotation angle of ${rotation} degrees`);

  const xmlOutputFile = await pdfminer.extractPages(doc.inputFile, pages.join(','), rotation);
  const json = await pdfminer.xmlParser(xmlOutputFile);
  const fixedDoc: Document = await pdfminer.jsParser(json);

  pages.forEach((pageNumber, position) => {
    // The re-extracted pages are matched to the requested ones by position.
    const fixedPage = fixedDoc.pages[position];
    if (fixedPage == null) {
      // Fewer pages came back than were requested: keep the original page
      // rather than overwriting it with `undefined` and crashing below.
      logger.info(`page ${pageNumber} was not returned by the rotated extraction and is left untouched`);
      return;
    }

    fixedPage.pageRotation = {
      fileName: doc.inputFile,
      degrees: -rotation,
      translation: { x: 0, y: 0 },
      origin: {
        x: fixedPage.width / 2,
        y: fixedPage.height / 2,
      },
    };
    doc.pages[pageNumber - 1] = fixedPage;
  });
  return doc;
}

/**
 * Builds a promise-chain step that detects rotated pages in a freshly parsed
 * document and reprocesses them with the matching rotation angle.
 *
 * Pages are grouped by the angle reported by `getMainRotationAngle()`; every
 * group whose angle is a finite, non-zero number is re-extracted through
 * `rotatePages`. Calls go through a `limit(1)` limiter.
 *
 * @param inputFile path of the source file; stored on `doc.inputFile`
 * @returns a function that takes the parsed document and resolves to that same document once all groups are processed
 */
private detectAndFixPageRotation(inputFile: string): (doc: Document) => Promise<Document> {
  const limiter = limit(1);
  return async (doc: Document): Promise<Document> => {
    doc.inputFile = inputFile;
    const startTime: number = Date.now();
    const pageRotations = doc.pages.map(p => p.getMainRotationAngle()).reduce(this.groupByRotation, {});
    const promises = Object.keys(pageRotations)
      // Object keys are strings; convert back to the numeric angle that rotatePages declares.
      .map(key => ({ key, degrees: Number(key) }))
      // Skip unrotated pages and any angle that could not be computed (e.g. a 'NaN' key).
      .filter(({ degrees }) => Number.isFinite(degrees) && degrees !== 0)
      .map(({ key, degrees }) =>
        // The arrow wrapper keeps `this` bound when the limiter invokes the call.
        limiter((pages: number[], angle: number) => this.rotatePages(doc, pages, angle))(
          pageRotations[key],
          degrees,
        ),
      );

    await Promise.all(promises);
    logger.info(`Page rotation detection and correction finished in ${(Date.now() - startTime) / 1000}s`);
    return doc;
  };
}

/**
 * Reducer that buckets 1-based positions by value: the position `index + 1`
 * is appended to the list stored under the key `value` of the accumulator.
 *
 * @param acc accumulator object, mutated and returned
 * @param value key under which the position is stored
 * @param index zero-based position of `value` in the reduced array
 * @returns the accumulator
 */
private groupByRotation(acc, value, index): any {
  const position = index + 1;
  if (!acc[value]) {
    acc[value] = [];
  }
  acc[value].push(position);
  return acc;
}

private pagesToExtract(pageIndex: number, maxPages: number, totalPages: number) {
const fromPage = (pageIndex - 1) * maxPages;
return [...Array(maxPages).keys()]
Expand Down
4 changes: 2 additions & 2 deletions server/src/input/pdfminer/pdfminer.ts
Expand Up @@ -41,10 +41,10 @@ import logger from '../../utils/Logger';
 * @returns The promise of the path to the XML output file produced by PdfMiner.
*/

export function extractPages(pdfInputFile: string, pages: string): Promise<string> {
export function extractPages(pdfInputFile: string, pages: string, rotationDegrees: number = 0): Promise<string> {
return new Promise<string>((resolveXml, rejectXml) => {
const startTime: number = Date.now();
CommandExecuter.pdfMinerExtract(pdfInputFile, pages)
CommandExecuter.pdfMinerExtract(pdfInputFile, pages, rotationDegrees)
.then(xmlOutputPath => {
logger.info(`PdfMiner xml: ${(Date.now() - startTime) / 1000}s`);
try {
Expand Down
27 changes: 26 additions & 1 deletion server/src/types/DocumentRepresentation/Page.ts
@@ -1,5 +1,5 @@
/**
* Copyright 2019 AXA Group Operations S.A.
* Copyright 2020 AXA Group Operations S.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,10 +18,12 @@ import { RotationCorrection } from '../../input/OcrExtractor';
import { findMostCommonFont, isInBox } from '../../utils';
import logger from '../../utils/Logger';
import { BoundingBox } from './BoundingBox';
import { Character } from './Character';
import { Element } from './Element';
import { Font } from './Font';
import { Paragraph } from './Paragraph';
import { Text } from './Text';
import { Word } from './Word';

export type directionType = 'horizontal' | 'vertical';
/**
Expand Down Expand Up @@ -71,6 +73,29 @@ export class Page {
this.computePageOccupancy();
}

/**
 * Estimates the dominant text rotation of the page, in whole degrees.
 *
 * Every word casts one vote. A word with at least two characters votes for
 * the angle of the line going from its first to its last character, computed
 * from their `left` and `bottom` values; any other word votes for 0. The
 * angle with the most votes is returned.
 *
 * @returns the most voted angle, or 0 when the page contains no words
 */
public getMainRotationAngle(): number {
  const rotations: number[] = this.getElementsOfType<Word>(Word, true).map(word => {
    if (!Array.isArray(word.content) || word.content.length < 2) {
      return 0;
    }
    const { left: x1, bottom: y1 } = word.content[0] as Character;
    const { left: x2, bottom: y2 } = word.content[word.content.length - 1] as Character;
    const arcTan = Math.round((Math.atan((y1 - y2) / (x1 - x2)) * 180) / Math.PI);
    if (!Number.isFinite(arcTan)) {
      // First and last character share the same position (0 / 0 => NaN):
      // no direction can be derived, so do not let it vote for a 'NaN' angle.
      return 0;
    }
    // atan only covers -90..90; a flat baseline whose first character is not
    // to the left of the last one is reported as 180.
    return arcTan === 0 ? (x1 < x2 ? 0 : 180) : arcTan;
  });

  if (rotations.length === 0) {
    // Without votes Math.max() would be -Infinity and the result NaN.
    return 0;
  }

  const elementsPerRotation: Record<string, number> = {};
  for (const rotation of rotations) {
    elementsPerRotation[rotation] = (elementsPerRotation[rotation] || 0) + 1;
  }

  const highestValue: number = Math.max(...Object.values(elementsPerRotation));
  const mainRotation = Object.keys(elementsPerRotation).find(k => elementsPerRotation[k] === highestValue);
  return parseInt(mainRotation, 10);
}

/**
* Computes all horizontal and vertical page occupancies
*/
Expand Down
11 changes: 9 additions & 2 deletions server/src/utils/CommandExecuter.ts
Expand Up @@ -180,9 +180,16 @@ export async function detectTables(
});
}

export async function pdfMinerExtract(filePath: string, pages: string): Promise<string> {
export async function pdfMinerExtract(filePath: string, pages: string, rotationDegrees: number = 0): Promise<string> {
const xmlOutputFile: string = getTemporaryFile('.xml');
let pdf2txtArguments: string[] = ['-c', 'utf-8', '-t', 'xml', '-o', xmlOutputFile, filePath];
let pdf2txtArguments: string[] = [
'--detect-vertical',
'-R', rotationDegrees.toString(),
'-c', 'utf-8',
'-t', 'xml',
'-o', xmlOutputFile,
filePath,
];

if (pages != null) {
pdf2txtArguments = ['-p', pages].concat(pdf2txtArguments);
Expand Down

0 comments on commit c8233ca

Please sign in to comment.