Skip to content
Permalink
Browse files

[ocr-credentials] - credentials for OCR services are passed in the JSON config file instead of being passed as environment variables. Credential verification is executed in each extractor's constructor.
  • Loading branch information
marianorodriguez committed Feb 10, 2020
1 parent c96b5e5 commit 92c2d9e5df34ca1573d648d1e5a8ea21830ade91
@@ -25,11 +25,7 @@
"-p"
],
"env": {
"NODE_DEBUG": "pipeline",
"GOOGLE_APPLICATION_CREDENTIALS": "${workspaceRoot}/***.json",
"OCP_APIM_SUBSCRIPTION_KEY": "",
"AWS_ACCESS_KEY_ID": "",
"AWS_SECRET_ACCESS_KEY": "",
"NODE_DEBUG": "pipeline"
},
"outputCapture": "std"
}
@@ -14,6 +14,7 @@
* limitations under the License.
*/

import { existsSync } from 'fs';
import { Config } from '../types/Config';
import { Document } from '../types/DocumentRepresentation/Document';

@@ -26,9 +27,50 @@ import { Document } from '../types/DocumentRepresentation/Document';
/**
 * Base class of the extractors.
 *
 * Stores the configuration and provides the credential helpers that concrete
 * extractors call from their constructors.
 */
export abstract class Extractor {
  public config: Config;

  /**
   * @param config      Configuration object. `config.extractor.credentials` is
   *                    created as an empty object when it is missing.
   * @param credentials Default credential values (key -> value). A default is
   *                    applied only when the configuration holds no truthy value
   *                    for that key, so values set in the configuration are
   *                    never overwritten by (possibly empty) defaults.
   */
  constructor(config: Config, credentials: any = {}) {
    this.config = config;

    if (!this.config.extractor.credentials) {
      this.config.extractor.credentials = {};
    }
    const configured = this.config.extractor.credentials;
    const defaults = credentials || {};
    Object.keys(defaults).forEach(key => {
      // Defaults only fill gaps: an empty-string default must not wipe a
      // credential that was provided through the configuration.
      if (!configured[key]) {
        configured[key] = defaults[key];
      }
    });
  }

  /**
   * Verifies that every listed credential has a truthy value in
   * `config.extractor.credentials`.
   *
   * @param required Names of the credentials that must be present.
   * @throws Error listing the missing names together with a sample
   *         configuration snippet that contains a placeholder for each of them.
   */
  public checkCredentials(required: string[]): void {
    const available = this.config.extractor.credentials || {};
    const missingCredentials: string[] = required.filter(name => !available[name]);
    if (missingCredentials.length === 0) {
      return;
    }

    const placeholders: Record<string, string> = {};
    missingCredentials.forEach(name => {
      placeholders[name] = '...';
    });
    // The sample mirrors the keys of the extractor configuration (pdf / img).
    const sample = JSON.stringify(
      {
        extractor: {
          pdf: '...',
          img: '...',
          language: [],
          credentials: placeholders,
        },
      },
      null,
      2,
    );

    throw new Error(
      `Required credentials not found: ${missingCredentials.join(', ')}. ` +
        `Make sure you set it in the extractor configuration:\n${sample}`,
    );
  }

  /**
   * Verifies that a credential holds the path of an existing file whose name
   * ends with the given format.
   *
   * @param credential Name of the credential to look up.
   * @param format     Required ending of the file path (for example `json`).
   * @throws Error when the value is not a string, does not end with `format`,
   *         or does not point to an existing file.
   */
  public checkCredentialAsFile(credential: string, format: string): void {
    const filePath = (this.config.extractor.credentials || {})[credential];
    // The type guard keeps a missing or non-string value from reaching endsWith().
    const isUsableFile =
      typeof filePath === 'string' && filePath.endsWith(format) && existsSync(filePath);
    if (!isUsableFile) {
      throw new Error(
        `${credential} must be a path to a ${format} file.`,
      );
    }
  }

  public abstract run(inputFile: string): Promise<Document>;

}
@@ -16,6 +16,7 @@

import { parseString } from 'xml2js';
import { ListDetectionModule } from '../../processing/ListDetectionModule/ListDetectionModule';
import { Config } from '../../types/Config';
import {
Barcode,
BoundingBox,
@@ -40,6 +41,7 @@ import * as utils from '../../utils';
import logger from '../../utils/Logger';
import { Extractor } from '../Extractor';
import { AbbyyClient } from './AbbyyClient';
import * as credentials from './credentials.json';

export class AbbyyTools extends Extractor {
/**
@@ -59,6 +61,15 @@ export class AbbyyTools extends Extractor {
}
private _fonts: Font[] = [];

/**
 * Creates the ABBYY extractor.
 *
 * Hands the values imported from `./credentials.json` to the base
 * constructor, then verifies that the ABBYY server URL, server version and
 * workflow name are all available.
 *
 * @param config Configuration passed on to the base class.
 * @throws Error (raised by `checkCredentials`) when one of them is missing.
 */
constructor(config: Config) {
  super(config, credentials);

  const requiredKeys: string[] = ['ABBYY_SERVER_URL', 'ABBYY_SERVER_VER', 'ABBYY_WORKFLOW'];
  this.checkCredentials(requiredKeys);
}

public abbyyXMLToObject(xml: string): Promise<object> {
const promise = new Promise<object>((resolve, reject) => {
parseString(xml, (err, dataObject) => {
@@ -72,9 +83,9 @@ export class AbbyyTools extends Extractor {
}

public run(inputFile: string): Promise<Document> {
const host: string = process.env.ABBYY_SERVER_URL; // 172.23.132.137
const serverVersion: string = process.env.ABBYY_SERVER_VER; // 14
const workflowName: string = process.env.ABBYY_WORKFLOW; // workflow-hotfolder-d_drive
const host: string = this.config.extractor.credentials.ABBYY_SERVER_URL; // 172.23.132.137
const serverVersion: string = this.config.extractor.credentials.ABBYY_SERVER_VER; // 14
const workflowName: string = this.config.extractor.credentials.ABBYY_WORKFLOW; // workflow-hotfolder-d_drive
const serverTimeout: number = 50000;
const jobPollingInterval: number = 1000;

@@ -0,0 +1,5 @@
{
"ABBYY_SERVER_URL": "",
"ABBYY_SERVER_VER": "",
"ABBYY_WORKFLOW": ""
}
@@ -21,6 +21,7 @@ import { Config } from '../../types/Config';
import { BoundingBox, Document, Font, Line, Page, Word } from '../../types/DocumentRepresentation';
import { OcrExtractorFactory } from '../OcrExtractor';
import { setPageDimensions } from '../set-page-dimensions';
import * as credentials from './credentials.json';

type AmazonTextractResponse = {
DocumentMetadata: {
@@ -55,20 +56,15 @@ export class AmazonTextractExtractor extends OcrExtractorFactory {
private textract = null;

/**
 * Creates the Amazon Textract extractor.
 *
 * Hands the values imported from `./credentials.json` to the base
 * constructor, verifies that both AWS keys are available and builds the
 * Textract client from them. The base constructor is invoked exactly once and
 * the keys are read from the extractor credentials only, not from
 * `process.env`.
 *
 * @param config Configuration passed on to the base class.
 * @throws Error (raised by `checkCredentials`) when a key is missing.
 */
constructor(config: Config) {
  super(config, credentials);
  this.checkCredentials([
    'AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY',
  ]);

  const { AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY } = this.config.extractor.credentials;
  this.textract = new Textract({
    accessKeyId: AWS_ACCESS_KEY_ID,
    secretAccessKey: AWS_SECRET_ACCESS_KEY,
    region: 'us-east-1',
  });
}
@@ -0,0 +1,4 @@
{
"AWS_ACCESS_KEY_ID": "",
"AWS_SECRET_ACCESS_KEY": ""
}
@@ -1,4 +1,5 @@
import * as vision from '@google-cloud/vision';
import { Config } from '../../types/Config';
import {
BoundingBox,
Character,
@@ -11,6 +12,7 @@ import {
Word,
} from '../../types/DocumentRepresentation';
import { OcrExtractorFactory } from '../OcrExtractor';
import * as credentials from './credentials.json';

type GoogleVisionResponse = Array<{
fullTextAnnotation: FullTextAnnotation;
@@ -105,6 +107,17 @@ type TextAnnotation = {
* An extractor class to extract content from images using Google Vision
*/
export class GoogleVisionExtractor extends OcrExtractorFactory {

/**
 * Creates the Google Vision extractor.
 *
 * Hands the values imported from `./credentials.json` to the base
 * constructor, checks that `GOOGLE_APPLICATION_CREDENTIALS` is set and passes
 * the `json` file check, and finally copies it into `process.env`.
 *
 * @param config Configuration passed on to the base class.
 * @throws Error (raised by the credential checks) when the credential is
 *         missing or fails the file check.
 */
constructor(config: Config) {
  super(config, credentials);

  const keyFileCredential = 'GOOGLE_APPLICATION_CREDENTIALS';
  this.checkCredentials([keyFileCredential]);
  this.checkCredentialAsFile(keyFileCredential, 'json');

  // NOTE(review): presumably exported so that the Vision client, which is
  // created without explicit options, can locate the key file — confirm.
  const { GOOGLE_APPLICATION_CREDENTIALS } = this.config.extractor.credentials;
  process.env.GOOGLE_APPLICATION_CREDENTIALS = GOOGLE_APPLICATION_CREDENTIALS;
}

public async scanImage(inputFile: string) {
const client = new vision.ImageAnnotatorClient();
const result: GoogleVisionResponse = await client.documentTextDetection(inputFile);
@@ -0,0 +1,3 @@
{
"GOOGLE_APPLICATION_CREDENTIALS": ""
}
@@ -19,6 +19,7 @@ import { readFileSync } from 'fs';
import { Config } from '../../types/Config';
import { BoundingBox, Document, Font, Line, Page, Word } from '../../types/DocumentRepresentation';
import { OcrExtractorFactory } from '../OcrExtractor';
import * as credentials from './credentials.json';

type MSCognitiveServicesResponse = {
status: 'NotStarted' | 'Running' | 'Failed' | 'Succeeded';
@@ -43,18 +44,17 @@ export class MicrosoftCognitiveExtractor extends OcrExtractorFactory {
private apiClient: AxiosInstance = null;

constructor(config: Config) {
super(config);
if (!process.env.OCP_APIM_SUBSCRIPTION_KEY) {
throw new Error(
`Required environment variable OCP_APIM_SUBSCRIPTION_KEY not found. Make sure you set it as 'OCP_APIM_SUBSCRIPTION_KEY=<API_KEY>' before running the tool.`,
);
}
super(config, credentials);
this.checkCredentials([
'OCP_APIM_SUBSCRIPTION_KEY',
'OCP_APIM_ENDPOINT',
]);

this.apiClient = axios.create({
baseURL: process.env.OCP_APIM_ENDPOINT || 'https://westeurope.api.cognitive.microsoft.com/',
baseURL: this.config.extractor.credentials.OCP_APIM_ENDPOINT,
headers: {
'Content-Type': 'application/octet-stream',
'Ocp-Apim-Subscription-Key': process.env.OCP_APIM_SUBSCRIPTION_KEY,
'Ocp-Apim-Subscription-Key': this.config.extractor.credentials.OCP_APIM_SUBSCRIPTION_KEY,
},
timeout: 20000,
});
@@ -0,0 +1,4 @@
{
"OCP_APIM_SUBSCRIPTION_KEY": "",
"OCP_APIM_ENDPOINT": "https://westeurope.api.cognitive.microsoft.com/"
}
@@ -60,6 +60,17 @@ export type CleanerConfig = Array<string | [string, object]>;
/**
 * Extractor settings.
 * NOTE(review): presumably the type of `config.extractor` — confirm against `Config`.
 */
export interface ExtractorConfig {
// Presumably the extractor used for PDF inputs — confirm against the code that selects it.
pdf: 'pdfminer' | 'tesseract' | 'abbyy' | 'pdfjs';
// Presumably the extractor used for image inputs — confirm against the code that selects it.
img: 'tesseract' | 'abbyy' | 'google-vision' | 'ms-cognitive-services' | 'amazon-textract';
// Optional credentials for the OCR services. Every key is optional here; each
// extractor verifies the keys it needs in its own constructor.
credentials?: {
ABBYY_SERVER_URL?: string; // ABBYY: server URL
ABBYY_SERVER_VER?: string; // ABBYY: server version
ABBYY_WORKFLOW?: string; // ABBYY: workflow name
AWS_ACCESS_KEY_ID?: string; // AWS TEXTRACT: passed to the client as accessKeyId
AWS_SECRET_ACCESS_KEY?: string; // AWS TEXTRACT: passed to the client as secretAccessKey
GOOGLE_APPLICATION_CREDENTIALS?: string; // GOOGLE VISION: checked to be the path of an existing file ending in 'json'
OCP_APIM_SUBSCRIPTION_KEY?: string; // MS COGNITIVE SERVICES: sent as the 'Ocp-Apim-Subscription-Key' header
OCP_APIM_ENDPOINT?: string; // MS COGNITIVE SERVICES: used as the API client's baseURL
};

// One language or a list of languages (presumably for Tesseract, going by the type name — confirm).
language: TesseractLanguage | TesseractLanguage[];
}

0 comments on commit 92c2d9e

Please sign in to comment.
You can’t perform that action at this time.