-
Notifications
You must be signed in to change notification settings - Fork 301
/
defaultConfig.json
118 lines (118 loc) · 2.09 KB
/
defaultConfig.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
{
"version": 0.9,
"extractor": {
"pdf": "pdfminer",
"ocr": "tesseract",
"language": ["eng", "fra"]
},
"cleaner": [
"drawing-detection",
[
"image-detection",
{
"ocrImages": false
}
],
"out-of-page-removal",
[
"whitespace-removal",
{
"minWidth": 0
}
],
[
"redundancy-detection",
{
"minOverlap": 0.5
}
],
[
"table-detection",
{
"checkDrawings": true,
"runConfig": [
{
"pages": [],
"flavor": "lattice"
}
]
}
],
[
"table-detection-2",
{
"runConfig": [
{
"pages": []
}
]
}
],
[
"header-footer-detection",
{
"ignorePages": [],
"maxMarginPercentage": 8
}
],
"link-detection",
"words-to-line-new",
[
"reading-order-detection",
{
"minVerticalGapWidth": 5,
"minColumnWidthInPagePercent": 15
}
],
[
"lines-to-paragraph",
{
"tolerance": 0.25
}
],
[
"table-of-contents-detection",
{
"pageKeywords": ["pagina", "page", "pag"]
}
],
"ml-heading-detection",
"list-detection",
"page-number-detection",
"hierarchy-detection",
[
"regex-matcher",
{
"isCaseSensitive": true,
"isGlobal": true,
"queries": [
{
"label": "Car",
"regex": "([A-Z]{2}\\-[\\d]{3}\\-[A-Z]{2})"
},
{
"label": "Age",
"regex": "(\\d+)[ -]*(ans|jarige)"
},
{
"label": "Percent",
"regex": "([\\-]?(\\d)+[\\.\\,]*(\\d)*)[ ]*(%|percent|pourcent|procent|per)"
}
]
}
]
],
"output": {
"granularity": "word",
"includeMarginals": false,
"includeDrawings": false,
"formats": {
"json": true,
"text": true,
"csv": true,
"markdown": true,
"pdf": false,
"simpleJson": true
}
}
}