Let's rotate the images and remove the images that are impossible to OCR.

In [13]:
import json 
import numpy as np 
import pandas as pd
from pathlib import Path

import sys
sys.path.append(str(Path.cwd().parent.parent))
from utils import create_fh_logger

In [36]:
# Locations of the OCR JSON files and of the directory the results are written to.
src = Path.cwd().parent.parent.parent.parent / 'processing' / 'nro_declassified' / 'ocr'
# Match only real `.json` files (the bare '*json' pattern also accepts any name
# that merely ends in "json") and sort them: glob yields entries in arbitrary,
# filesystem-dependent order, which would otherwise leak into the order of the
# document list written out at the end of the notebook.
files = sorted(src.glob('*.json'))
output_loc = src.parent / 'good_docs'
output_loc.mkdir(exist_ok=True)

In [7]:
# Pool every OCR confidence value from every page of every file into one flat list.
confs = []
for path in files:
    payload = json.loads(path.read_text())
    pages = payload[path.stem]  # the JSON is nested: the PDF name (file stem) is the top-level key
    for page in pages.values():
        confs += page['conf']

In [8]:
# Summary statistics over the pooled confidence values from all files.
# (pandas is already imported as `pd` in the imports cell at the top.)
pd.Series(confs).describe()

count    1.174540e+07
mean     4.790513e+01
std      4.216411e+01
min     -1.000000e+00
25%     -1.000000e+00
50%      4.800000e+01
75%      9.500000e+01
max      9.700000e+01
dtype: float64

So, most of our data isn't acceptable. Is the problem file-based (i.e., should we just remove whole files), or is it spread throughout the corpus?

In [16]:
# Second pass over the files: keep the confidence values grouped per document
# (keyed by file stem) so each document can be judged on its own.
confs = {}
for path in files:
    payload = json.loads(path.read_text())
    pages = payload[path.stem]  # the JSON is nested: the PDF name (file stem) is the top-level key
    confs[path.stem] = [value for page in pages.values() for value in page['conf']]

In [26]:
# Flag every document whose median OCR confidence is below 90.
# NOTE(review): the pooled summary shows a minimum (and 25% quartile) of -1;
# those look like sentinel values rather than real confidences (presumably
# Tesseract's marker for non-word rows -- TODO confirm). They are included in
# the median here, which pulls it down.
bad_docs = []
for key, val in confs.items():
    doc_confs = np.array(val)
    # A document with no confidence values has no median: np.median([]) is nan
    # and `nan < 90` is False, so it would silently escape the "bad" list.
    # Count it as bad explicitly.
    if doc_confs.size == 0 or np.median(doc_confs) < 90:
        bad_docs.append(key)

In [28]:
# Report how many document names are currently collected in `bad_docs`.
print(f'There are {len(bad_docs)} bad documents')

There are 1354 bad documents


In [29]:
# Same check with a looser cutoff: flag documents whose median OCR confidence is below 70.
bad_docs = []
for key, val in confs.items():
    doc_confs = np.array(val)
    # np.median([]) is nan and `nan < 70` is False, so a document without any
    # confidence values must be counted as bad explicitly.
    if doc_confs.size == 0 or np.median(doc_confs) < 70:
        bad_docs.append(key)

In [31]:
# Report how many document names are currently collected in `bad_docs`.
print(f'There are {len(bad_docs)} awful documents')

There are 952 awful documents


In [32]:
# Loosest cutoff: flag documents whose median OCR confidence is below 50.
bad_docs = []
for key, val in confs.items():
    doc_confs = np.array(val)
    # np.median([]) is nan and `nan < 50` is False, so a document without any
    # confidence values must be counted as bad explicitly.
    if doc_confs.size == 0 or np.median(doc_confs) < 50:
        bad_docs.append(key)
print(f'There are {len(bad_docs)} unsalvageable documents')

There are 681 unsalvageable documents


In [33]:
# Final split: a document is "good" only if its median OCR confidence is at
# least 90; everything else is "bad". Only the good documents are kept.
# NOTE(review): the pooled summary shows a minimum (and 25% quartile) of -1;
# those look like sentinel values rather than real confidences (presumably
# Tesseract's marker for non-word rows -- TODO confirm). They are included in
# the median here, which pulls it down.
bad_docs = []
good_docs = []
for key, val in confs.items():
    doc_confs = np.array(val)
    # A document with no confidence values has no median: np.median([]) is nan
    # and `nan < 90` is False, which previously dropped such a document into
    # the `else` branch, i.e. into good_docs. Treat it as bad instead.
    if doc_confs.size == 0 or np.median(doc_confs) < 90:
        bad_docs.append(key)
    else:
        good_docs.append(key)  # let's only keep the good documents

In [39]:
# Persist the names of the documents that passed the quality cut.
analysis_path = output_loc / 'analyze.json'
with analysis_path.open('w') as f:
    json.dump({'documents': good_docs}, f)