This notebook performs the final word tokenization for the LDA topic model. Its cleaning rules are derived from analyzing the output of earlier runs. Unlike the previous notebook, this one operates on one file at a time. It also saves the original bounding box and confidence score of each OCR'd word, which helps track the translations through the model back to a physical location on the PDF.

In [None]:
import json 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pathlib import Path
import regex as re

import sys
sys.path.append(str(Path.cwd().parent))
from utils import create_fh_logger

In [None]:
# Locations of the OCR'd json files (src), the tokenized output (dst), and the log directory.
src = Path.cwd().parent.parent / 'processing' / 'nro_declassified' / 'ocr'
dst = src.parent / 'tokenized'
# The tokenized json files are written into dst further down; create it up
# front so the first write does not fail with FileNotFoundError.
dst.mkdir(parents=True, exist_ok=True)
files = list(src.glob('*json'))
logs = src.parent.parent / 'logs'
logs.mkdir(exist_ok=True)
logger = create_fh_logger(logs / "tokenize_ocr.log")

In [None]:
def clean_text(txt):
    """Normalize a raw string before tokenization.

    Kept as its own function so additional cleaning steps can be added later.

    :param txt: text to clean
    :type txt: str
    :return: ``txt`` with every character other than ASCII letters, digits
        and spaces removed, then converted to lower case
    :rtype: str
    """
    # Strip first, lower-case second: some non-ASCII characters lower-case
    # to ASCII letters and must already be gone by then.
    return re.sub("[^A-Za-z0-9 ]+", '', txt).lower()
# Snowball stemmer for English, used to reduce each cleaned word to its stem.
stemmer = SnowballStemmer(language='english')

# NLTK's English stopwords followed by extra words to drop for this corpus.
# NOTE(review): 'fop' is presumably a common OCR misread of 'top' -- confirm.
# Kept as a list (duplicates included) so it can still be extended later.
stopword = stopwords.words('english') + [
    'secret', 'fop', 'top', 'classified', 'declassified', 'approved',
    'release', 'dod', 'general', 'page', 'via', 'would', 'throughout',
    'director', 'chief', 'page',
]

In [None]:
# Tokenize every OCR json file: clean, filter and stem each OCR'd word, and
# write the surviving words (with page, bounding box and confidence) to dst.

# Build the lookup once: set membership is O(1), whereas testing against the
# stopword list would rescan it for every single OCR'd word.
stopword_lookup = set(stopword)

for file in files:
    # Read with an explicit encoding instead of the platform default.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    logger.info(f'Analyzing {str(file).encode("utf-8")}')
    data = data[file.stem]  # the json is nested, with the pdf name (file stem) as the first key
    doc = []
    for pg_num, page in data.items():
        for i, raw_word in enumerate(page['text']):
            word = clean_text(raw_word)
            if word in stopword_lookup:
                continue
            word = stemmer.stem(word)
            # Drop empty results and words made of one repeated character.
            if len(set(word)) <= 1:
                continue
            # Keep only purely alphabetic stems longer than two characters.
            if len(word) <= 2 or not word.isalpha():
                continue
            # Bounding box and confidence share the word's index on the page.
            doc.append({
                'pg': pg_num,
                'word': word,
                'x': page['left'][i],
                'y': page['top'][i],
                'w': page['width'][i],
                'h': page['height'][i],
                'conf': page['conf'][i],
            })
    with open(dst / f'{file.stem}.json', 'w', encoding='utf-8') as f:
        json.dump(doc, f)
    logger.info(f'Finished analyzing {str(file).encode("utf-8")}')
