In [1]:
import topycal
import os
import glob
import random

# Directory that get_file_list() scans for *.txt files: the "afi_txt" folder
# directly under the current working directory at the time this cell runs.
INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    """Apply ``fn`` to every element of ``iterator`` on a thread pool.

    ``num_threads`` may be an int or a string holding an int. Every element
    is submitted as its own task; the pool is drained before any result is
    collected. Results are gathered via ``as_completed``, so their order is
    not guaranteed to match the input order. An exception raised by ``fn``
    propagates out of this function when that task's result is collected.
    """
    worker_count = int(num_threads) if isinstance(num_threads, str) else num_threads
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(fn, item) for item in iterator]
    # Leaving the ``with`` block waits for all tasks, so every future is done here.
    return [finished.result() for finished in as_completed(pending)]


In [3]:
def get_file_list(limit=500, shuffle=True):
    """List the ``*.txt`` paths found directly inside ``INSTR_PATH``.

    When ``shuffle`` is true the list is shuffled in place with
    ``random.shuffle`` before any truncation. A truthy ``limit`` keeps only
    the first ``limit`` entries; a falsy one (``None`` or ``0``) returns the
    whole list.
    """
    pattern = "{}/*.txt".format(INSTR_PATH)
    candidates = glob.glob(pattern)
    if shuffle:
        random.shuffle(candidates)
    return candidates[0:limit] if limit else candidates
    #data = myfile.read()
    
def read_file(fname):
    """Return the entire contents of ``fname`` as text.

    The file is opened in text mode with the default encoding and
    ``errors='replace'``, so undecodable bytes are substituted rather than
    raising an error.
    """
    with open(fname, errors='replace') as handle:
        contents = handle.read()
    return contents

In [4]:
# limit=None is falsy, so get_file_list() skips truncation and returns every
# matching path; shuffle keeps its default (True), so the order is random.
file_list = get_file_list(limit=None)


In [5]:
# Display one entry of the list to inspect what the paths look like.
file_list[3]

'/home/brian/afi_redux/afi_txt/afmd16.txt'

In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf8 :

import re
# Use pdfminer.six 
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(inpath, outpath=None):
    """Extract the text of a PDF with pdfminer and strip all newlines from it.

    Parameters:
        inpath: path of the PDF to read (opened in binary mode).
        outpath: optional destination path. When given, the extracted text is
            written there (text mode, default encoding) and ``outpath`` is
            returned instead of the text.

    Returns:
        ``outpath`` when it was supplied, otherwise the extracted text.

    Every ``'\\n'`` is removed outright (not replaced by a space), so text
    from adjacent lines is joined directly. Any exception raised while
    opening or parsing the PDF propagates to the caller; the converter and
    the in-memory buffer are closed in that case too.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as buffer:
        device = TextConverter(rsrcmgr, buffer, codec='utf-8', laparams=LAParams())
        try:
            with open(inpath, 'rb') as fd:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Empty page set and maxpages=0: presumably "all pages, no cap"
                # -- confirm against the pdfminer.six documentation.
                pages = PDFPage.get_pages(fd, set(), maxpages=0, caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse.
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        text = buffer.getvalue()
    output = text.replace('\n', '')
    if outpath:
        with open(outpath, 'w') as outf:
            outf.write(output)
        return outpath
    return output
    


In [7]:
import os

def get_out_name(in_name, out_path):
    """Build the ``.txt`` output path for ``in_name`` inside ``out_path``.

    Only the base name of ``in_name`` is used; its last extension (if any)
    is replaced by ``.txt`` and the result is joined onto ``out_path``.
    """
    stem, _extension = os.path.splitext(os.path.basename(in_name))
    return os.path.join(out_path, stem + ".txt")


In [8]:
# Try the output-name helper on one listed path to check the result.
get_out_name(file_list[3], "/home/brian/afi_redux/afi_txt")

'/home/brian/afi_redux/afi_txt/afmd16.txt'

In [15]:
def do_conversion(fname, out_prefix="/home/brian/afi_redux/afi_txt"):
    """Convert ``fname`` to text under ``out_prefix`` unless the output exists.

    Returns:
        - a "Skipping ..." message when the target file is already present;
        - the output path returned by ``convert_pdf_to_txt`` on success;
        - ``None`` when the conversion raises (the error is printed, not
          re-raised, so one bad file does not abort a batch run).
    """
    target = get_out_name(fname, out_prefix)
    if os.path.isfile(target):
        return "Skipping {}, exists".format(target)
    try:
        return convert_pdf_to_txt(fname, target)
    except Exception as err:
        print("While processing {}, encountered {}".format(fname,repr(err)))
    return None
    

In [16]:
#from multiprocessing.dummy import Pool
#pool = Pool(6)
#pool.map(do_conversion, file_list)

In [17]:
# Run do_conversion over every listed path on the thread pool (default of
# 6 workers); the cell displays the list of per-file return values.
# NOTE(review): file_list is globbed as *.txt from INSTR_PATH, and the default
# out_prefix looks like that same directory, so each output name appears to
# coincide with its input and is skipped -- confirm whether file_list should
# hold the source PDFs instead.
do_fn_on_iter(do_conversion, file_list)

['Skipping /home/brian/afi_redux/afi_txt/afi65-503.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi36-1901.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi17-212.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afm19-10.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/cfetp1u0x1.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi11-2hh-60v1.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afpam36-3210.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi36-2639.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afttp3-2.46_ip.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afman48-125.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi11-5ftv1.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/cfetp3e8x1.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/hoi36-3.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afh32-7084.txt, exists',
 'Skipping /home/brian/afi_redux/afi_txt/afi11-5ftv2.txt, exists',
 'Sk