# Imports

In [27]:
import os
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
import cv2 
import pytesseract
import numpy as np
from matplotlib import pyplot as plt
import re
import PyPDF2 as pyPdf
import time
import random
import ocrutils
from PIL import Image


# Fetch all TIFs and organize them in a dict

In [2]:
# Collect the path of every file under the patent archive whose name ends in 'tif'.
tifs = []
for root, _subdirs, filenames in os.walk("/Volumes/Non-Backup_Files/US-patents/"):
    # Paths are built as '<directory>/<file name>'.
    tifs.extend('{}/{}'.format(root, name) for name in filenames if name.endswith('tif'))

In [3]:
# Number of TIF paths collected by the walk above.
len(tifs)

2404266

In [4]:
# Group the TIF paths in a dict. The key is the integer formed by concatenating
# the two directory names that directly precede the file name (for example
# '.../000/003/00000001.tif' gives 3). Paths that do not contain '/00/' or whose
# two directory names are not all digits are left out.
patent_dict = {}
for tif_path in tifs:
    if '/00/' not in tif_path:
        continue
    number_digits = ''.join(tif_path.split('/')[-3:-1])
    if not number_digits.isdigit():
        continue
    patent_dict.setdefault(int(number_digits), []).append(tif_path)
        

In [5]:
# One list of TIF paths per entry of patent_dict, in the dict's iteration order.
us_patents = list(patent_dict.values())

In [6]:
# Inspect one group to check that it holds the image paths of a single entry.
us_patents[2]

['/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000001.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000002.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000003.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000004.tif']

In [7]:
# Number of groups, i.e. number of distinct keys in patent_dict.
len(us_patents)

674807

# OCR Functionality

In [23]:
def OCR_US_patent(patent_nb_index, patent_dict, patent_list,
                  output_dir='/Volumes/Non-Backup_Files/US-patents/MachineReadableBaseline'):
    """OCR every image of one patent and write the text to ``<output_dir>/<patent_nb>.txt``.

    Parameters
    ----------
    patent_nb_index : int
        Position in ``patent_list`` of the entry to process.
    patent_dict : dict
        Maps a key from ``patent_list`` to the list of image paths to OCR, in order.
    patent_list : sequence
        Keys of ``patent_dict``; ``patent_list[patent_nb_index]`` selects the entry.
    output_dir : str, optional
        Directory that receives the text file. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    None
        The result is the file written to disk. Its content is the OCR output of
        each image, each preceded by a newline, in list order.
    """
    patent_nb = patent_list[patent_nb_index]

    page_texts = []
    for img_path in patent_dict[patent_nb]:
        # Close each image as soon as it has been processed so that open file
        # handles do not accumulate over a long run.
        # NOTE(review): pytesseract presumably re-saves a PIL image to a temporary
        # file in its original format before calling tesseract; if that TIFF
        # re-encoding is the source of "Error setting from dictionary", passing
        # the path string instead of the image may avoid it. Confirm.
        with Image.open(img_path) as img:
            page_texts.append(pytesseract.image_to_string(img))

    # Join once instead of re-formatting the growing string on every page.
    patent_str = ''.join('\n{}'.format(text) for text in page_texts)

    out_path = os.path.join(output_dir, '{}.txt'.format(patent_nb))
    # Explicit encoding: OCR output can contain non-ASCII characters.
    with open(out_path, "w", encoding="utf-8") as text_file:
        text_file.write(patent_str)
        
def OCR_US_patent_list(patent_dict, patent_list, timed=False):
    """Run ``OCR_US_patent`` on every index of ``patent_list``, optionally timing each call.

    Prints ``finished <index>`` for every index that is a multiple of 1000.
    Returns the list of per-call durations in seconds when ``timed`` is true,
    and ``None`` otherwise.
    """
    durations = [] if timed else None
    for idx in range(len(patent_list)):
        if timed:
            tick = time.time()
            OCR_US_patent(idx, patent_dict, patent_list)
            tock = time.time()
            durations.append(tock - tick)
        else:
            OCR_US_patent(idx, patent_dict, patent_list)
        if idx % 1000 == 0:
            print('finished {}'.format(idx))
    return durations

# OCR on small random sample to get an idea of complexity

In [24]:
# Time the sequential OCR on 15 randomly chosen keys of patent_dict.
# random.sample requires a sequence: on Python 3.11+ a dict key view raises
# TypeError, so the keys are materialised into a list first.
test_list = random.sample(list(patent_dict), 15)
times = OCR_US_patent_list(patent_dict, test_list, timed=True)

finished 0


In [25]:
# Mean seconds per patent in the timed sample, scaled to the number of groups
# in us_patents: an estimate, in seconds, of a full sequential run.
sum_times = sum(times)
sum_times / len(times) * len(us_patents)

8240856.361843363

The estimate above is in seconds: about 8.2 million seconds, i.e. roughly 95 days (about 3 months) to OCR all patents sequentially. Let's parallelize.

# Parallelize

In [8]:
# multiprocessing provides the worker pool created in the next cell; report how
# many CPUs it detects.
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  4


In [9]:
# Worker pool with one process per CPU reported by mp.cpu_count().
# NOTE(review): no cell visible here closes or joins this pool; confirm that it
# is cleaned up elsewhere.
pool = mp.Pool(mp.cpu_count())

In [11]:
# Submit 15 randomly chosen keys of patent_dict to the worker pool.
# random.sample requires a sequence: on Python 3.11+ a dict key view raises
# TypeError, so the keys are materialised into a list first.
test_list = random.sample(list(patent_dict), 15)
# The arguments of every task are pickled and sent to a worker process, so send
# only the sampled entries rather than the whole patent_dict with each task.
# NOTE(review): assumes ocrutils.OCR_US_patent only reads
# patent_dict[patent_list[patent_nb_index]]; confirm against ocrutils.
sample_dict = {patent_nb: patent_dict[patent_nb] for patent_nb in test_list}
result_objects = [
    pool.apply_async(ocrutils.OCR_US_patent, args=(i, sample_dict, test_list))
    for i in range(len(test_list))
]

In [12]:
# Wait for every submitted task and record how long the wait took, in seconds.
# NOTE(review): the tasks were submitted in the previous cell, so any work done
# before `start` is taken is not counted in `delay`.
start = time.time()
# .get() blocks until the task has finished.
[r.get() for r in result_objects]
end = time.time()
delay = end - start

In [29]:
# Measured wait per sampled patent, scaled to the number of groups in
# us_patents: an estimate, in seconds, of a full parallel run.
delay/len(test_list) * len(us_patents)

4557169.918809716

In [30]:
# Measured wait per sampled patent, in seconds.
delay/len(test_list)

6.753293784459432

Even with the pool of 4 worker processes, the estimate is about 4.6 million seconds (roughly 53 days, about a month and a half) to OCR all the patents. Let's take a subset.

# Take random sample of patents

In [32]:
# Submit 4000 randomly chosen keys of patent_dict to the worker pool.
# random.sample requires a sequence: on Python 3.11+ a dict key view raises
# TypeError, so the keys are materialised into a list first.
test_list = random.sample(list(patent_dict), 4000)
# The arguments of every task are pickled and sent to a worker process, so send
# only the sampled entries rather than the whole patent_dict with each task.
# NOTE(review): assumes ocrutils.OCR_US_patent only reads
# patent_dict[patent_list[patent_nb_index]]; confirm against ocrutils.
sample_dict = {patent_nb: patent_dict[patent_nb] for patent_nb in test_list}
result_objects = [
    pool.apply_async(ocrutils.OCR_US_patent, args=(i, sample_dict, test_list))
    for i in range(len(test_list))
]

In [33]:
# Wait for every submitted task and record how long the wait took, in seconds.
# NOTE(review): the tasks were submitted in the previous cell, so any work done
# before `start` is taken is not counted in `delay`.
start = time.time()
# .get() blocks until the task has finished; the RuntimeError shown in this
# cell's output surfaced during these calls.
[r.get() for r in result_objects]
end = time.time()
delay = end - start

RuntimeError: Error setting from dictionary