This notebook is the first attempt at OCR'ing US patents. It times the OCR process and creates baseline versions of the text file for US Patents in a parallel fashion.

# Imports

In [1]:
import sys
sys.path.insert(1, '../')
import os
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
import cv2 
import pytesseract
import numpy as np
from matplotlib import pyplot as plt
import re
import PyPDF2 as pyPdf
import time
import random
import ocrutils
from PIL import Image


# Fetch all TIF's and organize them in a dict

In [None]:
#fetch all paths to tif's
tifs = []
for path, dirs, files in os.walk("/Volumes/Non-Backup_Files/US-patents/"):
    for f in files:
        if f.endswith('tif'):
            tifs.append('{}/{}'.format(path, f))

In [None]:
len(tifs)

In [None]:
patent_dict = {}
for tif_path in tifs:
    if '/00/' in tif_path and ''.join(tif_path.split('/')[-3:-1]).isdigit():
#         print(tif_path)
#         print(int(''.join(tif_path.split('/')[-3:-1])))
        if int(''.join(tif_path.split('/')[-3:-1])) in patent_dict:
            patent_dict[int(''.join(tif_path.split('/')[-3:-1]))].append(tif_path)
        else:
            patent_dict.update({int(''.join(tif_path.split('/')[-3:-1])): [tif_path]})
        

In [5]:
us_patents = list(patent_dict.values())

In [6]:
us_patents[2]

['/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000001.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000002.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000003.tif',
 '/Volumes/Non-Backup_Files/US-patents/17900731_18641101_yb2_D00001/00000001-X009741H/00/000/003/00000004.tif']

In [7]:
len(us_patents)

674807

# OCR Functionality

In [23]:
def OCR_US_patent(patent_nb_index, patent_dict, patent_list, output_path='/Volumes/Non-Backup_Files/US-patents/MachineReadableBaseline'):
    """
    function to OCR a US patent
    Args:
    patent_nb_index - the index of the patent number in the patent list
    patent_dict - a dictionary such that the keys are patent numbers and the values are lists of the paths to the tif files of the patent number
    patent_list - a list containg the patent numbers
    output_path - the path to put the output in
    Returns:
    None but the function writes the text file in the output file
    """
    patent_str = ''
    patent_nb = patent_list[patent_nb_index]
    
    for img_index in range(len(patent_dict[patent_nb])):
        img = Image.open(patent_dict[patent_nb][img_index])
        str_from_img = pytesseract.image_to_string(img)
        patent_str = '{}\n{}'.format(patent_str, str_from_img)
        
    with open('{}/{}.txt'.format(output_path, patent_nb), "w") as text_file:
        text_file.write("%s" % patent_str)
        
def OCR_US_patent_list(patent_dict, patent_list, timed=False, output_path='/Volumes/Non-Backup_Files/US-patents/MachineReadableBaseline'):
    """
    function to OCR a list of US patents
    Args:
    patent_dict - a dictionary such that the keys are patent numbers and the values are lists of the paths to the tif files of the patent number
    patent_list - a list containg the patent numbers
    timed - whether to time each OCR process
    output_path - the path to put the output in
    Returns:
    a list of the times it took to OCR each patent if timed is True, None otherwise
    but the function writes the text file in the output file
    """
    if not timed:
        for patent_nb_index in range(len(patent_list)):
            OCR_US_patent(patent_nb_index, patent_dict, patent_list, output_path)
            if(patent_nb_index % 1000 == 0):
                print('finished {}'.format(patent_nb_index))
    else:
        times = []
        for patent_nb_index in range(len(patent_list)):
            start = time.time()
            OCR_US_patent(patent_nb_index, patent_dict, patent_list, output_path)
            end = time.time()
            times.append(end - start)    
            if(patent_nb_index % 1000 == 0):
                print('finished {}'.format(patent_nb_index))
        return times

# OCR on small random sample to get an idea of complexity

In [24]:
test_list = random.sample(patent_dict.keys(), 15)
times = OCR_US_patent_list(patent_dict, test_list, timed=True)

finished 0


In [25]:
sum_times = 0
for i in times:
    sum_times += i
sum_times/len(times) * len(us_patents)

8240856.361843363

We would need about 3 months to cover all patents. Let's parallelize.

# Parallelize

In [2]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  4


In [3]:
pool = mp.Pool(mp.cpu_count())

In [11]:
test_list = random.sample(patent_dict.keys(), 15)
result_objects = [pool.apply_async(ocrutils.OCR_US_patent, args=(i, patent_dict, test_list)) for i in range(len(test_list))]

In [12]:
start = time.time()
[r.get() for r in result_objects]
end = time.time()
delay = end - start

In [29]:
delay/len(test_list) * len(us_patents)

4557169.918809716

In [30]:
delay/len(test_list)

6.753293784459432

It would take about a month and a half to OCR all the patents. Let's take a subset.

# Take random sample of patents

In [13]:
random.seed(42)

In [14]:
seeded_list = random.sample(patent_dict.keys(), 2400)

In [None]:
from shutil import copyfile
for patent_number in seeded_list:
    os.mkdir('/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/{}'.format(patent_number))
    for img_path in patent_dict[patent_number]:
        copyfile(img_path, '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/{}/{}'.format(patent_number, img_path.split('/')[-1]))
    

In [26]:
#fetch all paths to tif's
seeded_list = []
for path, dirs, files in os.walk("/Volumes/Non-Backup_Files/US-patents/random_sample_seeded"):
    for f in files:
        if f.endswith('tif'):
            seeded_list.append('{}/{}'.format(path, f))

In [4]:
seeded_dict = {}
for patent_nb in os.listdir("/Volumes/Non-Backup_Files/US-patents/random_sample_seeded"):
    for patent_nb_pg in os.listdir("/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/{}".format(patent_nb)):
        if int(patent_nb) in seeded_dict.keys():
            seeded_dict[int(patent_nb)].append("/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/{}/{}".format(patent_nb,patent_nb_pg))
        else:
            seeded_dict.update({int(patent_nb): ["/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/{}/{}".format(patent_nb,patent_nb_pg)]})
                                   

In [5]:
seeded_key_list = list(seeded_dict.keys())

In [6]:
result_objects = [pool.apply_async(ocrutils.OCR_US_patent, args=(i, seeded_dict, seeded_key_list, "/Volumes/Non-Backup_Files/US-patents/random_sample_seeded_txt")) for i in range(len(seeded_key_list))]

In [7]:
start = time.time()
[r.get() for r in result_objects]
end = time.time()
delay = end - start

RuntimeError: Error setting from dictionary

In [9]:
# img = Image.open('/Users/andrealphonse/Downloads/00000002.png')
# str_from_img = pytesseract.image_to_string(img)
# with open('TesseractTest_430/02.txt', "w") as text_file:
#     text_file.write("%s" % str_from_img)
# img = Image.open('/Users/andrealphonse/Downloads/00000001.png')
# str_from_img = pytesseract.image_to_string(img)
# with open('TesseractTest_430/01.txt', "w") as text_file:
#     text_file.write("%s" % str_from_img)

# Tweaking OCR and some NLP trials

In [2]:
import spacy
import neuralcoref
import autocorrect
from autocorrect import Speller
spell = Speller()

In [3]:
# FIG_PATH = '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/430/00000001.tif'
# TEXT_PATH = '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/430/00000002.tif'
FIG_PATH = '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/604/00000001.tif'
TEXT_PATH = '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded/604/00000002.tif'
fig_im = Image.open(FIG_PATH)
text_im = Image.open(TEXT_PATH)
fig_array = np.array(fig_im)
text_array = np.array(text_im)

In [22]:
patent_str = pytesseract.image_to_string(text_array, config='--psm 1')

In [23]:
print(patent_str)

‘

Unrrep STATES

PaTENT OFFICE.

 

WILLIAMS T. SPROUSE, OF SANGAMON, ILLINOIS.

‘IMPROVEMENT IN PLOWS.

Specification forming part of Letters Patent No. 604, dated February 15, 1838.

 

To all whom it may concern:

Be it known that I, WiLL1Ams T, SPROUSE,
of the county of Sangamon and State of Ili-
- nois, have discovered a new and useful Im-
provement in the Manutacture of Plows; and
i do hereby declare that the following is a full
and exact description of said improvement and
of the process of making the same.

The improved: plow is called “Sprouse’s

plow,” and. differs from all other plows in the

manner as wellof making the-irons as of stock-
ing them, the mold-board and bar being made
out of a single piece of iron without welding,
thus: Take a plow-plate, square at one end
and of the proper dimensions—say twelve (12)
inches broad, the upper edge eighteen (18)
inches, and the lower ed geor share twelve (12)
inches long. Draw a diagonal line across said
plate from a poiut on the

In [19]:
# NLP to detect claims

In [24]:
# spell("I'm not sleapy and tehre is no place I'm giong to.")
# "I'm not sleepy and there is no place I'm going to."
patent_str_corr = spell(patent_str)

In [25]:
print(patent_str_corr)

‘

Unfree STATES

PaTENT OFFICE.

 

WILLIAM T. SPROUSE, Of SANGAMON, ILLINOIS.

‘IMPROVEMENT In PLOS.

Specification forming part of Letters Patent No. 604, dated February 15, 1838.

 

To all whom it may concern:

Be it known that I, With1As T, SPROUSE,
of the county of Sangamon and State of Ili-
- ois, have discovered a new and useful Im-
movement in the Manufacture of Flows; and
i do hereby declare that the following is a full
and exact description of said improvement and
of the process of making the same.

The improved: plow is called “Spouse’s

plow,” and. differs from all other flows in the

manner as well making the-irons as of stock-
ing them, the mold-board and bar being made
out of a single piece of iron without welding,
thus: Take a plow-plate, square at one end
and of the proper dimensions—say twelve (12)
inches broad, the upper edge eighteen (18)
inches, and the lower ed geo share twelve (12)
inches long. Draw a diagonal line across said
plate from a point on the upper ed

In [26]:
nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp)
example_doc = nlp(patent_str_corr)

In [43]:
index = 0
nounIndices = []
for token in example_doc:
    # print(token.text, token.pos_, token.dep_, token.head.text)
#     if token.pos_ == 'PRON':
    if token.pos_ == 'VERB':
        
        nounIndices.append(index)
    index = index + 1

In [56]:
print(example_doc[nounIndices[-5]:nounIndices[-5]+10][6].pos_)

NOUN


In [46]:
for pronoun in nounIndices:
#     if(example_doc[pronoun].text == 'claim'):
    print(example_doc[pronoun])

forming
dated
may
known
discovered
declare
following
said
making
called
making
ing
made
welding
Take
say
share
Draw
said
said
leaving
Cut
fencing
said
Make
said
said
Bend
detached
said
cutting
completed
made
made
stocked
placed
attached
passing
directed
made
said
extended
fastened
extending
dle
fastened
attached
said
known
described
cutting
making


In [10]:
example_doc._.coref_clusters

[The improved: plow: [The plow, The improved: plow],
 the sheth: [the sheth, the sheth],
 one-half the distance: [one-half the distance, them]]

In [11]:
for entity in example_doc.ents:
    print(entity.label_, ' | ', entity.text)

PERSON  |  WILLIAMS T. SPROUSE
GPE  |  SANGAMON
GPE  |  ILLINOIS
ORG  |  PLOWS
WORK_OF_ART  |  Letters Patent No
CARDINAL  |  604
DATE  |  February 15, 1838
PERSON  |  WittiAms T. SpRous
GPE  |  Sangamon
ORG  |  State
GPE  |  the Manutacture of Plows
CARDINAL  |  two
WORK_OF_ART  |  Sprouse
CARDINAL  |  twelve
CARDINAL  |  12
CARDINAL  |  18
CARDINAL  |  twelve
CARDINAL  |  12
PERSON  |  WILLIAMS
CARDINAL  |  six
QUANTITY  |  6) inches
CARDINAL  |  twelve
CARDINAL  |  12
CARDINAL  |  one
QUANTITY  |  half-square
CARDINAL  |  two
CARDINAL  |  one-half
GPE  |  Springfield
DATE  |  November 30, 1836
QUANTITY  |  half-inch
CARDINAL  |  about one and a half
PERSON  |  WALLIAMS T. SPROUSE
PERSON  |  JESSE B. THOMAS,
PERSON  |  JAMES BARKINSON


In [59]:
import nltk
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/Users/andrealphonse/nltk_data'
    - '/Users/andrealphonse/anaconda/envs/patentproj/nltk_data'
    - '/Users/andrealphonse/anaconda/envs/patentproj/share/nltk_data'
    - '/Users/andrealphonse/anaconda/envs/patentproj/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
