This notebook postprocesses the text that is obtained from the OCR. It then proceeds to extract the claims part of each patent through regular expressions.

In [1]:
import re
import os
from autocorrect import Speller
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter
import spacy
# Spell-checker from the `autocorrect` package; it is called on whole patent texts further down.
spell = Speller()
# spaCy English pipeline; used further down to tokenise patents and to measure claim lengths.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): absolute, machine-specific directory holding the patent .txt files — adjust before running elsewhere.
path_to_patent_txts = '/Volumes/Non-Backup_Files/US-patents/random_sample_seeded_txt (new tesseract)'

In [2]:
def get_patent_nbs(path):
    """
    function to get all text files in a given directory tree (in our case all patent numbers in a given directory)
    Args:
    path - the path to the directory (sub-directories are walked as well)
    Returns:
    patent_nbs - the list of file names with the '.txt' extension removed (the patent numbers, as strings)
    """
    patent_nbs = []
    # the walked directory gets its own name so that the `path` argument is not shadowed
    for _dirpath, _dirs, files in os.walk(path):
        for f in files:
            # require the real '.txt' extension (a name that merely ends in 'txt' is skipped)
            if f.endswith('.txt'):
                # strip only the trailing extension, not every '.txt' occurring in the name
                patent_nbs.append(f[:-len('.txt')])
    return patent_nbs

def read_patent(path, patent_nb):
    """
    function to read a given text file from a path constructed from the arguments
    Args:
    path - the path to the directory containing the text files
    patent_nb - the name of the text file without its '.txt' extension (in our case, the patent number)
    Returns:
    patent_txt - the patent in string form
    """
    # explicit encoding: the OCR output contains non-ASCII characters (curly quotes, dashes),
    # so the result must not depend on the platform's default locale encoding
    with open('{}/{}.txt'.format(path, patent_nb), 'r', encoding='utf-8') as f:
        patent_txt = f.read()
    return patent_txt

def remove_line_nbs(text):
    """
    function to remove the line numbers in a given patent
    A line number is taken to be exactly two digits directly followed by a newline;
    the digits and that newline are both removed.
    Args:
    text - the text with line numbers to be removed
    Returns:
    the text without the line numbers
    """
    # raw string: '\d' in a plain literal is an invalid escape sequence.
    # (?<!\d) keeps longer numbers intact: without it a year ending a line ('1876\n')
    # would lose its last two digits and be glued to the next line.
    return re.sub(r'(?<!\d)\d\d\n', '', text)

# Read every patent found under the input directory and strip its line numbers.
patent_nbs = get_patent_nbs(path=path_to_patent_txts)
patent_txts = {}
for patent_nb in patent_nbs:
    raw_text = read_patent(path=path_to_patent_txts, patent_nb=patent_nb)
    patent_txts[patent_nb] = remove_line_nbs(raw_text)

# patent_txts = {patent_nb: read_patent(path=path_to_patent_txts, patent_nb=patent_nb) for patent_nb in patent_nbs}

# with open('/Volumes/Non-Backup_Files/GB-patents/seeded_random_sample/ocr_results/430.txt', 'r') as f:
#     patent_example = f.read()
# print(patent_example)

In [3]:
def no_claims_cond(text,spellcheck=False):
    """
    function which evaluates whether the claims may be fetched using regular expressions
    Args:
    text - the patent in string form
    spellcheck - if True, the text is autocorrected before the claim markers are looked for
    Returns:
    a boolean indicating whether the regular expression condition is fulfilled or not (False if at least one claim marker is present, True if none is)
    """
    if spellcheck:
        text = spell(text)
    # the only case-sensitive marker; 'I do claim' is already covered by the
    # case-insensitive 'i do claim' below
    if 'Claim' in text:
        return False
    # lower-case once instead of once per marker
    lowered = text.lower()
    lower_markers = (
        'i do claim', 'we\nclaim', '[claim', 'i\nclaim', 'i claim', 'we claim',
        'j claim', 't claim', 'iclaim', 'claimed', '[ claim', 'claim.', 'claim—',
    )
    return not any(marker in lowered for marker in lower_markers)

In [96]:
#taking some patents to use for autocorrect later on
# spell-correct the first five patents and join them, one patent per newline-terminated chunk
corrected_parts = []
for i in range(5):
    corrected_parts.append(spell(patent_txts[patent_nbs[i]]) + '\n')
patent_txt_for_ac = ''.join(corrected_parts)

In [97]:
#saving the document to use for autocorrect
# the corrected corpus is written to the working directory and read back by the vocabulary cell
with open('patent_txt_for_ac.txt', 'w') as corpus_file:
    corpus_file.write(patent_txt_for_ac)

In [4]:
#getting all the words from the corrected patent corpus
with open('patent_txt_for_ac.txt', 'r') as f:
    file_name_data = f.read().lower()
# raw string: '\w' in a plain literal is an invalid escape sequence
words = re.findall(r'\w+', file_name_data)
#vocabulary
V = set(words)
print(f"The first ten words in the text are: \n{words[0:10]}")
print(f"There are {len(V)} unique words in the vocabulary.")

The first ten words in the text are: 
['united', 'states', 'patent', 'office', 'j', 'b', 'campbell', 'of', 'cincinnati', 'to']
There are 1017 unique words in the vocabulary.


In [5]:
#getting word frequencies to help with autocorrect
# Counter maps each corpus word to its number of occurrences
word_freq_dict = Counter(words)
print(word_freq_dict.most_common(10))

[('the', 579), ('of', 236), ('and', 158), ('to', 140), ('in', 122), ('a', 114), ('is', 84), ('as', 66), ('which', 60), ('by', 52)]


In [6]:
#getting the probability distribution on the corpus
# relative frequency of every word: its count divided by the total number of words
Total = sum(word_freq_dict.values())
probs = {word: count/Total for word, count in word_freq_dict.items()}

In [7]:
# relative frequency, in the corrected corpus, of the word we are interested in
probs['claim']

0.0013798541297062883

In [8]:
def adapted_autocorrect(input_word, V=V, probs=probs, word_freq_dict=word_freq_dict):
    """
    function to autocorrect a given word according to the vocabulary of a corpus
    Args:
    input_word - word to autocorrect (it is lower-cased before being compared)
    V - the vocabulary of the corpus to take into account (a set)
    probs - the probabilities of the words as computed from their frequencies in the corpus
    word_freq_dict - a Counter object for the vocabulary (no longer read; kept so existing calls keep working)
    Returns:
    -1 if the word is already in the vocabulary
    output - otherwise, the candidate with the highest bigram Jaccard similarity to the word (ties broken by the higher probability)
    """
    input_word = input_word.lower()
    if input_word in V:
        return -1
    # one scorer for the whole scan instead of a new object per vocabulary word
    jaccard = textdistance.Jaccard(qval=2)
    # words, probabilities and similarities all come from the same key sequence, so the
    # columns cannot get out of step when `probs` and `word_freq_dict` are ordered differently
    candidates = list(probs.keys())
    df = pd.DataFrame({
        'Word': candidates,
        'Prob': [probs[candidate] for candidate in candidates],
        'Similarity': [1 - jaccard.distance(candidate, input_word) for candidate in candidates],
    })
    output = df.sort_values(['Similarity', 'Prob'], ascending=False).reset_index()['Word'][0]
    return output

In [112]:
#testing claim extraction functions (by regex means before autocorrect)

# patents in which none of the claim markers is found, keyed by patent number
mysteries = {
    patent_nb: patent_txts[patent_nb]
    for patent_nb in patent_nbs
    if no_claims_cond(patent_txts[patent_nb])
}
# (a second, spell-checked pass over `mysteries` -- `big_mysteries` -- was tried here and is disabled)
    
    

In [113]:
# number of patents without a recognised claim marker; counted on `mysteries` directly
# because `keys` is only created in a later cell
len(mysteries)

163

In [123]:
# patent numbers of the unresolved patents, as a list so they can be indexed by position
keys = list(mysteries)

In [130]:
# pick one of the patents without a recognised claim marker to inspect by hand
keys[12]

'181250'

In [134]:
# show the full text of that patent
print(mysteries[keys[12]])

UNITED STATES
PATENT OFFICE.
 
JOSEPH W. DE CASTRO, OF NEWARK, NEW JERSEY.
IMPROVEMENT IN MANUFACTURING BOOTS AND SHOES,
Specification forming part of Letters Patent No. I 82,565, dated September 26, 1876; application filed
June 1, 1876.
Lo all whom it may concern:
Be it known that I, JosepH W. DE Castro,
of Newark, in the county of Essex, State
of New Jersey, have invented a certain Im-
provement in Boots and Shoes, of which the
following is a specification :
My invention consists in an improved mode
of stitching on the welt for boots and shoes to
Secure increased strength and protection.
‘Figure 1 is a side elevation of a shoe. Fig.
2 is'a cross-section, showing the relation of
the parts.
My boots or shoes are made by sewing the’
welt to the upper and to the insole in the
ordinary manner, but reversing the position
of the welt so as to attach it to the upper on
the upper edge of the’ welt. instead of the
lower edge, as in the ordinary manner; and
then the welt is turned ontward and u

In [13]:
# claim markers that are looked for without regard to case (written in lower case)
claim_substrings_lower = [
    'i do claim', 'we\nclaim', 'weclaim', '[claim', 'i\nclaim', 'i claim', 'we claim',
    'j claim', 't claim', 'iclaim', 'claimed', '[ claim', 'claim.', 'claim—',
]
# claim markers that are looked for with their exact capitalisation
claim_substrings_capital = ['Claim', 'I do claim']

# escaped versions, safe to join with '|' into one regular expression
claim_substrings_lower_re = list(map(re.escape, claim_substrings_lower))
claim_substrings_capital_re = list(map(re.escape, claim_substrings_capital))

In [14]:
def get_claims_for_patent(text, with_autocorrect=False):
    """
    function to extract the claims for a given patent
    Args:
    text - the patent in string form
    with_autocorrect - if False, the claim markers are looked for with regular expressions;
                       if True, every token is autocorrected against the corpus vocabulary and
                       the claims start at the first token that is corrected to 'claim'
    Returns:
    the claims in string form (the text from the first marker to its end), or -1 if no marker is found
    """
    if not with_autocorrect:
        # Search `text` itself, ignoring case, rather than text.lower(): lower-casing can
        # change the length of some Unicode strings, which would shift the match offsets
        # that are used to slice `text`.
        match = re.search('|'.join(claim_substrings_lower_re), text, flags=re.IGNORECASE)
        if match is None:
            # fall back to the case-sensitive markers
            match = re.search('|'.join(claim_substrings_capital_re), text)
        if match is None:
            return -1
        return text[match.start(0):]
    for token in nlp(text):
        try:
            autocorrect = adapted_autocorrect(token.text)
        except Exception:
            # best effort: a token that cannot be scored is skipped
            continue
        # adapted_autocorrect returns -1 for in-vocabulary tokens and a string otherwise
        if autocorrect == 'claim':
            return text[token.idx:]
    return -1

In [15]:
def get_all_claims(patent_nbs, patent_txts):
    """
    function to get the claims part of each patent
    Args:
    patent_nbs - a list consisting of the patent numbers
    patent_txts - a dictionary where the keys are the patent numbers in patent_nbs and the values are the patents in string form
    Returns:
    claims - a dictionary mapping the patent numbers whose claims were found to the claims part of the patent
    missing_claim_patents - a dictionary mapping the patent numbers whose claims could not be found to the full patent text
    """
    found = {}
    not_found = {}
    for position, patent_nb in enumerate(patent_nbs, start=1):
        text = patent_txts[patent_nb]
        # regular-expression pass first; the autocorrect pass is only tried when it finds nothing
        claim = get_claims_for_patent(text, with_autocorrect=False)
        if claim == -1:
            claim = get_claims_for_patent(text, with_autocorrect=True)
        if claim == -1:
            not_found[patent_nb] = text
        else:
            found[patent_nb] = claim
        print('patent {} done. {}/{}'.format(patent_nb, position, len(patent_nbs)))
    return found, not_found

In [16]:
# run the extraction over the whole sample; one progress line is printed per patent
claims, missing_claim_patents = get_all_claims(patent_nbs, patent_txts)

patent 100256 done. 1/2400
patent 101494 done. 2/2400
patent 101616 done. 3/2400
patent 10164 done. 4/2400
patent 101653 done. 5/2400
patent 101754 done. 6/2400
patent 101880 done. 7/2400
patent 102124 done. 8/2400
patent 102154 done. 9/2400
patent 102906 done. 10/2400
patent 103442 done. 11/2400
patent 103469 done. 12/2400
patent 104075 done. 13/2400
patent 104140 done. 14/2400
patent 104240 done. 15/2400
patent 104297 done. 16/2400
patent 104424 done. 17/2400
patent 104801 done. 18/2400
patent 105085 done. 19/2400
patent 105314 done. 20/2400
patent 105488 done. 21/2400
patent 105638 done. 22/2400
patent 105658 done. 23/2400
patent 105677 done. 24/2400
patent 106157 done. 25/2400
patent 106304 done. 26/2400
patent 10671 done. 27/2400
patent 107101 done. 28/2400
patent 107427 done. 29/2400
patent 107725 done. 30/2400
patent 107822 done. 31/2400
patent 107825 done. 32/2400
patent 107833 done. 33/2400
patent 108038 done. 34/2400
patent 108050 done. 35/2400
patent 108781 done. 36/2400
pat

patent 173889 done. 294/2400
patent 174291 done. 295/2400
patent 174698 done. 296/2400
patent 174755 done. 297/2400
patent 175048 done. 298/2400
patent 17505 done. 299/2400
patent 176248 done. 300/2400
patent 176608 done. 301/2400
patent 177136 done. 302/2400
patent 1778 done. 303/2400
patent 178243 done. 304/2400
patent 178549 done. 305/2400
patent 179103 done. 306/2400
patent 179760 done. 307/2400
patent 179825 done. 308/2400
patent 180531 done. 309/2400
patent 181250 done. 310/2400
patent 181352 done. 311/2400
patent 18142 done. 312/2400
patent 181903 done. 313/2400
patent 182235 done. 314/2400
patent 182565 done. 315/2400
patent 182789 done. 316/2400
patent 182856 done. 317/2400
patent 183019 done. 318/2400
patent 183282 done. 319/2400
patent 183877 done. 320/2400
patent 183882 done. 321/2400
patent 183903 done. 322/2400
patent 184009 done. 323/2400
patent 184328 done. 324/2400
patent 184480 done. 325/2400
patent 184538 done. 326/2400
patent 184636 done. 327/2400
patent 184739 done

patent 262070 done. 595/2400
patent 262156 done. 596/2400
patent 262361 done. 597/2400
patent 262567 done. 598/2400
patent 262666 done. 599/2400
patent 262949 done. 600/2400
patent 26313 done. 601/2400
patent 263198 done. 602/2400
patent 263221 done. 603/2400
patent 263249 done. 604/2400
patent 26328 done. 605/2400
patent 263743 done. 606/2400
patent 264042 done. 607/2400
patent 264053 done. 608/2400
patent 264666 done. 609/2400
patent 264956 done. 610/2400
patent 265092 done. 611/2400
patent 265175 done. 612/2400
patent 266055 done. 613/2400
patent 266413 done. 614/2400
patent 266757 done. 615/2400
patent 267539 done. 616/2400
patent 267715 done. 617/2400
patent 268173 done. 618/2400
patent 268375 done. 619/2400
patent 268802 done. 620/2400
patent 268849 done. 621/2400
patent 269137 done. 622/2400
patent 269347 done. 623/2400
patent 2694 done. 624/2400
patent 269900 done. 625/2400
patent 27015 done. 626/2400
patent 270211 done. 627/2400
patent 27122 done. 628/2400
patent 271354 done. 

patent 334581 done. 902/2400
patent 334744 done. 903/2400
patent 335197 done. 904/2400
patent 335407 done. 905/2400
patent 335693 done. 906/2400
patent 335820 done. 907/2400
patent 337235 done. 908/2400
patent 337350 done. 909/2400
patent 337396 done. 910/2400
patent 33772 done. 911/2400
patent 337728 done. 912/2400
patent 337992 done. 913/2400
patent 338342 done. 914/2400
patent 338486 done. 915/2400
patent 338544 done. 916/2400
patent 338562 done. 917/2400
patent 338918 done. 918/2400
patent 339492 done. 919/2400
patent 339561 done. 920/2400
patent 340083 done. 921/2400
patent 340490 done. 922/2400
patent 340623 done. 923/2400
patent 340719 done. 924/2400
patent 340728 done. 925/2400
patent 34074 done. 926/2400
patent 3408 done. 927/2400
patent 341132 done. 928/2400
patent 341390 done. 929/2400
patent 341427 done. 930/2400
patent 341569 done. 931/2400
patent 342615 done. 932/2400
patent 343179 done. 933/2400
patent 343313 done. 934/2400
patent 34342 done. 935/2400
patent 343845 done.

patent 415583 done. 1202/2400
patent 415725 done. 1203/2400
patent 415745 done. 1204/2400
patent 415766 done. 1205/2400
patent 416448 done. 1206/2400
patent 416658 done. 1207/2400
patent 417014 done. 1208/2400
patent 417095 done. 1209/2400
patent 417452 done. 1210/2400
patent 417527 done. 1211/2400
patent 41805 done. 1212/2400
patent 418107 done. 1213/2400
patent 418218 done. 1214/2400
patent 418239 done. 1215/2400
patent 418307 done. 1216/2400
patent 418560 done. 1217/2400
patent 418823 done. 1218/2400
patent 418837 done. 1219/2400
patent 419113 done. 1220/2400
patent 41931 done. 1221/2400
patent 419542 done. 1222/2400
patent 41966 done. 1223/2400
patent 419808 done. 1224/2400
patent 419835 done. 1225/2400
patent 420195 done. 1226/2400
patent 420644 done. 1227/2400
patent 420646 done. 1228/2400
patent 420782 done. 1229/2400
patent 420914 done. 1230/2400
patent 420994 done. 1231/2400
patent 421046 done. 1232/2400
patent 421263 done. 1233/2400
patent 421393 done. 1234/2400
patent 421626

patent 482536 done. 1497/2400
patent 482689 done. 1498/2400
patent 482787 done. 1499/2400
patent 482866 done. 1500/2400
patent 482882 done. 1501/2400
patent 483241 done. 1502/2400
patent 483457 done. 1503/2400
patent 48372 done. 1504/2400
patent 484347 done. 1505/2400
patent 484661 done. 1506/2400
patent 484904 done. 1507/2400
patent 484923 done. 1508/2400
patent 484970 done. 1509/2400
patent 485215 done. 1510/2400
patent 485512 done. 1511/2400
patent 485899 done. 1512/2400
patent 485987 done. 1513/2400
patent 48605 done. 1514/2400
patent 487289 done. 1515/2400
patent 487620 done. 1516/2400
patent 487803 done. 1517/2400
patent 487916 done. 1518/2400
patent 487993 done. 1519/2400
patent 488014 done. 1520/2400
patent 488083 done. 1521/2400
patent 488256 done. 1522/2400
patent 488377 done. 1523/2400
patent 488425 done. 1524/2400
patent 488655 done. 1525/2400
patent 489263 done. 1526/2400
patent 489359 done. 1527/2400
patent 489933 done. 1528/2400
patent 490293 done. 1529/2400
patent 49051

patent 555543 done. 1776/2400
patent 555725 done. 1777/2400
patent 556087 done. 1778/2400
patent 557026 done. 1779/2400
patent 557162 done. 1780/2400
patent 557240 done. 1781/2400
patent 557294 done. 1782/2400
patent 557554 done. 1783/2400
patent 557626 done. 1784/2400
patent 557752 done. 1785/2400
patent 557841 done. 1786/2400
patent 557847 done. 1787/2400
patent 557883 done. 1788/2400
patent 559118 done. 1789/2400
patent 559375 done. 1790/2400
patent 559504 done. 1791/2400
patent 559545 done. 1792/2400
patent 559947 done. 1793/2400
patent 56004 done. 1794/2400
patent 560316 done. 1795/2400
patent 560467 done. 1796/2400
patent 560875 done. 1797/2400
patent 560993 done. 1798/2400
patent 561005 done. 1799/2400
patent 561010 done. 1800/2400
patent 562072 done. 1801/2400
patent 562819 done. 1802/2400
patent 563018 done. 1803/2400
patent 563187 done. 1804/2400
patent 563200 done. 1805/2400
patent 563261 done. 1806/2400
patent 563561 done. 1807/2400
patent 563686 done. 1808/2400
patent 5637

patent 629070 done. 2074/2400
patent 629469 done. 2075/2400
patent 629524 done. 2076/2400
patent 62991 done. 2077/2400
patent 630607 done. 2078/2400
patent 630609 done. 2079/2400
patent 63062 done. 2080/2400
patent 632295 done. 2081/2400
patent 632694 done. 2082/2400
patent 632752 done. 2083/2400
patent 632762 done. 2084/2400
patent 632928 done. 2085/2400
patent 633011 done. 2086/2400
patent 633097 done. 2087/2400
patent 63326 done. 2088/2400
patent 633590 done. 2089/2400
patent 634073 done. 2090/2400
patent 634086 done. 2091/2400
patent 635069 done. 2092/2400
patent 635244 done. 2093/2400
patent 635428 done. 2094/2400
patent 635521 done. 2095/2400
patent 635568 done. 2096/2400
patent 635870 done. 2097/2400
patent 636358 done. 2098/2400
patent 63662 done. 2099/2400
patent 636690 done. 2100/2400
patent 636744 done. 2101/2400
patent 636804 done. 2102/2400
patent 636897 done. 2103/2400
patent 637094 done. 2104/2400
patent 637164 done. 2105/2400
patent 637165 done. 2106/2400
patent 63727 d

patent 89729 done. 2361/2400
patent 89989 done. 2362/2400
patent 89990 done. 2363/2400
patent 90033 done. 2364/2400
patent 90622 done. 2365/2400
patent 91092 done. 2366/2400
patent 91171 done. 2367/2400
patent 91381 done. 2368/2400
patent 92275 done. 2369/2400
patent 92558 done. 2370/2400
patent 92631 done. 2371/2400
patent 94027 done. 2372/2400
patent 94644 done. 2373/2400
patent 94732 done. 2374/2400
patent 94905 done. 2375/2400
patent 94912 done. 2376/2400
patent 94992 done. 2377/2400
patent 95552 done. 2378/2400
patent 95886 done. 2379/2400
patent 95904 done. 2380/2400
patent 95993 done. 2381/2400
patent 96357 done. 2382/2400
patent 96399 done. 2383/2400
patent 96446 done. 2384/2400
patent 97015 done. 2385/2400
patent 9741 done. 2386/2400
patent 97487 done. 2387/2400
patent 97810 done. 2388/2400
patent 9791 done. 2389/2400
patent 97973 done. 2390/2400
patent 97995 done. 2391/2400
patent 98030 done. 2392/2400
patent 98160 done. 2393/2400
patent 98203 done. 2394/2400
patent 98229 don

In [17]:
# key the extracted claims by integer patent number
claims = {int(patent_nb): claim_text for patent_nb, claim_text in claims.items()}
# number of patents for which no claim section was found
len(missing_claim_patents)

48

In [21]:
# OCR variants that are rewritten to 'i claim'
to_replace_i = [
    '[claim',
    'i\nclaim',
    'j claim',
    't claim',
    'iclaim',
    '[ claim',
]
# OCR variants that are rewritten to 'we claim'
to_replace_we = ['we\nclaim', 'weclaim']

# escaped versions, safe to join with '|' into one regular expression
to_replace_i_re = list(map(re.escape, to_replace_i))
to_replace_we_re = list(map(re.escape, to_replace_we))

In [32]:
def claim_postprocess(claim, with_autocorrect=False):
    """
    function to do additional postprocessing on the claims
    The claim is put on one line (newlines become spaces), lower-cased, and the OCR variants
    listed in to_replace_i / to_replace_we are rewritten to 'i claim' / 'we claim'.
    Args:
    claim - the claim in string form
    with_autocorrect - boolean meant to indicate whether to autocorrect the claims before returning them (currently not read by this function)
    Returns:
    claim - postprocessed claim in string form
    """
    claim = claim.replace('\n', ' ').lower()
    # A variant that starts with a letter is only rewritten when it starts a word; otherwise
    # 't claim' also fires inside 'what claim' / 'not claim' and yields 'whai claim' / 'noi claim'.
    guarded_i_re = [(r'(?<!\w)' + exp) if exp[:1].isalnum() else exp for exp in to_replace_i_re]
    claim = re.sub('|'.join(guarded_i_re),  'i claim', claim)
    claim = re.sub('|'.join(to_replace_we_re),  'we claim', claim)
    return claim

In [38]:
# normalise every extracted claim with claim_postprocess (keys are left unchanged)
claims = {k:claim_postprocess(v) for k,v in claims.items()}

In [39]:
# NOTE(review): `autocorrected_claims` is not assigned in any cell shown before this one — confirm
# where it comes from before re-running. `with_autocorrect` is accepted by claim_postprocess but not read by it.
autocorrected_claims = {k:claim_postprocess(v, with_autocorrect=True) for k,v in autocorrected_claims.items()}

In [40]:
# one row per patent number (the index), a single 'Claim Text' column, sorted by patent number
claims_df = pd.DataFrame.from_dict(claims,orient='index',columns=['Claim Text']).sort_index(ascending=True)
claims_df.head()

Unnamed: 0,Claim Text
430,i claim as my invention and desire to secure b...
1211,i claim as my invention and desire to secure b...
1645,"i claim as my invention, and desire to secure ..."
1778,i claim there- in as constituting my invention...
2265,i claim and for which i solicit an exclusive p...


In [41]:
# same layout as claims_df, built from the autocorrected claims
autocorrected_claims_df = pd.DataFrame.from_dict(autocorrected_claims,orient='index',columns=['Claim Text']).sort_index(ascending=True)
autocorrected_claims_df.head()


Unnamed: 0,Claim Text
430,i claim as my invention and desire to secure b...
1211,i claim as my invention and desire to secure b...
1645,"i claim as my invention, and desire to secure ..."
1778,i claim there- in as constituting my invention...
2265,i claim and for which i solicit an exclusive p...


In [6]:
# Reload both claim tables from pickle files in the working directory (files of the same
# names are written by the to_pickle cells of this notebook).
# NOTE(review): only unpickle files from a trusted source.
claims_df = pd.read_pickle('claims.pkl')
autocorrected_claims_df = pd.read_pickle('autocorrected_claims.pkl')

In [11]:
def get_lengths(row):
    """
    function to get the length of the claims and add them to a dataframe
    Args:
    row - the row containing a 'Claim Text' column
    Returns:
    row - the same row with a 'Claim Length' column containing the number of spaCy tokens in the claim (punctuation marks are tokens too)
    """
    # Only the tokenizer is needed for a token count, so the rest of the pipeline is skipped.
    # Assumes no pipeline component merges or splits tokens, which holds for the stock
    # "en_core_web_sm" pipeline.
    row['Claim Length'] =  len(nlp.make_doc(row['Claim Text']))
    return row

In [12]:
# add the 'Claim Length' column to both tables, one row at a time
claims_df = claims_df.apply(get_lengths,axis=1)
autocorrected_claims_df = autocorrected_claims_df.apply(get_lengths,axis=1)

In [13]:
# preview the first rows
claims_df.head()

Unnamed: 0,Date,Claim Text,Claim Length,Reference Count
430,1837,i claim as my invention and desire to secure b...,80,{'i': 1}
1211,1839,i claim as my invention and desire to secure b...,113,{'i': 1}
1645,1840,"i claim as my invention, and desire to secure ...",166,"{'i': 2, 'it': 1}"
1778,1840,i claim there- in as constituting my invention...,162,{'i': 2}
2265,1841,i claim and for which i solicit an exclusive p...,1133,"{'i': 6, 'me': 1, 'it': 9, 'they': 3, 'them': 1}"


In [14]:
# persist both tables to the working directory
claims_df.to_pickle('claims.pkl')
autocorrected_claims_df.to_pickle('autocorrected_claims.pkl')

In [50]:
# inspect the text of one patent (the keys are still strings at this point)
patent_txts['100256']

' \n \nGnited States Patent Office.\nJ. B. CAMPBELL, OF CINCINNATI, OHTO..-\nLetters Patent No. 100,256, dated March 1, 1870.\n——— rt em\n \nIMPROVED SPRING-BED BOTTOM.\ni\nThe Schedule referred to in these Letters Patent and making part of the same.\nTo all whom it may concern:\nBe it known that I, J. B, CAMPBELL, of Cincinnati,\nin the county of Hamilton, and State of Ohio, have\ninvented a new and useful Improvement in Bed-Bot-\ntoms; and i do hereby declare that the following is 2\nfull and exact description thereof, reference being had\nto the accompanying drawings making a part of this\nspecification. ‘\nFirst, I use a steel-wire spring, A, made in the\nshape of a spiral cone, running to a small circumfer-\nence at the top, and the end @ of the wire at the top\nof the spring is turned up vertically about one-quar-\nter of an inch or more, and the end of the wire at the\nbottem or large end of the spring is turned down ver-\ntically about one-half of an inch, or more, making a\nsp

In [52]:
# quick check that four consecutive digits are found next to a non-digit symbol
# (raw string: '\d' in a plain literal is an invalid escape sequence)
re.findall(r'\d\d\d\d', '§1234')

['1234']

In [56]:
# from here on patent numbers are integers, both in the list and as dictionary keys
patent_nbs = list(map(int, patent_nbs))
patent_txts = {int(patent_nb): text for patent_nb, text in patent_txts.items()}

In [61]:
# put the patent numbers in ascending order
patent_nbs = sorted(patent_nbs)

In [58]:
#extract dates
# exploration: print the patent number when no four-digit run is found, and the list of
# runs when there is more than one candidate
for patent_nb in patent_nbs:
    # raw string: '\d' in a plain literal is an invalid escape sequence
    matches = re.findall(r'\d\d\d\d', patent_txts[patent_nb])
    if not matches:
        print(patent_nb)
    elif len(matches) > 1:
        print(matches)

['1838', '1836']
['1839', '1111', '1105']
['1840', '1105']
['1841', '1111']
['1842', '1110']
['1848', '1105']
['1844', '1111', '1843']
['1848', '1846', '1846', '1111']
['1848', '1111']
['1849', '1859', '1105', '1913']
['1849', '1111', '1105']
['1849', '1113', '1111']
['1850', '1850']
['1851', '1111']
['1851', '1162', '1111', '1113']
['1851', '1851']
['1850', '1853']
['1856', '1105', '1855']
['1856', '1855', '1844', '1855']
['1856', '1111']
['1857', '1111', '1111']
['1857', '1111', '1113', '1111', '1130', '1111']
['1858', '1111']
['1858', '1858']
['1858', '1111']
['1858', '1111']
['1858', '1858']
['1858', '1105']
['1858', '1111', '1111']
['1859', '1860', '1911']
['1859', '1859']
['1859', '5000']
26328
['1860', '1859']
['1860', '1111']
['1860', '1868']
['1860', '1002']
['1860', '1119', '1860']
['8488', '1861']
32641
['1861', '1864']
['1861', '1861']
['1861', '1860']
['1862', '1857']
['1862', '1861', '1861']
['1863', '1861']
['1863', '1862', '1861']
['1863', '1862']
['1863', '1863']
['186

['1887', '1887']
['1887', '1887']
['1887', '1887']
['1897', '1886']
['1887', '1887', '1111', '1304']
['1887', '1887']
['1887', '1887', '1886']
['1887', '1886', '1002']
['1887', '1885', '1885']
['1286', '1887', '1886']
['1887', '1887']
['1897', '1887']
['1887', '1886']
['1887', '1887']
['1887', '1887']
['1887', '1886', '1111']
['1887', '1887', '1105', '1113']
['1887', '1887', '1887']
['1888', '1887', '1886', '1005']
['1888', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887', '1888', '1130']
['1888', '1887', '1130', '1130']
['1889', '1987']
['1888', '1887', '1886']
['1888', '1887', '1105']
['1888', '1887', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887', '1887', '1111', '3038']
['1888', '1111']
['1887', '1887', '1888']
['1888', '1887', '1887', '1113']
['1888', '1887']
['1888', '1888', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887']
['1888', '1887', '1887']
['1888', '1888']
['1888', '1887', '1887']
['1888', '1887', '1888']
['1

['1892', '1891', '1891']
['1892', '1891', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891', '1870']
['1892', '1891']
['1892', '1891', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891', '1005']
['1892', '1891']
['1892', '1890', '1111']
['1892', '1891', '1004']
['1892', '1891']
['1892', '1891', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891', '1879', '1879']
['1892', '1890', '1888', '1888', '1890', '1890', '1890', '1890', '1891', '1888', '1888', '1890', '1890', '1890', '1890', '1111', '3634', '1111']
['1899', '1892', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891']
['1882', '1891']
['1892', '1889', '1889']
['1892', '1891']
['1892', '1892']
['1892', '1891']
['1892', '1891']
['1892', '1891']
['1892', '1891', '1891']
['1892', '1891']
['1892', '1892']
['1892', '1892', '1891']
['1892', '1891', '1891']
['1892', '1892']
['1892', '1892']
['1892', '1891']
['1892', '1892']
['1892', '1891', '1891', '1891', '1002', '1111', '1891', '1891']
['1892'

['1900', '1899', '1111', '1111', '1899']
['1900', '1900']
['1900', '1899']
['1900', '1899', '1238', '4567', '1111', '1111', '1113']
['1900', '1898', '1900', '1898']
['1900', '1899']
['1900', '1898']
['1900', '1899']
['1900', '1899']
['1900', '1898', '1111', '1130']
['1900', '1899']
['1900', '1899', '1894', '1130', '6488', '1120', '1130']
['1900', '1900']
['1900', '1899', '1111', '1899', '1111']
['1900', '1899']
['1900', '1899', '1899']
['1900', '1896']
['1900', '1900']
['1900', '1898', '1111', '1898']
['1900', '1898', '1898']
['1900', '1899', '1113']
['1900', '1899', '1897']
['1900', '1898', '1111']
['1900', '1899']
['1900', '1899', '1899', '1111', '1899']
['1900', '1899']
['1900', '1900']
['1900', '1900']
['1900', '1899', '1130', '1900', '1130', '1899']
['1900', '1899']
['1900', '1900', '1900']
['1900', '1900', '1889', '1893']
['1900', '1899', '1899']
['1900', '1900', '1111']
['1900', '1899', '1111']
['1900', '1900', '1899', '1111', '1900']
['1900', '1900']
['1900', '1898']
['1900', '

In [73]:
def get_dates(patent_nbs, patent_txts, min_year=1801, max_year=1900):
    """
    function to get the dates of patents using regular expressions
    The date of a patent is the first run of four digits in its text that lies between
    min_year and max_year (both included).
    Args:
    patent_nbs - a list containing the patent numbers the dates of which we would like
    patent_txts - a dictionary containing the patent numbers as keys and the patents in string form as values
    min_year - the earliest year that is accepted as a date (default 1801)
    max_year - the latest year that is accepted as a date (default 1900)
    Returns:
    dates - a dictionary where the keys are the patent numbers present in patent_nbs and the values are the extracted dates (as integers)
    no_dates - a list of the patent numbers where the dates could not be properly extracted
    """
    no_dates = []
    dates = {}
    for patent_nb in patent_nbs:
        # raw string: '\d' in a plain literal is an invalid escape sequence
        candidates = (int(m) for m in re.findall(r'\d\d\d\d', patent_txts[patent_nb]))
        # first candidate inside the accepted window, or None when there is none
        year = next((c for c in candidates if min_year <= c <= max_year), None)
        if year is None:
            no_dates.append(patent_nb)
        else:
            dates[patent_nb] = year
    return dates, no_dates
# extract a year for every patent; no_dates collects the patents without a usable year
dates, no_dates = get_dates(patent_nbs, patent_txts)

In [74]:
# how many patents have no usable year
len(no_dates)

18

In [75]:
# manual extraction for the patents where the dates could not be extracted:
# list their numbers so that the dates can be looked up by hand
no_dates

[26328,
 32641,
 47817,
 53153,
 72128,
 79892,
 85378,
 95904,
 135524,
 136744,
 260640,
 291237,
 301845,
 348923,
 671094,
 673677,
 674606,
 675087]

In [76]:
# the keys of patent_txts were converted to integers earlier, so the lookup must use an
# integer key (a string key can only raise KeyError)
patent_txts[1673]

KeyError: '1673'

Note: the text of patent 260640 is empty, so no date is assigned to it.

In [79]:
# Years looked up by hand for the patents listed in no_dates (260640 is left out).
# 32641 was entered as 1961, which is outside the period covered by the sample; corrected to 1861.
dates_to_add = {
    26328 : 1859, 32641 : 1861, 47817 : 1865, 53153 : 1866, 72128 : 1867, 79892 : 1868,
    85378 : 1868, 95904 : 1869, 135524 : 1873, 136744 : 1873, 291237 : 1884, 301845 : 1884,
    348923 : 1886, 671094 : 1901, 673677 : 1901, 674606 : 1901, 675087 : 1901,
}
dates.update(dates_to_add)

In [80]:
# one-column table: index = patent number, 'Date' = extracted year
dates_df = pd.DataFrame.from_dict(dates,orient='index',columns=['Date'])

In [91]:
# autocorrected_claims_df = autocorrected_claims_df.drop(columns=['Date_x','Date_y'])
# claims_df = claims_df.drop(columns=['Date_x','Date_y'])

In [93]:
# merging to add the dates to the claims dataframe
# A 'Date' column left by a previous run is dropped first; otherwise re-running this cell
# produces 'Date_x'/'Date_y' duplicates. The merge is on the index (patent number) and keeps
# only the patents present in both tables.
claims_df = dates_df.merge(claims_df.drop(columns=['Date'], errors='ignore'),left_index=True, right_index=True)
autocorrected_claims_df = dates_df.merge(autocorrected_claims_df.drop(columns=['Date'], errors='ignore'),left_index=True, right_index=True)

claims_df.to_pickle('claims.pkl')
autocorrected_claims_df.to_pickle('autocorrected_claims.pkl')

# SCRAP

In [68]:
# print the claim text of patent 2265, one of the rows whose 'Claim Length' exceeds 1500,
# with surrounding whitespace stripped and newlines replaced by spaces
print(claims_df[claims_df['Claim Length'] > 1500]['Claim Text'][2265].strip().replace('\n',' '))

I claim and for which I solicit an exclusive property to be secured to me by Letters Patent. My improvements are generally to be con- fined to pavements of wood, but it will be apparent that they are applicable to those formed of prismatic blocks of stone, iron or other suitable materials—and by the term “ prismatic,” as herein applied, it is not in- tended to be understood that the blocks, com- posing the pavements, shall, in a strict geo- metrical sense, be regular prisms throughout their lengths, but that they may approxi- mate thereto, more or less, and in the tech- nical language of mechanics, be known by the above term. By my improved method of shaping and constructing the blocks, and by causing each to be supported by those about it, I am en- abled to give great strength and_ stability to the roadway, and by connecting each of said blocks to the other, on their sides in opposition, a. settlement of the same, and consequent derangement of the traveling surface are prevented, Figu

In [None]:
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Scratch experiment: single-word spelling suggestions with SymSpell, using the English
# frequency dictionary that is shipped inside the symspellpy package.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# lookup suggestions for single-word input strings
input_term = "apastraphee"  # misspelling of "apostrophe"
# max edit distance per lookup
# (max_edit_distance_lookup <= max_dictionary_edit_distance)
suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST,
                               max_edit_distance=2, include_unknown=True)
# display suggestion term, term frequency, and edit distance
for suggestion in suggestions:
    print(suggestion)

In [None]:
import nltk

# Scratch experiment: tokenise one of the patents without a recognised claim marker with NLTK
word_data = mysteries[keys[12]]
nltk_tokens = nltk.word_tokenize(word_data)
print (nltk_tokens)

In [None]:
# print every NLTK token whose best vocabulary match is 'claim'
for nltk_token in nltk_tokens:
    try:
        autocorrect = adapted_autocorrect(nltk_token)
    except Exception:
        # best effort: a token that cannot be scored is skipped
        continue
    # adapted_autocorrect returns -1 for in-vocabulary tokens and a plain string otherwise,
    # so the result is compared directly (it has no .reset_index())
    if autocorrect == 'claim':
        print(nltk_token)

In [None]:
import spacy

# Scratch experiment: load the "en_core_web_sm" spaCy pipeline again and tokenise the sample patent
nlp = spacy.load("en_core_web_sm")
doc = nlp(word_data)
# for token in doc:
#     print(token.text)

In [None]:
# print the text from the first spaCy token whose best vocabulary match is 'claim' onwards
# (once per such token)
for token in doc:
    try:
        autocorrect = adapted_autocorrect(token.text)
    except Exception:
        # best effort: a token that cannot be scored is skipped
        continue
    # adapted_autocorrect returns -1 for in-vocabulary tokens and a string otherwise
    if autocorrect == 'claim':
        print(word_data[token.idx:])

In [None]:
# for i in re.finditer('|'.join(claim_substrings_lower_re), patent_txts[patent_nbs[0]].lower()):
#     print(patent_txts[patent_nbs[0]][i.start(0):])

In [None]:
#         match = list(re.finditer('|'.join(claim_substrings_lower_re), patent_txts[patent_nbs[i]].lower()))
#         if match != []:
#             claims..update({patent_nb: patent_txts[patent_nbs[i]][match[0].start(0):]})
#         else:
#             match = list(re.finditer('|'.join(claim_substrings_capital_re), patent_txts[patent_nbs[i]]))
#             if match != []:
#                 claims.update({patent_nb: patent_txts[patent_nbs[i]][match[0].start(0):]})
#             else:
                