# USPTO Patent Data Processing

[patent_id, title, date, abstract, kind, withdrawn, assignee_id, inventor_id, citations, claims]

https://docs.google.com/document/d/1xpC1TKHYXSZRjBNxHsy1LEjc3VYJYVsh4EQdVqIxKps/edit

In [3]:
import os
import time
from pprint import pprint
from tqdm import tqdm_notebook

In [4]:
def write_to_tsv(extracted, target):
    target.write(extracted)
    target.write('\n')
    
def init_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
    open(filename, 'a').close()

def check_file(filename):
    with open(filename,'r') as tsv:
        count = 0
        for line in tsv:
            count += 1
            if count == 11: break
            data = line.strip().split('\t')
            print(line)
            
def addDictToLine(line, dictionary):
    return line.replace("\n","") + '\t' + str(dictionary.get(data[0],"")).replace("\n","")

In [3]:
def extract_patent_tsv(data):
    patent_id = data[0]
    date = data[4]
    abstract = data[5]
    title = data[6]
    kind = data[7]
    withdrawn = data[10]
    
    extracted = '\t'.join([patent_id, title, date, abstract, kind, withdrawn])
    return extracted

init_file("patents.tsv")

with open("patents.tsv",'w') as after:
    with open("patent.tsv",'r') as tsv:
        count = 0
        for line in tsv:
            data = line.strip().split('\t')
            patent_type = data[1]
            if patent_type == 'utility':
                extracted = extract_patent_tsv(data)
                write_to_tsv(extracted, after)
                count += 1
                
print(count)

6025637


In [11]:
input_file = "patents.tsv"
output_file = "patents_a.tsv"
read_file = "rawassignee.tsv"

init_file(output_file)

dictionary = dict()
with open(input_file,'r') as before:
    with open(output_file,'w') as after:
        with open(read_file,'r') as tsv:
            for line in tsv:
                data = line.strip().split('\t')
                patent_id = data[1]
                assignee_first_name = data[5]
                assignee_last_name = data[6]
                assignee_person = data[5] + " " + data[6]
                assignee_organization = data[7]
                if len(assignee_person) > len(assignee_organization):
                    dictionary[patent_id] = assignee_person
                else:
                    dictionary[patent_id] = assignee_organization
            count = 0
            for line in before:
                data = line.strip().split('\t')
                newline = addDictToLine(line, dictionary)
                write_to_tsv(newline, after)    
                count += 1
print(count)

6025637


In [12]:
input_file = "patents_a.tsv"
output_file = "patents_i.tsv"
read_file = "rawinventor.tsv"

init_file(output_file)

dictionary = dict()
with open(input_file,'r') as before:
    with open(output_file,'w') as after:
        with open(read_file,'r') as tsv:
            for line in tsv:
                data = line.strip().split('\t')
                patent_id = data[1]
                inventor_name = data[4] + " " + data[5]
                dict_set = dictionary.get(data[0], set())
                dictionary[patent_id] = dict_set.add(inventor_name)
            pprint(len(dictionary.keys()))
            count = 0
            for line in before:
                data = line.strip().split('\t')
                newline = addDictToLine(line, dictionary)
                write_to_tsv(newline, after)    
                count += 1
                
print(count)

6646869
6025637


In [14]:
input_file = "patents_i.tsv"
output_file = "patents_c.tsv"
read_file = "uspatentcitation.tsv"

init_file(output_file)

dictionary = dict()
with open(input_file,'r') as before:
    with open(output_file,'w') as after:
        with open(read_file,'r') as tsv:
            for line in tsv:
                data = line.strip().split('\t')
                patent_id = data[1]
                citation_id = data[2]
                dict_set = dictionary.get(data[0], set())
                dictionary[patent_id] = dict_set.add(citation_id)
            pprint(len(dictionary.keys()))
            count = 0
            for line in before:
                data = line.strip().split('\t')
                newline = addDictToLine(line, dictionary)
                write_to_tsv(newline, after)    
                count += 1
                
print(count)

6316846
6025637


In [30]:
check_file("patents_i.tsv")

# find why top ones don't have inventor and claim

3930271	Golf glove	1976-01-06	 A golf glove is disclosed having an extra finger pocket between the index and middle finger pockets for securing one finger of one hand of a golf player between the fingers of the player's other hand. 	A	0	Hi-Kahng Trading Co.	

3930272	Crib leg lock	1976-01-06	 A lock for a height-adjustable crib or playpen requires two distinct manual operations in order to release the lock on each crib leg. Each lock includes a rigid metal bracket which engages around the extensible part of the leg and is pivotally connected to a crib corner post. The bracket has a nose which projects into one of a series of openings in the extensible part of the leg then the bracket is swung against the leg part to prevent movement of the leg part relative to the post. The upper edge of the bracket is slotted to receive a latch pivotally connected to the corner post just above the bracket. The latch drops into the slot when the nose is engaged in one of the openings so that the bracke

In [19]:
input_file = "patents_c.tsv"
output_file = "patents_cl.tsv"
read_file = "claim.tsv"
input_length = 6025637

init_file(output_file)

with open(input_file,'r') as before:
    with open(output_file,'w') as after:
        with open(read_file,'r') as tsv:
            count = 0
            for patent_line in tqdm_notebook(before, total=input_length):
                patent_data = patent_line.strip().split('\t')
                patent_id = patent_data[0]
                claims = []
                ccount = 0
                for claim_line in tsv:                    
                    ccount += 1
                    if patent_id in claim_line:
                        claim_data = claim_line.strip().split('\t')
                        if patent_id == claim_data[1]:
                            claims.append(claim_data[2])
                print(ccount)
                print(claims)
                
                
                new_data = [] * (len(patent_data)+1)
                new_data[len(new_data)-1] = '\n'.join(claims)
                print(new_data)
                new_line = '\t'.join(new_data) + '\n'
                write_to_tsv(newline, after)
                count += 1

print(count)

HBox(children=(IntProgress(value=0, max=6025637), HTML(value='')))

94162142
['A golf glove in accordance with claim 1 wherein said other finger receptacle is attached along its periphery to said index finger receptacle, said middle finger receptacle and said back surface.', "A golf glove adapted for use on one hand of a golf player comprising at least an index finger receptacle and further comprising a finger receptacle attached to the side of said index finger receptacle most nearly adjacent to the thumb and adapted to receive a finger of the golf player's other hand.", 'A glove comprising an index finger receptacle, a middle finger receptacle, a back surface extending in the direction of the wrist, a finger restraining strap extending between said receptacles, and a finger restraining strap attached to said back surface adjacent said index finger receptacle and said middle finger receptacle and aligned in a direction substantially parallel to said finger restraining strap extending between said finger receptacles.', 'A golf glove comprising at least

IndexError: list assignment index out of range

In [20]:
input_file = "patents_cl.tsv"
output_file = "patents_s.tsv"
read_file = "brf_sum_text.tsv"
input_length = 6025637

init_file(output_file)

with open(input_file,'r') as before:
    with open(output_file,'w') as after:
        with open(read_file,'r') as tsv:
            count = 0
            for patent_line in tqdm_notebook(before, total=input_length):
                patent_data = patent_line.strip().split('\t')
                patent_id = patent_data[0]
                for sum_line in tsv:                    
                    if patent_id in sum_line:
                        sum_data = sum_line.strip().split('\t')
                        if patent_id == sum_data[1]:
                            summary = sum_data[2]
                new_line = addTextToLine(summary, patent_line)
                write_to_tsv(newline, after)
                count += 1

print(count)

HBox(children=(IntProgress(value=0, max=6025637), HTML(value='')))

0
