In [6]:
import lxml.etree as etree
import pandas as pd
from os import listdir
import gzip

In [8]:
# Directory holding the gzip-compressed USPTO grant files. Each line of a file
# is parsed below as one standalone XML document.
basedir = "/Users/gittens/Downloads/uspto-grants"
# listdir() returns entries in arbitrary order, so sort before slicing to make
# the selected subset of files reproducible between runs.
flist = [basedir + "/" + fname for fname in sorted(listdir(basedir))]
# NOTE(review): the offset 53 looks like a resume point from an earlier partial
# run -- confirm it still selects the intended files now that the list is sorted.
flist = flist[53:]
parser = etree.XMLParser(remove_blank_text=True)

#TODO: should go over entire dataset (all files) once to determine future size of dataframe, then make it
# disk backed and populate it with another pass

# Sets meant for collecting distinct citation categories / classification codes;
# nothing in this cell populates them at the moment.
citationTypes = set()
patentTypes = set()

# Bookkeeping counters.
numLines = 0                    # lines read across all files
numDesigns = 0                  # records skipped because appl-type == 'design'
numWithoutClassification = 0    # records skipped for lack of a usable IPCR code
skippedCount = 0                # lines that failed to parse as XML

# Parallel lists: index i of every list describes the same retained patent.
patentApplicationDocNumbers = []
applicationDates = []
applicationTypes = []
publishedPatentDocNumbers = []
patentPublicationDates = []
ipcrTypes = []
inventionTitles = []
abstracts = []
examinerCitationsList = []
applicantCitationsList = []
originalFileNames = []
originalLineNumbers = []

# Shared XPath prefixes for the lookups below.
bibPrefix = "us-bibliographic-data-grant/"
ipcrPrefix = bibPrefix + "classifications-ipcr/classification-ipcr/"

for (filenum, fname) in enumerate(flist):
    with gzip.open(fname, 'rb') as fin:
        print("on file {0} of {1}, have {2} patents".format(filenum+1, len(flist), len(patentApplicationDocNumbers)))
        for (lineNumber, line) in enumerate(fin):
            numLines += 1
            try:
                root = etree.fromstring(line, parser)
            except etree.XMLSyntaxError:
                # Not a well-formed XML document on this line: count it and move on.
                skippedCount += 1
                continue

            # First IPCR classification of the record (find() returns the first match).
            ipcrSection = root.find(ipcrPrefix + "section")
            ipcrClass = root.find(ipcrPrefix + "class")
            ipcrSubclass = root.find(ipcrPrefix + "subclass")
            ipcrParts = (ipcrSection, ipcrClass, ipcrSubclass)
            if any(part is None or part.text is None for part in ipcrParts):
                # No complete section/class/subclass triple: record is skipped.
                numWithoutClassification += 1
                continue

            # Keep only records whose first IPCR code is in section H, class 04.
            if ipcrSection.text != 'H' or ipcrClass.text != '04':
                continue
            ipcrType = ipcrSection.text + ipcrClass.text + ipcrSubclass.text

            applicationType = root.find(bibPrefix + "application-reference").attrib.get("appl-type")
            if applicationType == 'design':
                numDesigns += 1
                continue

            # get basic data
            applicationDate = root.find(bibPrefix + "application-reference/document-id/date").text
            patentPublicationDate = root.find(bibPrefix + "publication-reference/document-id/date").text
            publishedPatentDocNumber = root.find(bibPrefix + "publication-reference/document-id/doc-number").text
            patentApplicationDocNumber = root.find(bibPrefix + "application-reference/document-id/doc-number").text
            inventionTitle = root.find(bibPrefix + "invention-title").text

            # give entries without abstracts empty abstracts
            abstractNode = root.find("abstract")
            abstract = ""
            if abstractNode is not None:
                abstract = etree.tostring(abstractNode, encoding='UTF-8', method="text").decode('UTF-8')

            referencesCited = root.find(bibPrefix + "references-cited")

            examinerCitations = []
            applicantCitations = []
            if referencesCited is not None:
                # remove non-patent citations (iterating an element yields its children,
                # which replaces the deprecated getchildren() call)
                referencesCited = [citation for citation in referencesCited if citation.find("patcit") is not None]
                for citation in referencesCited:
                    citationDocNum = citation.find("patcit/document-id/doc-number").text
                    citationType = etree.tostring(citation.find("category"), method="text")
                    # Anything not marked as cited by the examiner is filed under applicant citations.
                    if citationType == b'cited by examiner':
                        examinerCitations.append(citationDocNum)
                    else:
                        applicantCitations.append(citationDocNum)

            # add the information to the accumulator lists
            patentApplicationDocNumbers.append(patentApplicationDocNumber)
            applicationDates.append(applicationDate)
            applicationTypes.append(applicationType)
            publishedPatentDocNumbers.append(publishedPatentDocNumber)
            patentPublicationDates.append(patentPublicationDate)
            ipcrTypes.append(ipcrType)
            inventionTitles.append(inventionTitle)
            abstracts.append(abstract)
            examinerCitationsList.append(examinerCitations)
            applicantCitationsList.append(applicantCitations)
            originalFileNames.append(fname)
            originalLineNumbers.append(lineNumber)

# One row per retained patent. The publication date is included here because it
# is collected in the loop above alongside the other columns.
df = pd.DataFrame.from_dict({'patentApplicationDocNumber': patentApplicationDocNumbers,
                             'applicationDate': applicationDates,
                             'applicationType': applicationTypes,
                             'publishedPatentDocNumber': publishedPatentDocNumbers,
                             'patentPublicationDate': patentPublicationDates,
                             'ipcrType': ipcrTypes,
                             'inventionTitle': inventionTitles,
                             'abstract': abstracts,
                             'examinerCitations': examinerCitationsList,
                             'applicantCitations': applicantCitationsList,
                             'originalFileName': originalFileNames,
                             'originalLineNumber': originalLineNumbers})

on file 1 of 556, have 0 patents
on file 2 of 556, have 291 patents
on file 3 of 556, have 490 patents
on file 4 of 556, have 728 patents
on file 5 of 556, have 1049 patents
on file 6 of 556, have 1344 patents
on file 7 of 556, have 1670 patents
on file 8 of 556, have 2002 patents
on file 9 of 556, have 2357 patents
on file 10 of 556, have 2688 patents
on file 11 of 556, have 2921 patents
on file 12 of 556, have 3276 patents
on file 13 of 556, have 3620 patents
on file 14 of 556, have 3974 patents
on file 15 of 556, have 4335 patents
on file 16 of 556, have 4734 patents
on file 17 of 556, have 5100 patents
on file 18 of 556, have 5466 patents
on file 19 of 556, have 5833 patents
on file 20 of 556, have 6188 patents
on file 21 of 556, have 6548 patents
on file 22 of 556, have 6896 patents
on file 23 of 556, have 7243 patents
on file 24 of 556, have 7576 patents
on file 25 of 556, have 7868 patents
on file 26 of 556, have 8253 patents
on file 27 of 556, have 8599 patents
on file 28 of 55

KeyboardInterrupt: 

In [11]:
# Write the assembled patent table to a JSON file in the current working directory.
subsetJsonPath = "uspto_grant_data_subset.json"
df.to_json(subsetJsonPath)

In [9]:
# Assemble the DataFrame from the accumulator lists filled by the extraction loop.
# Each list contributes one column; index i of every list refers to the same patent.
# The publication date list is included so the collected dates are not dropped.
df = pd.DataFrame.from_dict({'patentApplicationDocNumber': patentApplicationDocNumbers,
                             'applicationDate': applicationDates,
                             'applicationType': applicationTypes,
                             'publishedPatentDocNumber': publishedPatentDocNumbers,
                             'patentPublicationDate': patentPublicationDates,
                             'ipcrType': ipcrTypes,
                             'inventionTitle': inventionTitles,
                             'abstract': abstracts,
                             'examinerCitations': examinerCitationsList,
                             'applicantCitations': applicantCitationsList,
                             'originalFileName': originalFileNames,
                             'originalLineNumber': originalLineNumbers})

In [68]:
# Scratch inspection of the single parsed record currently held in `root`.

# Distinct tag names found in the record's first child element, sorted.
document = root[0]
fields = sorted({item.tag for item in document.iter()})

# Basic bibliographic values of the record.
date = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
publishedPatentDocNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
applicationPatentDocNumber = root.find("us-bibliographic-data-grant/application-reference/document-id/doc-number").text
kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text

# A record may have no references-cited element; treat that as an empty citation list
# instead of failing on None. Iterating the element yields its children, which
# replaces the deprecated getchildren() call.
referencesNode = root.find("us-bibliographic-data-grant/references-cited")
referencesChildren = referencesNode if referencesNode is not None else []
# remove non-patent citations
referencesCited = [citation for citation in referencesChildren if citation.find("patcit") is not None]

# Split cited document numbers by who cited them; anything not marked as
# cited by the examiner goes to the applicant list.
examinerCitations = []
applicantCitations = []
for citation in referencesCited:
    citationDocNum = citation.find("patcit/document-id/doc-number").text
    citationType = etree.tostring(citation.find("category"), method="text")
    if citationType == b'cited by examiner':
        examinerCitations.append(citationDocNum)
    else:
        applicantCitations.append(citationDocNum)

print(examinerCitations)
#[date, docNumber, kind, applicationType, inventionTitle, numberOfClaims]

['2005/0246536', '2006/0020830', '2006/0059349', '2006/0095775']


In [238]:
# Display the raw XML of the patcit element of the retained citation at index 42.
sampleCitation = referencesCited[42]
etree.tostring(sampleCitation.find("patcit"))

b'<patcit num="00043"><document-id><country>EP</country><doc-number>1 041 767</doc-number><date>20001000</date></document-id></patcit>'

In [8]:
# Dump the whole record held in `root` as indented XML text.
prettyXmlBytes = etree.tostring(root, pretty_print=True)
print(prettyXmlBytes.decode('UTF-8'))

<us-patent-grant lang="EN" dtd-version="v40 2004-12-02" file="USH0002113-20050104.XML" status="SAMPLE-DATA" id="us-patent-grant" country="US" date-produced="20041221" date-publ="20050104">
  <us-bibliographic-data-grant>
    <publication-reference>
      <document-id>
        <country>US</country>
        <doc-number>H0002113</doc-number>
        <kind>H1</kind>
        <date>20050104</date>
      </document-id>
    </publication-reference>
    <us-sir-flag sir-text="A statutory invention registration is not a patent. It has the defensive attributes of a patent but does not have the enforceable attributes of a patent. No article or advertisement or the like may use the term patent, or any term suggestive of a patent, when referring to a statutory invention registration. For more specific information on the rights associated with a statutory invention registration see 35 U.S.C.157."/>
    <application-reference appl-type="sir">
      <document-id>
        <country>US</country>
        <

In [18]:
# Look up the section, class and subclass elements of the record's first IPCR
# classification, then concatenate their text into a single code string.
ipcrBasePath = "us-bibliographic-data-grant/classifications-ipcr/classification-ipcr/"
ipcrSection, ipcrClass, ipcrSubclass = (
    root.find(ipcrBasePath + partName) for partName in ("section", "class", "subclass")
)
patentType = "".join([ipcrSection.text, ipcrClass.text, ipcrSubclass.text])

In [22]:
# Display the concatenated section + class + subclass text of the IPCR lookup.
"".join([ipcrSection.text, ipcrClass.text, ipcrSubclass.text])

'H04L'

In [52]:
# Scratch arithmetic: remainder of 100 divided by 4.
100 % 4

0