In [1]:
import lxml.etree as etree
import pandas as pd
from os import listdir
import gzip

In [30]:
basedir = "/Users/gittens/Downloads/uspto-grants/bibliographic_data"
flist = [basedir + "/" + fname for fname in listdir(basedir)]
parser = etree.XMLParser(remove_blank_text=True)

#TODO: should go over entire dataset (all files) once to determine future size of dataframe, then make it
# disk backed and populate it with another pass

skippedCount = 0
patentApplicationDocNumbers = []
applicationDates = []
applicationTypes = []
publishedPatentDocNumbers = []
patentPublicationDates = []
ipcrTypes = []
inventionTitles = []
abstracts = []
examinerCitationsList = []
applicantCitationsList = []
originalFileNames = []
originalLineNumbers = []
                           
for (filenum, fname) in enumerate(flist):
    with gzip.open(fname, 'rb') as fin:
        print("on file {0} of {1}, have {2} patents so far".format(filenum+1, len(flist), len(patentApplicationDocNumbers)))
        for (lineNumber, line) in enumerate(fin):
            try:
                root = etree.fromstring(line, parser)
            except etree.XMLSyntaxError:
                skippedCount = skippedCount + 1
                continue
                
            try:
                ipcrSection = root.find("us-bibliographic-data-grant/classifications-ipcr/classification-ipcr/section")
                ipcrClass = root.find("us-bibliographic-data-grant/classifications-ipcr/classification-ipcr/class")
                ipcrSubclass = root.find("us-bibliographic-data-grant/classifications-ipcr/classification-ipcr/subclass")
                ipcrType = (ipcrSection.text + ipcrClass.text + ipcrSubclass.text)    
            except AttributeError:
                ipcrType = 'Missing'
                
            applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
                
            # get basic data
            applicationDate = root.find("us-bibliographic-data-grant/application-reference/document-id/date").text
            patentPublicationDate = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
            publishedPatentDocNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
            patentApplicationDocNumber = root.find("us-bibliographic-data-grant/application-reference/document-id/doc-number").text
            kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
       
            applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
            inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
            numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text
            
            # give entries without abstracts empty abstracts
            abstract = ""
            if root.find("abstract") is not None:
                abstract = etree.tostring(root.find("abstract"), encoding='UTF-8', method="text").decode('UTF-8')
                #print(abstract[:20])
   
            referencesCited = root.find("us-bibliographic-data-grant/references-cited")
        
            examinerCitations = []
            applicantCitations = []         
            if referencesCited is not None:
                # remove non-patent citations
                referencesCited = referencesCited.getchildren()
                referencesCited = [citation for citation in referencesCited if citation.find("patcit") is not None]
                for citation in referencesCited:
                    citationCountry = citation.find("patcit/document-id/country").text
                    if (citationCountry != "US"):
                        continue
                    citationDocNum = citation.find("patcit/document-id/doc-number").text
                    citationType = etree.tostring(citation.find("category"), method="text")
                    if citationType == b'cited by examiner':
                        examinerCitations.append(citationDocNum)
                    else:
                        applicantCitations.append(citationDocNum)
    
            #add the information to the DataFrame
            patentApplicationDocNumbers.append(patentApplicationDocNumber)
            applicationDates.append(applicationDate)
            applicationTypes.append(applicationType)
            publishedPatentDocNumbers.append(publishedPatentDocNumber)
            patentPublicationDates.append(patentPublicationDate)
            ipcrTypes.append(ipcrType)
            inventionTitles.append(inventionTitle)
            abstracts.append(abstract)
            examinerCitationsList.append(examinerCitations)
            applicantCitationsList.append(applicantCitations)
            originalFileNames.append(fname)
            originalLineNumbers.append(lineNumber)
            
df = pd.DataFrame.from_dict({'patentApplicationDocNumber': patentApplicationDocNumbers,
                             'applicationDate': applicationDates,
                             'applicationType': applicationTypes,
                            'publishedPatentDocNumber': publishedPatentDocNumbers,
                            'ipcrType': ipcrTypes,
                            'inventionTitle' : inventionTitles,
                            'abstract': abstracts,
                            'examinerCitations': examinerCitationsList,
                            'applicantCitations': applicantCitationsList,
                            'originalFileName': originalFileNames,
                            'originalLineNumber': originalLineNumbers})

return df

on file 1 of 767, have 0 patents
on file 2 of 767, have 3015 patents
on file 3 of 767, have 6022 patents
on file 4 of 767, have 8626 patents
on file 5 of 767, have 11253 patents
on file 6 of 767, have 14280 patents
on file 7 of 767, have 17309 patents
on file 8 of 767, have 20335 patents
on file 9 of 767, have 23148 patents
on file 10 of 767, have 25977 patents
on file 11 of 767, have 29000 patents


KeyboardInterrupt: 

In [29]:
df.to_pickle("uspto_grant_data_abouthalf.pickle")

In [68]:
document = root.getchildren()[0]
fields = []
for item in document.iter():
    fields.append(item.tag)
fields = list(set(fields))
fields.sort() 
#etree.tostring(root.find("abstract"), method="text")
#print(fields)
#print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

date = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
publishedPatentDocNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
applicationPatentDocNumber = root.find("us-bibliographic-data-grant/application-reference/document-id/doc-number").text
kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text

referencesCited = root.find("us-bibliographic-data-grant/references-cited").getchildren()
# remove non-patent citations
referencesCited = [citation for citation in referencesCited if citation.find("patcit") is not None]

examinerCitations = []
applicantCitations = []
for citation in referencesCited:
    citationCountry = citation.find("patcit/document-id/country").text
    if (citationCountry != "US"):
        continue
    citationDocNum = citation.find("patcit/document-id/doc-number").text
    citationType = etree.tostring(citation.find("category"), method="text")
    if citationType == b'cited by examiner':
        examinerCitations.append(citationDocNum)
    else:
        applicantCitations.append(citationDocNum)
        
print(examinerCitations)
#[date, docNumber, kind, applicationType, inventionTitle, numberOfClaims]

['2005/0246536', '2006/0020830', '2006/0059349', '2006/0095775']


In [26]:
df = pd.DataFrame.from_dict({'patentApplicationDocNumber': patentApplicationDocNumbers,
                             'applicationDate': applicationDates,
                             'applicationType': applicationTypes,
                            'publishedPatentDocNumber': publishedPatentDocNumbers,
                            'ipcrType': ipcrTypes,
                            'inventionTitle' : inventionTitles,
                            'abstract': abstracts,
                            'examinerCitations': examinerCitationsList,
                            'applicantCitations': applicantCitationsList,
                            'originalFileName': originalFileNames,
                            'originalLineNumber': originalLineNumbers})
d

Unnamed: 0,abstract,applicantCitations,applicationDate,applicationType,examinerCitations,inventionTitle,ipcrType,originalFileName,originalLineNumber,patentApplicationDocNumber,publishedPatentDocNumber
0,A process for the preparation of an ink which ...,"[4463359, 4840674, 4853036, 5021802, 5041161, ...",19990816,sir,"[4471079, 4812492, 4820346, 5336711, 5385957, ...",Ink compositions,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,180,09375031,H0002113
1,A new and distinct cultivar of New Guinea Impa...,[],20030603,plant,[],New Guinea Impatiens plant named ‘Fisnics Swee...,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,181,10452973,PP015460
2,A new mandarin hybrid called ‘TDE2’ is disting...,[],20020620,plant,[],Mandarin hybrid tree named ‘TDE2’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,182,10178000,PP015461
3,A new cultivar of Impatiens plant named ‘Tamar...,[],20031031,plant,[],Impatiens plant named ‘Tamar Purple Bicolor’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,183,10698695,PP015462
4,A new variety of Chrysanthemum plant named ‘Sw...,[PP12991],20030211,plant,[],Chrysanthemum plant named ‘Swing Time Yellow’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,184,10365163,PP015463
5,A new variety of Chrysanthemum plant named ‘Vu...,[],20030211,plant,[],Chrysanthemum plant named ‘Vulcan Time Bronze’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,185,10365162,PP015464
6,A new and distinct Gala-type apple cultivar is...,[],20030811,plant,[],Apple tree named ‘Dalitoga’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,186,10639179,PP015465
7,A new and distinct cultivar of Chrysanthemum p...,[],20040225,plant,[],Chrysanthemum plant named ‘Yellow Yowoodstock’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,187,10787049,PP015466
8,"A new garden rose plant which has abundant, li...",[],20030113,plant,[],Hybrid Tea rose variety ‘POULen003’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,188,10341892,PP015467
9,A new and distinct grapevine variety character...,[],20030710,plant,[],Grapevine named ‘Sugratwentyseven’,Missing,/Users/gittens/Downloads/uspto-grants/ipgb2005...,189,10618503,PP015468


In [8]:
print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

<us-patent-grant lang="EN" dtd-version="v40 2004-12-02" file="USH0002113-20050104.XML" status="SAMPLE-DATA" id="us-patent-grant" country="US" date-produced="20041221" date-publ="20050104">
  <us-bibliographic-data-grant>
    <publication-reference>
      <document-id>
        <country>US</country>
        <doc-number>H0002113</doc-number>
        <kind>H1</kind>
        <date>20050104</date>
      </document-id>
    </publication-reference>
    <us-sir-flag sir-text="A statutory invention registration is not a patent. It has the defensive attributes of a patent but does not have the enforceable attributes of a patent. No article or advertisement or the like may use the term patent, or any term suggestive of a patent, when referring to a statutory invention registration. For more specific information on the rights associated with a statutory invention registration see 35 U.S.C.157."/>
    <application-reference appl-type="sir">
      <document-id>
        <country>US</country>
        <