In [1]:
import lxml.etree as etree
import pandas as pd
import gzip

In [14]:
# Parse one USPTO grant file (one XML document per line, gzip-compressed) and
# build `df` with one row per non-design patent that has an abstract.
fname = "../uspto/ipgb20080909.xl.gz"
parser = etree.XMLParser(remove_blank_text=True)

count = 0
validLineNumbers = []   # line numbers of the documents that end up as rows of df
numLines = 0            # total number of lines read from the file
numDesigns = 0          # number of documents skipped because appl-type == 'design'
citationTypes = []      # raw <category> text (bytes) of every patent citation seen

# TODO: go over the entire dataset (all files) once to determine the final size
# of the DataFrame, then make it disk-backed and populate it in a second pass.

# Rows are collected as plain dicts and converted to a DataFrame once, after
# the loop. Appending a one-row frame per document is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
records = []
with gzip.open(fname, 'rb') as fin:
    for (lineNumber, line) in enumerate(fin):
        numLines = numLines + 1
        try:
            root = etree.fromstring(line, parser)
        except etree.XMLSyntaxError:
            print("skipping due to parse error")
            # Really skip the line: otherwise the code below would run on the
            # previous line's `root` (or raise NameError on the first line).
            continue

        applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
        if applicationType == 'design':
            numDesigns = numDesigns + 1
            continue

        # get basic data (not stored in df yet, but left in the namespace for inspection)
        date = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
        docNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
        kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
        inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
        numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text

        # skip entries without abstracts
        # (previously such entries were still added, with a duplicated or -1 index)
        abstractNode = root.find("abstract")
        if abstractNode is None:
            continue
        abstract = etree.tostring(abstractNode, encoding='UTF-8', method="text")
        validLineNumbers.append(lineNumber)

        referencesCited = root.find("us-bibliographic-data-grant/references-cited")

        examinerCitations = []
        applicantCitations = []
        if referencesCited is not None:
            # remove non-patent citations (iterating an element yields its children)
            referencesCited = [citation for citation in referencesCited if citation.find("patcit") is not None]
            for citation in referencesCited:
                citationDocNum = citation.find("patcit/document-id/doc-number").text
                # tostring(..., method="text") returns bytes, hence the b'...' comparison
                citationType = etree.tostring(citation.find("category"), method="text")
                citationTypes.append(citationType)
                if citationType == b'cited by examiner':
                    examinerCitations.append(citationDocNum)
                else:
                    applicantCitations.append(citationDocNum)

        # queue the row; its position in `records` equals len(validLineNumbers)-1,
        # so the default RangeIndex matches the index used before
        records.append({'PatentNumber': docNumber,
                        'examinerCitations': examinerCitations,
                        'applicantCitations': applicantCitations,
                        'originalFileName': fname,
                        'originalLineNumber': lineNumber})

# df stays None when no document qualified, as before
df = pd.DataFrame(records) if records else None


In [15]:
# Display the DataFrame `df` as this cell's output.
df

Unnamed: 0,PatentNumber,applicantCitations,examinerCitations,originalFileName,originalLineNumber
0,PP019183,[],[],../uspto/ipgb20080909.xl.gz,399
1,PP019184,[],[],../uspto/ipgb20080909.xl.gz,400
2,PP019185,[],[],../uspto/ipgb20080909.xl.gz,401
3,PP019186,[],[],../uspto/ipgb20080909.xl.gz,402
4,PP019187,[],[],../uspto/ipgb20080909.xl.gz,403
5,PP019188,[],[],../uspto/ipgb20080909.xl.gz,404
6,PP019189,[],[],../uspto/ipgb20080909.xl.gz,405
7,PP019190,[],[],../uspto/ipgb20080909.xl.gz,406
8,PP019191,[],[],../uspto/ipgb20080909.xl.gz,407
9,PP019192,[],[],../uspto/ipgb20080909.xl.gz,408


In [251]:
# Scratch cell: inspect a single parsed document.
# NOTE: `root` is not defined in this cell; it is expected to be left in the
# kernel by the parsing loop (the last line that parsed successfully).
document = root[0]
# Sorted, de-duplicated tag names found in the first child of root.
fields = sorted({item.tag for item in document.iter()})
#etree.tostring(root.find("abstract"), method="text")
#print(fields)
#print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

date = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
docNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text

referencesNode = root.find("us-bibliographic-data-grant/references-cited")
# remove non-patent citations; a document without <references-cited> gives an
# empty list instead of an AttributeError on None
if referencesNode is None:
    referencesCited = []
else:
    referencesCited = [citation for citation in referencesNode if citation.find("patcit") is not None]

examinerCitations = []
applicantCitations = []
for citation in referencesCited:
    citationDocNum = citation.find("patcit/document-id/doc-number").text
    # tostring(..., method="text") returns bytes, hence the b'...' comparison
    citationType = etree.tostring(citation.find("category"), method="text")
    if citationType == b'cited by examiner':
        examinerCitations.append(citationDocNum)
    else:
        applicantCitations.append(citationDocNum)

print(examinerCitations)
#[date, docNumber, kind, applicationType, inventionTitle, numberOfClaims]

['5390297', '5832208', '5987525', '6347375', '6385596', '6388957', '6496802', '6577735', '6594686', '6735699', '6766305', '6868405', '6868494', '6873975', '6915425', '7266202', '7349381', '2001/0041989', '2001/0051996', '2002/0023020', '2002/0129140', '2002/0152262']


In [238]:
# Serialize the <patcit> child of the citation at index 42 of referencesCited
# (ad-hoc inspection; referencesCited must already exist in the kernel).
patcitNode = referencesCited[42].find("patcit")
etree.tostring(patcitNode)

b'<patcit num="00043"><document-id><country>EP</country><doc-number>1 041 767</doc-number><date>20001000</date></document-id></patcit>'

In [245]:
# Pretty-print the <references-cited> subtree of the current `root` as text.
referencesXml = etree.tostring(
    root.find("us-bibliographic-data-grant/references-cited"),
    pretty_print=True,
)
print(referencesXml.decode('UTF-8'))

<references-cited>
  <citation>
    <patcit num="00001">
      <document-id>
        <country>US</country>
        <doc-number>4136395</doc-number>
        <kind>A</kind>
        <name>Kolpek et al.</name>
        <date>19790100</date>
      </document-id>
    </patcit>
    <category>cited by other</category>
  </citation>
  <citation>
    <patcit num="00002">
      <document-id>
        <country>US</country>
        <doc-number>5142662</doc-number>
        <kind>A</kind>
        <name>Gump et al.</name>
        <date>19920800</date>
      </document-id>
    </patcit>
    <category>cited by other</category>
  </citation>
  <citation>
    <patcit num="00003">
      <document-id>
        <country>US</country>
        <doc-number>5390297</doc-number>
        <kind>A</kind>
        <name>Barber et al.</name>
        <date>19950200</date>
      </document-id>
    </patcit>
    <category>cited by examiner</category>
    <classification-national>
      <country>US</country>
      <main-classi