In [2]:
import pandas as pd

In [71]:
df = pd.read_hdf("uspto_grant_data_icpr_section_H.h5")

In [57]:
# Build the inverse lookup tables for row indices in the dataframe based on patent and application document ids
applicationLUT = {}
patentLUT = {}

patentApplicationDocNumbers = df["patentApplicationDocNumber"].tolist()
publishedPatentDocNumbers = df["publishedPatentDocNumber"].tolist()
for (idx, docnum) in enumerate(patentApplicationDocNumbers):
    applicationLUT[docnum] = idx
for (idx, docnum) in enumerate(publishedPatentDocNumbers):
    patentLUT[docnum] = idx

In [118]:
# Example of how to use the LUTs: note the document ids are strings

citationsOfRow1 = set(df.ix[21876]["applicantCitations"])
citationsOfRow1.update(set(df.ix[21876]["examinerCitations"]))
citationsOfRow2 = set(df.ix[119221]["applicantCitations"])
citationsOfRow2.update(set(df.ix[119221]["examinerCitations"]))
citationsOfRow2
fname = df.ix[119221]["originalFileName"]
linenum = df.ix[119221]["originalLineNumber"]

In [122]:
import gzip
import lxml.etree as etree

parser = etree.XMLParser(remove_blank_text=True)
xmlstr = ""
fname = '/Users/gittens/Downloads/uspto-grants/bibliographic_data/ipgb20121030.xl.gz'

with gzip.open(fname, "r") as fin:
    for (linenum, line) in enumerate(fin):
        if linenum == 3458:
            xmlstr = line
            
root = etree.fromstring(xmlstr, parser)
print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

<us-patent-grant lang="EN" dtd-version="v4.2 2006-08-23" file="US08299899-20121030.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20121015" date-publ="20121030">
  <us-bibliographic-data-grant>
    <publication-reference>
      <document-id>
        <country>US</country>
        <doc-number>08299899</doc-number>
        <kind>B2</kind>
        <date>20121030</date>
      </document-id>
    </publication-reference>
    <application-reference appl-type="utility">
      <document-id>
        <country>US</country>
        <doc-number>12874097</doc-number>
        <date>20100901</date>
      </document-id>
    </application-reference>
    <us-application-series-code>12</us-application-series-code>
    <us-term-of-grant>
      <us-term-extension>78</us-term-extension>
      <disclaimer>
        <text>This patent is subject to a terminal disclaimer.</text>
      </disclaimer>
    </us-term-of-grant>
    <classifications-ipcr>
      <classification-ipcr>
        <ipc

In [120]:
linenum

5760

In [116]:
df.ix[119221]

abstract                      A system is provided for identifying implanted...
applicantCitations                   [2006/0050638, 2006/0247684, 2007/0112398]
applicationDate                                                        20100901
applicationType                                                         utility
examinerCitations             [3469191, 3500458, 3500459, 4688213, 4839642, ...
inventionTitle                AIMD external programmer incorporating a multi...
ipcrType                                                                   H04Q
originalFileName              /Users/gittens/Downloads/uspto-grants/ipgb2012...
originalLineNumber                                                         3458
patentApplicationDocNumber                                             12874097
publishedPatentDocNumber                                               08299899
Name: 119221, dtype: object

In [125]:
referencesCited = root.find("us-bibliographic-data-grant/references-cited").getchildren()
referencesCited = [citation for citation in referencesCited if citation.find("patcit") is not None]

examinerCitations = []
applicantCitations = []
for citation in referencesCited:
    citationCountry = citation.find("patcit/document-id/country").text
    if (citationCountry != "US"):
        continue
    citationDocNum = citation.find("patcit/document-id/doc-number").text
    citationType = etree.tostring(citation.find("category"), method="text")
    if citationType == b'cited by examiner':
        examinerCitations.append(citationDocNum)
    else:
        applicantCitations.append(citationDocNum)
        
examinerCitations

['3469191',
 '3500458',
 '3500459',
 '4688213',
 '4839642',
 '5340361',
 '5423334',
 '7173920',
 '7346120',
 '7383033',
 '7406349',
 '7773691',
 '7826438',
 '2001/0045883',
 '2003/0174048',
 '2004/0030260',
 '2005/0063488',
 '2005/0188277',
 '2005/0195643',
 '2005/0247319',
 '2005/0258242',
 '2006/0076401',
 '2006/0116744',
 '2006/0212096',
 '2007/0167994',
 '2007/0210923',
 '2008/0041929',
 '2008/0048855',
 '2008/0065181',
 '2009/0009336',
 '2010/0007467',
 '2010/0060431',
 '2010/0085160',
 '2010/0106224',
 '2010/0123547',
 '2010/0152816',
 '2010/0161003',
 '2010/0185263',
 '2011/0029043']