In [65]:
import lxml.etree as etree
import pandas as pd
import gzip

In [80]:
# Parse one gzipped USPTO grant file (one XML document per line) and collect
# every non-design patent whose first IPCR classification matches the desired
# section/class into the DataFrame `df`.

# --- configuration ---
fname = "../uspto/ipgb20081118.xl.gz"
ipcrSectionDesired = 'H'
ipcrClassDesired = '04'
parser = etree.XMLParser(remove_blank_text=True)

# --- run statistics (kept as notebook-level names) ---
count = 0  # not used in this cell; kept in case other cells read it
validLineNumbers = []   # line numbers of the records that passed the IPCR filter
numLines = 0            # total lines read, including unparseable ones
numDesigns = 0          # design patents skipped
citationTypes = set()   # distinct <category> texts seen (bytes, as returned by tostring)
patentTypes = set()     # distinct section+class+subclass strings seen before filtering

#TODO: should go over entire dataset (all files) once to determine future size of dataframe, then make it
# disk backed and populate it with another pass

biblio = "us-bibliographic-data-grant/"
ipcrPath = biblio + "classifications-ipcr/classification-ipcr/"

# Rows are accumulated as plain dicts and turned into a DataFrame once at the
# end; growing a DataFrame row by row is quadratic and DataFrame.append no
# longer exists in pandas 2.x.
records = []
with gzip.open(fname, 'rb') as fin:
    for (lineNumber, line) in enumerate(fin):
        numLines = numLines + 1
        try:
            root = etree.fromstring(line, parser)
        except etree.XMLSyntaxError:
            print("skipping due to parse error")
            # Without this, the code below would run against the previous
            # line's `root` (duplicating that patent) or fail with NameError
            # when the very first line is malformed.
            continue

        applicationType = root.find(biblio + "application-reference").attrib.get("appl-type")
        if applicationType == 'design':
            numDesigns = numDesigns + 1
            continue

        # get basic data
        applicationDate = root.find(biblio + "application-reference/document-id/date").text
        patentPublicationDate = root.find(biblio + "publication-reference/document-id/date").text
        publishedPatentDocNumber = root.find(biblio + "publication-reference/document-id/doc-number").text
        patentApplicationDocNumber = root.find(biblio + "application-reference/document-id/doc-number").text
        kind = root.find(biblio + "publication-reference/document-id/kind").text

        inventionTitle = root.find(biblio + "invention-title").text
        numberOfClaims = root.find(biblio + "number-of-claims").text

        # find() returns the first match, so only the first IPCR entry is considered
        ipcrSection = root.find(ipcrPath + "section")
        ipcrClass = root.find(ipcrPath + "class")
        ipcrSubclass = root.find(ipcrPath + "subclass")
        if ipcrSection is None or ipcrClass is None or ipcrSubclass is None:
            # No IPCR classification: it cannot match the desired section/class,
            # so skip the record instead of failing on `.text` of None.
            continue
        ipcrType = (ipcrSection.text + ipcrClass.text + ipcrSubclass.text)
        patentTypes.add(ipcrType)

        # filter out patents that aren't of the kind we want
        if ((ipcrSection.text != ipcrSectionDesired) or (ipcrClass.text != ipcrClassDesired)):
            continue
        validLineNumbers.append(lineNumber)

        # give entries without abstracts empty abstracts (an empty string, so
        # the column holds one consistent type)
        abstract = ''
        abstractNode = root.find("abstract")
        if abstractNode is not None:
            abstract = etree.tostring(abstractNode, encoding='UTF-8', method="text").decode('UTF-8')

        referencesCited = root.find(biblio + "references-cited")

        examinerCitations = []
        applicantCitations = []
        if referencesCited is not None:
            # remove non-patent citations; iterating an element yields its
            # children (getchildren() is deprecated in lxml)
            referencesCited = [citation for citation in referencesCited
                               if citation.find("patcit") is not None]
            for citation in referencesCited:
                citationDocNum = citation.find("patcit/document-id/doc-number").text
                # tostring() without an encoding returns bytes, hence the bytes literal below
                citationType = etree.tostring(citation.find("category"), method="text")
                citationTypes.add(citationType)
                if citationType == b'cited by examiner':
                    examinerCitations.append(citationDocNum)
                else:
                    # every category other than 'cited by examiner' is treated as applicant-cited
                    applicantCitations.append(citationDocNum)

        # queue the row for the DataFrame
        records.append({'patentApplicationDocNumber': patentApplicationDocNumber,
                        'applicationDate': applicationDate,
                        'applicationType': applicationType,
                        'publishedPatentDocNumber': publishedPatentDocNumber,
                        'patentPublicationDate': patentPublicationDate,
                        'ipcrType': ipcrType,
                        'inventionTitle': inventionTitle,
                        'abstract': abstract,
                        'examinerCitations': examinerCitations,
                        'applicantCitations': applicantCitations,
                        'originalFileName': fname,
                        'originalLineNumber': lineNumber})

# One row per accepted patent, indexed 0..n-1 in file order. When nothing
# matched, df is an empty frame with the expected columns rather than None.
df = pd.DataFrame(records, columns=['patentApplicationDocNumber',
                                    'applicationDate',
                                    'applicationType',
                                    'publishedPatentDocNumber',
                                    'patentPublicationDate',
                                    'ipcrType',
                                    'inventionTitle',
                                    'abstract',
                                    'examinerCitations',
                                    'applicantCitations',
                                    'originalFileName',
                                    'originalLineNumber'])

In [81]:
# Display the DataFrame of patents that passed the IPCR section/class filter.
df

Unnamed: 0,abstract,applicantCitations,applicationDate,applicationType,examinerCitations,inventionTitle,ipcrType,originalFileName,originalLineNumber,patentApplicationDocNumber,patentPublicationDate,publishedPatentDocNumber
0,A key fob for housing at least one electrical ...,"[5388691, 5850754, 6533111, 2004/0033788, 2005...",20050103,utility,"[6669017, 6948614, 7142413, 2002/0008610]",Key fob with detent mechanism,H04Q,../uspto/ipgb20081118.xl.gz,2374,11028095,20081118,07453346
1,A system for displaying an information package...,[],20050215,utility,"[5796351, 5914671, 6285963, 6513015, 6662215, ...",System for displaying an information package,H04Q,../uspto/ipgb20081118.xl.gz,2375,11058334,20081118,07453347
2,A data call between at least two internetworke...,"[5689553, 5742670, 5764916, 5949763, 6097793, ...",20050722,utility,"[6535909, 7330875]",Sharing of prerecorded motion video over an In...,H04N,../uspto/ipgb20081118.xl.gz,2515,11187067,20081118,07453488
3,An image processing apparatus of the present i...,"[0 830 034, 0 953 962, 1-317091, 9-73049, 9-27...",20040206,utility,"[5937212, 6392690, 6549650, 6950535, 2003/0011...","Image processing apparatus, image pickup syste...",H04N,../uspto/ipgb20081118.xl.gz,2516,10772334,20081118,07453489
4,The subject matter hereof relates to systems a...,"[3654385, 4803550, 4979042, 5022383, 5047847, ...",20030108,utility,"[5589874, 6537211]",Correction of image signals characteristic of ...,H04N,../uspto/ipgb20081118.xl.gz,2517,10339145,20081118,07453490
5,The shooting equipment communicating system in...,[2000-105431],20030331,utility,"[5296884, 5506644, 6023241, 6222583, 6282362, ...",Shooting equipment communicating system,H04N,../uspto/ipgb20081118.xl.gz,2518,10401706,20081118,07453491
6,A camera system comprising:at least one area i...,"[4641980, 4762986, 4868676, 4937676, 5031049, ...",20040319,utility,[],Portable hand held camera,H04N,../uspto/ipgb20081118.xl.gz,2519,10804042,20081118,07453492
7,A video stabilization scheme which uses a dyna...,[2365244],20041012,utility,"[4885757, 5060074, 6809758, 2003/0090593, 2004...",Image stabilization,H04N,../uspto/ipgb20081118.xl.gz,2520,10963161,20081118,07453493
8,An anti-shake apparatus of a photographing app...,"[5974269, 2003/0067544, 10-142647, 2002-229090...",20050308,utility,"[4356437, 5932984, 6781622, 6985176, 7224893]",Anti-shake apparatus having magnetic position ...,H04N,../uspto/ipgb20081118.xl.gz,2521,11073604,20081118,07453494
9,An electronic camera of the present invention ...,"[5808681, 6018363, 6359649, 6683642, 6920180, ...",20070604,utility,[],Electronic camera for displaying a preview ima...,H04N,../uspto/ipgb20081118.xl.gz,2522,11806830,20081118,07453495


In [68]:
# Scratch cell: inspect a single parsed patent. `root` must already be defined
# by an earlier cell (e.g. the last document parsed by the loading loop).

# Sorted list of the distinct tags found under the first child of the document.
document = root[0]
fields = sorted({item.tag for item in document.iter()})
#etree.tostring(root.find("abstract"), method="text")
#print(fields)
#print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

date = root.find("us-bibliographic-data-grant/publication-reference/document-id/date").text
publishedPatentDocNumber = root.find("us-bibliographic-data-grant/publication-reference/document-id/doc-number").text
applicationPatentDocNumber = root.find("us-bibliographic-data-grant/application-reference/document-id/doc-number").text
kind = root.find("us-bibliographic-data-grant/publication-reference/document-id/kind").text
applicationType = root.find("us-bibliographic-data-grant/application-reference").attrib.get("appl-type")
inventionTitle = root.find("us-bibliographic-data-grant/invention-title").text
numberOfClaims = root.find("us-bibliographic-data-grant/number-of-claims").text

# A patent may have no <references-cited> element at all; treat that as an
# empty citation list instead of failing on None.
referencesNode = root.find("us-bibliographic-data-grant/references-cited")
if referencesNode is None:
    referencesCited = []
else:
    # remove non-patent citations (iterating an element yields its children;
    # getchildren() is deprecated in lxml)
    referencesCited = [citation for citation in referencesNode if citation.find("patcit") is not None]

examinerCitations = []
applicantCitations = []
for citation in referencesCited:
    citationDocNum = citation.find("patcit/document-id/doc-number").text
    # tostring() without an encoding returns bytes, hence the bytes literal below
    citationType = etree.tostring(citation.find("category"), method="text")
    if citationType == b'cited by examiner':
        examinerCitations.append(citationDocNum)
    else:
        # anything not marked 'cited by examiner' is counted as applicant-cited
        applicantCitations.append(citationDocNum)

print(examinerCitations)
#[date, docNumber, kind, applicationType, inventionTitle, numberOfClaims]

['2005/0246536', '2006/0020830', '2006/0059349', '2006/0095775']


In [238]:
# Dump the raw <patcit> XML of the citation at index 42 of referencesCited.
# NOTE(review): relies on referencesCited left behind by another cell and on it
# holding at least 43 entries; the hard-coded index will not fit every patent.
etree.tostring(referencesCited[42].find("patcit"))

b'<patcit num="00043"><document-id><country>EP</country><doc-number>1 041 767</doc-number><date>20001000</date></document-id></patcit>'

In [8]:
# Pretty-print the full XML of the document currently held in `root`.
print(etree.tostring(root, pretty_print=True).decode('UTF-8'))

<us-patent-grant lang="EN" dtd-version="v4.2 2006-08-23" file="US07454797-20081118.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20081104" date-publ="20081118">
  <us-bibliographic-data-grant>
    <publication-reference>
      <document-id>
        <country>US</country>
        <doc-number>07454797</doc-number>
        <kind>B2</kind>
        <date>20081118</date>
      </document-id>
    </publication-reference>
    <application-reference appl-type="utility">
      <document-id>
        <country>US</country>
        <doc-number>10963696</doc-number>
        <date>20041013</date>
      </document-id>
    </application-reference>
    <us-application-series-code>10</us-application-series-code>
    <us-term-of-grant>
      <us-term-extension>819</us-term-extension>
    </us-term-of-grant>
    <classifications-ipcr>
      <classification-ipcr>
        <ipc-version-indicator>
          <date>20060101</date>
        </ipc-version-indicator>
        <classification

In [18]:
# Look up the section, class and subclass elements of the first IPCR
# classification of the document in `root`, then join their texts into the
# combined type code.
ipcrSection, ipcrClass, ipcrSubclass = (
    root.find("us-bibliographic-data-grant/classifications-ipcr/classification-ipcr/" + tag)
    for tag in ("section", "class", "subclass")
)
patentType = "".join([ipcrSection.text, ipcrClass.text, ipcrSubclass.text])

In [22]:
# Show the concatenated IPCR section + class + subclass text for the current
# document (the same expression that is stored in patentType).
ipcrSection.text + ipcrClass.text + ipcrSubclass.text

'H04L'