# Developing Functions for EPO Register Retrieval

Now let's build some data processing functions to strip out some of the noise from the returned data.  

These functions can be added to PatentData.  

We will retrieve XML because we can use Beautiful Soup to parse.

In [4]:
# Import OPS details
from epoops import EPOOPS_C_KEY, EPOOPS_C_SECRET

# Import BS4 for parsing
from bs4 import BeautifulSoup

In [5]:
import epo_ops

try:
    middlewares = [
                epo_ops.middlewares.Dogpile(),
                epo_ops.middlewares.Throttler(),
            ]
except:
    middlewares = [
                epo_ops.middlewares.Throttler()
            ]

client = epo_ops.Client(
            key=EPOOPS_C_KEY,
            secret=EPOOPS_C_SECRET,
            accept_type='xml',
            middlewares=middlewares)

## Register Bibliographic Data

From this we can get the applicant and representative on file.

In [6]:
publication_number = "EP2080165"
response = client.register(reference_type='publication', input=epo_ops.models.Epodoc(publication_number), constituents=['biblio'])

In [7]:
response

<Response [200]>

In [10]:
print(response.text)

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<ops:world-patent-data xmlns:ops="http://ops.epo.org" xmlns:reg="http://www.epo.org/register" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:cpc="http://www.epo.org/cpcexport" xmlns:cpcdef="http://www.epo.org/cpcdefinition">
    <ops:register-search total-result-count="1">
        <ops:query syntax="CQL">publication=EP2080165</ops:query>
        <ops:range begin="1" end="1"/>
        <reg:register-documents produced-by="RO">
            <reg:register-document date-produced="20170917" dtd-version="1.3.1" lang="en" produced-by="RO" status="The application has been withdrawn">
                <reg:ep-patent-statuses>
                    <reg:ep-patent-status change-date="" status-code="9">The application has been withdrawn</reg:ep-patent-status>
                </reg:ep-patent-statuses>
                <reg:bibliographic-data country="EP" id="EP07844381P" lang="en" status="The application has been withdrawn">
                    <r

In [9]:
soup = BeautifulSoup(response.text, "xml")

In [13]:
# Need to narrow to applicants section first
applicant_sequence = soup.find("applicants")
print(applicant_sequence)

<reg:applicants change-date="20090619" change-gazette-num="N/P">
<reg:applicant app-type="applicant" designation="all" sequence="1">
<reg:addressbook cdsid="0101034879">
<reg:name>Trading Technologies International, Inc.</reg:name>
<reg:address>
<reg:address-1>222 S. Riverside Plaza, Suite 1100</reg:address-1>
<reg:address-2>Chicago, Illinois 60606</reg:address-2>
<reg:country>US</reg:country>
</reg:address>
</reg:addressbook>
<reg:nationality>
<reg:country/>
</reg:nationality>
<reg:residence>
<reg:country/>
</reg:residence>
</reg:applicant>
</reg:applicants>


In [15]:
applicant_list = applicant_sequence.find_all('applicant')
print(applicant_list)

[<reg:applicant app-type="applicant" designation="all" sequence="1">
<reg:addressbook cdsid="0101034879">
<reg:name>Trading Technologies International, Inc.</reg:name>
<reg:address>
<reg:address-1>222 S. Riverside Plaza, Suite 1100</reg:address-1>
<reg:address-2>Chicago, Illinois 60606</reg:address-2>
<reg:country>US</reg:country>
</reg:address>
</reg:addressbook>
<reg:nationality>
<reg:country/>
</reg:nationality>
<reg:residence>
<reg:country/>
</reg:residence>
</reg:applicant>]


In [18]:
# See here - https://stackoverflow.com/questions/26872311/using-beautifulsoup-to-parse-xml-to-a-dictionary
applicant_dict = {child.name: child.text for child in applicant_list[0].findChildren()}
print(applicant_dict)

{'residence': '\n\n', 'country': '', 'addressbook': '\nTrading Technologies International, Inc.\n\n222 S. Riverside Plaza, Suite 1100\nChicago, Illinois 60606\nUS\n\n', 'nationality': '\n\n', 'address': '\n222 S. Riverside Plaza, Suite 1100\nChicago, Illinois 60606\nUS\n', 'address-1': '222 S. Riverside Plaza, Suite 1100', 'address-2': 'Chicago, Illinois 60606', 'name': 'Trading Technologies International, Inc.'}


In [19]:
print(applicant_dict['name'], "\n", applicant_dict['address'])

Trading Technologies International, Inc. 
 
222 S. Riverside Plaza, Suite 1100
Chicago, Illinois 60606
US



In [23]:
# Combining into a function
def get_applicant(client, number, numbertype='publication'):
    """ Return applicant details for an EP application."""
    #if numbertype == 'application':
        # number = self.get_publication_no(number, "EP")
    response = client.register(
            reference_type='publication',
            input=epo_ops.models.Epodoc(number),
            constituents=['biblio']
            )
    soup = BeautifulSoup(response.text, "xml")
    # Need to narrow to applicants section first
    applicant_sequence = soup.find("applicants")
    applicant_list = applicant_sequence.find_all('applicant')
    # This just returns the details for the first applicant
    # Check this is the most recent where changes have occurred
    applicant_dict = {
            child.name: child.text
            for child in applicant_list[0].findChildren()
            }
    return applicant_dict

In [26]:
ap = get_applicant(client, "EP123457")
print(ap['name'], "\n", ap['address'])

E. I. du Pont de Nemours and Company 
 
1007 Market Street
Wilmington, DE 19898
US



In [27]:
# Adapt to get agent details - we can use a common parent function for this
def get_agent(client, number, numbertype='publication'):
    """ Return applicant details for an EP application."""
    #if numbertype == 'application':
        # number = self.get_publication_no(number, "EP")
    response = client.register(
            reference_type='publication',
            input=epo_ops.models.Epodoc(number),
            constituents=['biblio']
            )
    soup = BeautifulSoup(response.text, "xml")
    # Need to narrow to applicants section first
    applicant_sequence = soup.find("agents")
    applicant_list = applicant_sequence.find_all('agent')
    # This just returns the details for the first applicant
    # Check this is the most recent where changes have occurred
    applicant_dict = {
            child.name: child.text
            for child in applicant_list[0].findChildren()
            }
    return applicant_dict

In [28]:
ag = get_agent(client, "EP123457")
print(ag['name'], "\n", ag['address'])

Barnard, Eric Edward, et al 
 
Brookes Batchellor 102-108 Clerkenwell Road
London EC1M 5SA
GB



In [31]:
# General function to get applicant,inventor or agent details as list
def get_party_details(client, number, party, numbertype='publication'):
    """ Return applicant details for an EP application."""
    #if numbertype == 'application':
        # number = self.get_publication_no(number, "EP")
    response = client.register(
            reference_type='publication',
            input=epo_ops.models.Epodoc(number),
            constituents=['biblio']
            )
    soup = BeautifulSoup(response.text, "xml")
    # Need to narrow to applicants section first
    return [
        {
            child.name: child.text
            for child in entry.findChildren()
        }
        for entry in soup.find("{0}s".format(party)).find_all(party)
    ]

In [32]:
parties = get_party_details(client, "EP1234567", "agent")

In [33]:
parties

[{'address': '\nPartnerschaft von\nPatent- und Rechtsanwälten mbB\nBeselerstrasse 4\n22607 Hamburg\nDE\n',
  'address-1': 'Partnerschaft von',
  'address-2': 'Patent- und Rechtsanwälten mbB',
  'address-3': 'Beselerstrasse 4',
  'address-4': '22607 Hamburg',
  'addressbook': '\nUexküll & Stolberg\n\nPartnerschaft von\nPatent- und Rechtsanwälten mbB\nBeselerstrasse 4\n22607 Hamburg\nDE\n\n',
  'country': 'DE',
  'name': 'Uexküll & Stolberg'}]

In [34]:
get_party_details(client, "EP1234567", "applicant")

[{'address': '\nBendererstrasse 2\n9494 Schaan\nLI\n',
  'address-1': 'Bendererstrasse 2',
  'address-2': '9494 Schaan',
  'addressbook': '\nIvoclar Vivadent AG\n\nBendererstrasse 2\n9494 Schaan\nLI\n\n',
  'country': '',
  'name': 'Ivoclar Vivadent AG',
  'nationality': '\n\n',
  'residence': '\n\n'}]

** Observations **
* We could have an object per number to restrict multiple calls to the remote API - e.g. reg = EPRegister(number), reg.applicants > applicant list.
* On init we fetch each of the three data sources.
* We can use to quickly find the agents for a particular applicant - perform a search to get a number list per applicant and then send off register queries for each number.

## Procedural Steps

In [35]:
publication_number = "EP2080165"
response2 = client.register(reference_type='publication', input=epo_ops.models.Epodoc(publication_number), constituents=['procedural-steps'])

In [44]:
print(response2.text)

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<ops:world-patent-data xmlns:ops="http://ops.epo.org" xmlns:reg="http://www.epo.org/register" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:cpc="http://www.epo.org/cpcexport" xmlns:cpcdef="http://www.epo.org/cpcdefinition">
    <ops:register-search total-result-count="1">
        <ops:query syntax="CQL">publication=EP2080165</ops:query>
        <ops:range begin="1" end="1"/>
        <reg:register-documents produced-by="RO">
            <reg:register-document date-produced="20170917" dtd-version="1.3.1" lang="en" produced-by="RO" status="The application has been withdrawn">
                <reg:ep-patent-statuses>
                    <reg:ep-patent-status change-date="" status-code="9">The application has been withdrawn</reg:ep-patent-status>
                </reg:ep-patent-statuses>
                <reg:procedural-data>
                    <reg:procedural-step id="RENEWAL_53183972" procedure-step-phase="undefined">
            

In [37]:
soup2 = BeautifulSoup(response2.text, 'xml')

In [38]:
soup2.find_all("procedural-step-text")

[<reg:procedural-step-text step-text-type="STEP_DESCRIPTION">Renewal fee payment</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="YEAR">03</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="STEP_DESCRIPTION">Renewal fee payment</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="YEAR">04</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="STEP_DESCRIPTION">Renewal fee payment</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="YEAR">05</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="STEP_DESCRIPTION">Renewal fee payment</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="YEAR">06</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="STEP_DESCRIPTION">Renewal fee payment</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="YEAR">07</reg:procedural-step-text>,
 <reg:procedural-step-text step-text-type="STEP_DESCRIP

** Observations **

This is like the front page register data. We want to look at (in particular), the "Examination Procedure" style information.

There is a ```procedure-step-phase="examination"``` as an attribute for the tag - ```<procedural-step>```.

In [None]:
p_steps = [
        {
            child.name: child.text
            for child in step.findChildren()
        }
        for step in soup2.find_all(attrs={"procedure-step-phase":"examination"})
    ]

In [47]:
for s in p_steps:
    print(s, '\n')

{'date': '20160818', 'procedural-step-code': 'EXRE', 'time-limit': '04', 'procedural-step-text': 'Communication from the examining division', 'procedural-step-date': '\n20160818\n'} 

{'procedural-step-code': 'PROL', 'procedural-step-text': 'en'} 

{'date': '20120620', 'procedural-step-code': 'ABEX', 'procedural-step-text': '(claims and/or description)', 'procedural-step-date': '\n20120620\n'} 

{'date': '20160818', 'procedural-step-code': 'DDIV', 'procedural-step-text': '07844381', 'procedural-step-date': '\n20160818\n'} 

{'procedural-step-code': 'IGRA', 'procedural-step-text': 'Intention to grant the patent'} 



Looking at the data I think I actually want "events" for examination events as set out below.

## Events

The following data would be useful:
* Last office action (and is outstanding?).
* History of office action and amendment.
* Number of substantive office actions issued.
* Number of susbtantive responses filed.

In [48]:
publication_number = "EP2080165"
response3 = client.register(reference_type='publication', input=epo_ops.models.Epodoc(publication_number), constituents=['events'])

In [51]:
print(response3.text)

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<ops:world-patent-data xmlns:ops="http://ops.epo.org" xmlns:reg="http://www.epo.org/register" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:cpc="http://www.epo.org/cpcexport" xmlns:cpcdef="http://www.epo.org/cpcdefinition">
    <ops:register-search total-result-count="1">
        <ops:query syntax="CQL">publication=EP2080165</ops:query>
        <ops:range begin="1" end="1"/>
        <reg:register-documents produced-by="RO">
            <reg:register-document date-produced="20170917" dtd-version="1.3.1" lang="en" produced-by="RO" status="The application has been withdrawn">
                <reg:ep-patent-statuses>
                    <reg:ep-patent-status change-date="" status-code="9">The application has been withdrawn</reg:ep-patent-status>
                </reg:ep-patent-statuses>
                <reg:events-data>
                    <reg:dossier-event id="EVT_275507527" event-type="new">
                        <reg:event-da

```<ep-patent-statuses>``` gives the current status - e.g. withdrawn or pending. For current actions we want to dobule check the case is still pending.  

Then we have a set of ```<events-data>``` tags in date order containing the activities.

We can maybe ignore events with "PCT" (PCT events) and "RFE" (renewal fee events) in the event-code.  

Can we use a graph modelling library to draw a flow diagram?

In [53]:
soup3 = BeautifulSoup(response3.text, 'xml')
events = soup3.find_all("events-data")
print(events)

[<reg:events-data>
<reg:dossier-event event-type="new" id="EVT_275507527">
<reg:event-date>
<reg:date>20161007</reg:date>
</reg:event-date>
<reg:event-code>0009182</reg:event-code>
<reg:event-text event-text-type="DESCRIPTION">Withdrawal of application</reg:event-text>
<reg:gazette-reference>
<reg:gazette-num>2016/45</reg:gazette-num>
<reg:date>20161109</reg:date>
</reg:gazette-reference>
</reg:dossier-event>
</reg:events-data>, <reg:events-data>
<reg:dossier-event event-type="new" id="EVT_275507528">
<reg:event-date>
<reg:date>20160826</reg:date>
</reg:event-date>
<reg:event-code>EPIDOSNEXR2</reg:event-code>
<reg:event-text event-text-type="DESCRIPTION">New entry: Despatch of examination report + time limit</reg:event-text>
</reg:dossier-event>
</reg:events-data>, <reg:events-data>
<reg:dossier-event event-type="new" id="EVT_275507529">
<reg:event-date>
<reg:date>20160819</reg:date>
</reg:event-date>
<reg:event-code>0009185</reg:event-code>
<reg:event-text event-text-type="DESCRIPTION

In [54]:
events_dict = [
        {
            child.name: child.text
            for child in event.findChildren()
        }
    for event in soup3.find_all("events-data")
]

In [55]:
events_dict

[{'date': '20161109',
  'dossier-event': '\n\n20161007\n\n0009182\nWithdrawal of application\n\n2016/45\n20161109\n\n',
  'event-code': '0009182',
  'event-date': '\n20161007\n',
  'event-text': 'Withdrawal of application',
  'gazette-num': '2016/45',
  'gazette-reference': '\n2016/45\n20161109\n'},
 {'date': '20160826',
  'dossier-event': '\n\n20160826\n\nEPIDOSNEXR2\nNew entry: Despatch of examination report + time limit\n',
  'event-code': 'EPIDOSNEXR2',
  'event-date': '\n20160826\n',
  'event-text': 'New entry: Despatch of examination report + time limit'},
 {'date': '20160921',
  'dossier-event': '\n\n20160819\n\n0009185\nFirst examination report\n\n2016/38\n20160921\n\n',
  'event-code': '0009185',
  'event-date': '\n20160819\n',
  'event-text': 'First examination report',
  'gazette-num': '2016/38',
  'gazette-reference': '\n2016/38\n20160921\n'},
 {'date': '20151103',
  'dossier-event': '\n\n20151103\n\nEPIDOSNRFE2\nNew entry: Renewal fee paid\n',
  'event-code': 'EPIDOSNRFE2'

In [58]:
def get_events(client, number):
    """ Get prosecution events for an EP publication number."""
    response = client.register(reference_type='publication', input=epo_ops.models.Epodoc(number), constituents=['events'])
    soup = BeautifulSoup(response.text, 'xml')
    return [
        {
            child.name: child.text
            for child in event.findChildren()
        }
        for event in soup.find_all("events-data")
    ]

In [59]:
outstanding_pub = "EP3100185"
e = get_events(client, outstanding_pub)
e

[{'date': '20170719',
  'dossier-event': '\n\n20170622\n\n0009016\nSupplementary search report\n\n2017/29\n20170719\n\n',
  'event-code': '0009016',
  'event-date': '\n20170622\n',
  'event-text': 'Supplementary search report',
  'gazette-num': '2017/29',
  'gazette-reference': '\n2017/29\n20170719\n'},
 {'date': '20170719',
  'dossier-event': '\n\n20170616\n\n0008199IPCL\nChange - classification\n\n2017/29\n20170719\n\n',
  'event-code': '0008199IPCL',
  'event-date': '\n20170616\n',
  'event-text': 'Change - classification',
  'gazette-num': '2017/29',
  'gazette-reference': '\n2017/29\n20170719\n'},
 {'date': '20170719',
  'dossier-event': '\n\n20170616\n\n0008199OBSC\nChange - classification\n\n2017/29\n20170719\n\n',
  'event-code': '0008199OBSC',
  'event-date': '\n20170616\n',
  'event-text': 'Change - classification',
  'gazette-num': '2017/29',
  'gazette-reference': '\n2017/29\n20170719\n'},
 {'date': '20170503',
  'dossier-event': '\n\n20170331\n\n0009199EXPT\nChange - exten

Observations - this does not feature the R.70(2) communication.

We also have an RSS feed of the application documents with the URL: ```https://register.epo.org/rssDocuments?application=EP14880554&proc=EP-PCT&lng=en```

Also we have the link to the ZIP archive - https://register.epo.org/download?number=EP14880554&output=zip .

Number in both cases is the application number minus the check digit - 14880554.2.

## Downloading Prosecution History File

The events data above is useful but for prosecution we are more interested in the "documents" section of the Register.

The RSS feed contains links to each document - but we would need to activate the "Load All Pages" JS link.

Or we could download the ZIP file.

In [60]:
import requests

url = "https://register.epo.org/download?number=EP14880554&output=zip"

response = requests.get(url)

with open('EP14880554.zip', 'wb') as f:
    f.write(response.content)

In [64]:
# Build Function
def get_prosecution(application_number):
    """ Download and save EP prosecution history for given application_number."""
    
    # Strip checkdigit
    if "." in application_number:
        application_number = application_number.split(".")[0]
       
    url = "https://register.epo.org/download?number=EP{0}&output=zip".format(application_number)

    response = requests.get(url)

    with open('EP{0}.zip'.format(application_number), 'wb') as f:
        f.write(response.content)
    print("Prosecution saved")

In [66]:
application_number = "14802601.6"
get_prosecution(application_number)

Prosecution saved


In [67]:
application_number = "14762986.9"
get_prosecution(application_number)

Prosecution saved


In [69]:
application_number = "07799914.2"
get_prosecution(application_number)

Prosecution saved


Need to search from bottom (i.e. newest) for first filename match according to legal basis.  

Extract date from filename.  

Can also locate office action for OCR!

Filename mappings:
* R161 - 14802601-2017-06-07-1226AA-Communication concerning correction of deficiencies in written opinion_amendment of application_payment of claims fee.pdf
* R70(2) EPC - 14880554-2017-07-07-1224-Invitation to declare maintenance of the application and to correct deficiencies in the Written Opinion_amend application.pdf
* A94(3) EPC - 14762986-2017-08-24-2001-Communication from the Examining Division.pdf
* Oral Proc - 07799914-2017-05-08-2008-1-Summons to attend oral proceedings.pdf

Ah can just use code mappings - after the date.

We can use regex to parse the filenames and extract the components (looks for x in filename won't work as 2008 is likely in filenames dated in 2008). See https://regex101.com/r/A4uQ8N/1



In [70]:
import zipfile

filename = "EP14762986_2A943.zip"
with zipfile.ZipFile(filename, 'r') as myzip:
    files = myzip.namelist()
print(files)

['14762986-2014-09-18-A1PAMPHLET-International publication of the A1 Pamphlet.pdf', '14762986-2014-09-18-ISR-Copy of the international search report.pdf', '14762986-2014-10-08-PRIODOC-X-Priority document (electronically transmitted).pdf', '14762986-2015-07-22-1201-1-Information on entry into European phase.pdf', '14762986-2015-08-20-1200P-Request for entry into the European phase.pdf', '14762986-2015-08-20-RECEIPT-OLF-(Electronic) Receipt.pdf', '14762986-2015-10-01-IPRP-Copy of the international preliminary report on patentability.pdf', '14762986-2015-10-21-1226CC-Communication regarding possible amendment of the application_payment of claims fee.pdf', '14762986-2015-10-19-1204-Communication to designated inventor.pdf', '14762986-2015-12-03-1038-Letter accompanying subsequently filed items.pdf', '14762986-2015-12-03-ABEX-Amendments received before examination.pdf', '14762986-2015-12-03-CLMSABEX-Amended claims filed after receipt of (European) search report.pdf', '14762986-2015-12-03-CL

In [76]:
import re
from datetime import datetime

test_str = files[0]
regex = r"(?P<app_no>\d{8})-(?P<date>\d{4}-\d{2}-\d{2})-(?P<number>[A-Z\-\d]+)-(?P<name>.+)\.pdf"

match = re.match(regex, test_str)
parsed_file = {
    'filename': test_str,
    'date': datetime.strptime(match.group(2), "%Y-%m-%d"),
    'code': match.group(3),
    'title': match.group(4)
}

print(parsed_file)

{'date': datetime.datetime(2014, 9, 18, 0, 0), 'code': 'A1PAMPHLET', 'filename': '14762986-2014-09-18-A1PAMPHLET-International publication of the A1 Pamphlet.pdf', 'title': 'International publication of the A1 Pamphlet'}


In [88]:
# Bring it together in a parsing function

def parse_filename(filename):
    """ Extract data from EPO Document Filename."""
    regex = r"(?P<app_no>\d{8})-(?P<date>\d{4}-\d{2}-\d{2})-(?P<number>[A-Z\-\d]+)-(?P<name>.+)\.pdf"
    match = re.match(regex, filename)
    if match:
        return {
            'filename': filename,
            'date': datetime.strptime(match.group(2), "%Y-%m-%d"),
            'code': match.group(3),
            'title': match.group(4)
        }
    else:
        return {'filename': filename}

In [90]:
mapping_dict = {
    '1226AA':'Communication pursuant to Rules 161(1) and 162 EPC',
    '1224':'Communication pursuant to Rules 70(2) and 70a(2) EPC',
    '2001':'Communication pursuant to Article 94(3) EPC',
    '2008':'Summons to attend Oral Proceedings pursuant to Rule 115(1) EPC'
}

for file in reversed(files):
    # print(file)
    parsed_data = parse_filename(file)
    # print(parsed_data)
    if parsed_data.get('code') in mapping_dict.keys():
        print(file)
        break

14762986-2017-08-24-2001-Communication from the Examining Division.pdf


In [81]:
'2008' in mapping_dict.keys()

True

In [91]:
parsed_data

{'code': '2001',
 'date': datetime.datetime(2017, 8, 24, 0, 0),
 'filename': '14762986-2017-08-24-2001-Communication from the Examining Division.pdf',
 'title': 'Communication from the Examining Division'}

In [None]:
letter_text = """
    The Applicant writes in response to the {communication} dated {date}.
"""