# Querying the EP full-text database with a custom API

* @author: Antoine Mathieu Collin
* @email: antoine.mathieu-collin@kuleuven.be
* @article: This code is part of the article: "XXXX"
* @description: Uses the custom API to query the EP full-text database for the publications found in the PATSTAT data previously retrieved with the `query_patstat.ipynb` notebook.
_____

In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# location of the PATSTAT data previously retrieved with the data_extraction_from_PATSTAT.ipynb notebook
# NOTE(review): the header of this notebook calls it `query_patstat.ipynb` -- confirm which name is current.
# `pre` and `suf` are the shared path prefix and file extension: the cells below build
# file names as pre + '_<table name>' + suf.
output_files_prefix = 'test_query'
pre = '../data/raw/' + output_files_prefix
suf = '.csv'

In [3]:
# load the main patent table (file pre + '_table_main_patent_infos' + suf) into a DataFrame;
# low_memory=False makes pandas parse the file in a single pass rather than in chunks,
# so the type inferred for each column is consistent over the whole file.
table_main_patent_infos = pd.read_csv(pre + '_table_main_patent_infos' + suf, low_memory=False)

In [None]:
class DataRetrieverEPFullText:
    """Retrieve EP full-text records for the publications of a PATSTAT table.

    Typical use::

        retriever = DataRetrieverEPFullText(table_main_patent_infos)
        full_text = retriever.get_data()

    The individual steps (``get_publication_numbers``,
    ``clean_publication_numbers``, ``create_buckets`` and ``get_df``) can also
    be called one after the other; each step stores its result on the
    instance (``pub_nbs`` / ``pub_data``) for the next one to use.
    """

    # Class-level defaults kept for backward compatibility; __init__ gives
    # every instance its own containers so state is never shared.
    pub_nbs = []
    pub_data = {}

    # EP full text data characteristics
    path = r'../data/ep_full_text_database/2020_edition/EP'
    regex = "*.txt"
    sep = '\t'
    new_col_names = [
        'publication_authority',
        # will always have the value "EP"
        'publication_number',
        # a seven-digit number
        'publication_kind',
        # see https://www.epo.org/searching-for-patents/helpful-resources/first-time-here/definitions.html for help.
        'publication_date',
        # in format YYYY-MM-DD
        'language_text_component',
        # de, en, fr; xx means unknown
        'text_type',
        # TITLE, ABSTR, DESCR, CLAIM, AMEND, ACSTM, SREPT, PDFEP
        'text'
        # it contains, where appropriate, XML tags for better structure. You will find the DTD applicable to all parts of the publication at: http://docs.epoline.org/ebd/doc/ep-patent-document-v1-5.dtd
    ]

    # publication numbers must be strictly below this value to be looked up
    limit_EP_publication_numbers = 3600000

    def __init__(self, data):
        """ The data to be inserted is the table called '_table_main_patent_infos' """
        self.data = data
        self.pub_nbs = []
        self.pub_data = {}

    def get_data(self):
        """
        Run the whole retrieval pipeline.

        1) Extract the publication numbers from the '_table_main_patent_infos'
        2) Drop the numbers that cannot appear in the EP full-text data
        3) Group the remaining numbers in buckets (one bucket per input file)
        4) Search in the EP full-text data all associated publications and
           return them in a dataframe
        """
        self.get_publication_numbers()
        self.clean_publication_numbers()
        self.create_buckets()
        return self.get_df()

    def get_publication_numbers(self):
        """
        Collect the publication numbers of the PATSTAT table into `self.pub_nbs`.

        Publication numbers can be found in the variable 'publn_nr' but also in
        the variable 'publn_nr_original' ('old' publication numbers). Both
        columns are read, appended together ('publn_nr' values first), missing
        values are dropped and duplicates are removed (first occurrence kept).

        Raises:
            KeyError: if one of the two columns is missing from the table.
        """
        first_col = 'publn_nr'
        second_col = 'publn_nr_original'

        both_columns = pd.concat([self.data[first_col], self.data[second_col]])
        self.pub_nbs = both_columns.dropna().unique().tolist()

    def clean_publication_numbers(self):
        """
        Remove from `self.pub_nbs` the publication numbers which cannot be
        found in the EP full text database.

        A publication number is kept only if:
        - it is made of exactly seven digits (no letters, dashes or other
          characters);
        - its value is strictly below `limit_EP_publication_numbers`.

        The numbers kept are stored as strings, so that leading zeros and the
        two-character bucket prefix are well defined.
        """
        limit = self.limit_EP_publication_numbers
        as_text = (str(e) for e in self.pub_nbs)
        self.pub_nbs = [
            s for s in as_text
            if len(s) == 7 and s.isascii() and s.isdigit() and int(s) < limit
        ]

    def create_buckets(self):
        """
        We create buckets in order to retrieve the data associated to the
        publication numbers in chunks.

        `self.pub_data` becomes a dictionary mapping the first two characters
        of the publication numbers (sorted in ascending order) to the list of
        numbers sharing that prefix.
        """
        grouped = {}
        for number in self.pub_nbs:
            grouped.setdefault(number[:2], []).append(number)
        self.pub_data = {prefix: grouped[prefix] for prefix in sorted(grouped)}

    def get_df(self):
        """
        Retrieve the Pandas dataframe associated to the bucketed publication
        numbers (`self.pub_data`, see `create_buckets`).

        Each bucket is looked up in its own input file and the per-bucket
        results are concatenated. An empty dataframe with the expected columns
        is returned when there is no bucket.

        Raises:
            FileNotFoundError: if no input file matches a bucket.
        """
        if not self.pub_data:
            return pd.DataFrame(columns=self.new_col_names)

        total = len(self.pub_data)
        frames = []
        for i, (bucket, pub_ids) in enumerate(self.pub_data.items(), start=1):
            print('Bucket {} out of {}'.format(i, total))
            frames.append(self._read_bucket(bucket, pub_ids))
        return pd.concat(frames)

    def _read_bucket(self, bucket, pub_ids):
        """Load the input file of one bucket and keep only the rows of `pub_ids`."""
        pattern = self.path + bucket + self.regex

        # looking for all corresponding files in the path folder
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError('No EP full-text file matches {}'.format(pattern))
        file = matches[0]

        # print info
        print('Retrieving data from ', file)

        # opening the right file
        # NOTE(review): the first line of the file is consumed as a header and
        # then overwritten by `new_col_names`; if the files have no header
        # line, `header=None` is needed to keep the first record -- confirm.
        data_sample = pd.read_csv(file, sep=self.sep)

        # renaming the columns
        data_sample.columns = self.new_col_names

        # read_csv parses digit-only columns as integers, which strips the
        # leading zeros; compare on zero-padded strings so that '0300001'
        # matches whatever type was inferred for the column.
        file_ids = data_sample['publication_number'].astype(str).str.zfill(7)

        # filtering the dataset with the list of publication numbers
        return data_sample[file_ids.isin(set(pub_ids))]

In [4]:
def get_publication_numbers_from_PATSTAT(table_main_patent_infos):
    """
    Retrieve from a given PATSTAT dataset the list of associated publication numbers.

    Publication numbers can be found in the variable 'publn_nr' but also in the
    variable 'publn_nr_original' ('old' publication numbers). The content of
    the two columns is appended together ('publn_nr' values first), missing
    values are dropped and duplicates are removed (first occurrence kept).

    Args:
        table_main_patent_infos: DataFrame holding the 'publn_nr' and
            'publn_nr_original' columns.

    Returns:
        The list of distinct publication numbers found in either column.

    Raises:
        KeyError: if one of the two columns is missing from the table.
    """

    first_col = 'publn_nr'
    second_col = 'publn_nr_original'

    # stack the 'new' and the 'old' publication numbers in a single series
    both_columns = pd.concat(
        [table_main_patent_infos[first_col], table_main_patent_infos[second_col]]
    )

    # missing values are not publication numbers; unique() keeps the order
    # of first appearance
    return both_columns.dropna().unique().tolist()

In [5]:
def clean_publication_numbers(list_pubs_numbers):
    """
    Removes publication numbers which cannot be found in the EP full text database

    A publication number is kept only if:
    - it is made of exactly seven digits (no letters, dashes, dots or signs);
    - its value is strictly below 3600000.

    Args:
        list_pubs_numbers: iterable of candidate publication numbers; each
            element is converted with str() before being checked, so missing
            values (NaN) and numeric entries are handled without raising.

    Returns:
        The list of publication numbers kept, as strings, in their original
        order.
    """

    limit_EP_publication_numbers = 3600000

    as_text = (str(e) for e in list_pubs_numbers)

    # isascii() + isdigit() guarantees that int() cannot fail on the value
    return [
        s for s in as_text
        if len(s) == 7
        and s.isascii()
        and s.isdigit()
        and int(s) < limit_EP_publication_numbers
    ]

In [6]:
def create_buckets(list_pubs_numbers):
    """
    Split the publication numbers into buckets so that the associated data
    can be retrieved in chunks.

    Returns a dictionary whose keys are the first two characters of the
    publication numbers, in ascending order, and whose values are the lists
    of publication numbers starting with these two characters (in their
    original order).
    """
    # single pass: file every number under its two-character prefix
    grouped = {}
    for number in list_pubs_numbers:
        grouped.setdefault(number[:2], []).append(number)

    # rebuild the dictionary with the prefixes in sorted order
    ordered_prefixes = np.sort(list(grouped))
    return {prefix: grouped[prefix] for prefix in ordered_prefixes}

In [7]:
def get_df(list_publication_numbers):
    """
    Retrieve the Pandas dataframe associated to the publication numbers list.

    The publication numbers are expected to share their first two characters
    (one bucket): these two characters select the input file to open. If the
    list mixes several prefixes, only the file of the smallest prefix is
    searched.

    Args:
        list_publication_numbers: publication numbers to look for; each one is
            converted to a zero-padded, seven-character string before use.

    Returns:
        The rows of the input file whose publication number is in the list,
        with the columns renamed. An empty dataframe with the same columns is
        returned when the list is empty.

    Raises:
        FileNotFoundError: if no input file matches the prefix.
    """

    # EP full text data characteristics
    path = r'../data/ep_full_text_database/2020_edition/EP'
    regex = "*.txt"
    sep = '\t'
    new_col_names = ['publication_authority', # will always have the value "EP"
                    'publication_number', # a seven-digit number
                    'publication_kind', # see https://www.epo.org/searching-for-patents/helpful-resources/first-time-here/definitions.html for help.
                    'publication_date', # in format YYYY-MM-DD
                    'language_text_component', # de, en, fr; xx means unknown
                    'text_type', # TITLE, ABSTR, DESCR, CLAIM, AMEND, ACSTM, SREPT, PDFEP
                    'text' # it contains, where appropriate, XML tags for better structure. You will find the DTD applicable to all parts of the publication at: http://docs.epoline.org/ebd/doc/ep-patent-document-v1-5.dtd
                   ]

    # normalise the requested numbers to seven-character strings
    wanted = {str(e).zfill(7) for e in list_publication_numbers}
    if not wanted:
        return pd.DataFrame(columns=new_col_names)

    # the list is expected to hold a single prefix; sorting makes the choice
    # deterministic if it does not
    df_to_open_nb = sorted({e[:2] for e in wanted})[0]

    # looking for all corresponding files in the path folder
    pattern = path + df_to_open_nb + regex
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError('No EP full-text file matches {}'.format(pattern))
    file = matches[0]

    # print info
    print('Retrieving data from ', file)

    # opening the right file
    # NOTE(review): the first line of the file is consumed as a header and
    # then overwritten by `new_col_names`; if the files have no header line,
    # `header=None` is needed to keep the first record -- confirm.
    data_sample = pd.read_csv(file, sep = sep)

    # renaming the columns
    data_sample.columns = new_col_names

    # read_csv parses digit-only columns as integers, which strips the leading
    # zeros; compare on zero-padded strings so that '0300001' matches whatever
    # type was inferred for the column.
    file_ids = data_sample['publication_number'].astype(str).str.zfill(7)

    # filtering the dataset with the list of publication numbers
    condition = file_ids.isin(wanted)

    return data_sample[condition]

In [8]:
def get_data(pub_data):
    """
    We loop over the buckets to retrieve the data.

    Args:
        pub_data: dictionary mapping each bucket to its list of publication
            numbers (as built by `create_buckets`).

    Returns:
        The concatenation of the dataframes returned by `get_df` for every
        bucket, or an empty dataframe when there is no bucket.
    """

    # for each bucket, we look for the data in the corresponding input file
    list_df = []
    total = len(pub_data)
    for i, pub_ids in enumerate(pub_data.values(), start=1):
        print('Bucket {} out of {}'.format(i, total))
        list_df.append(get_df(pub_ids))

    # pd.concat refuses an empty list, so handle "no bucket" explicitly
    if not list_df:
        return pd.DataFrame()

    return pd.concat(list_df)

In [9]:
def store_result(result_df, pre, suf):
    """Write `result_df` as CSV to the path pre + '_full_text' + suf and print that path."""

    # where to save
    destination = ''.join((pre, '_full_text', suf))
    print('Saving results in {}'.format(destination))
    result_df.to_csv(destination)

In [10]:
# step 1: collect the publication numbers from the PATSTAT table loaded above
list_pubs_numbers = get_publication_numbers_from_PATSTAT(table_main_patent_infos)

In [11]:
# step 2: keep only the publication numbers that can appear in the EP full-text data
list_pubs_numbers = clean_publication_numbers(list_pubs_numbers)

In [12]:
# step 3: group the publication numbers by their first two characters (one bucket per prefix)
pub_data = create_buckets(list_pubs_numbers)

In [13]:
%%time
# step 4: look up every bucket in the EP full-text files (timed with the %%time cell magic)
data = get_data(pub_data)

Bucket 1 out of 16
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0300000.txt
Bucket 2 out of 16
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0400000.txt
Bucket 3 out of 16
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0500000.txt
Bucket 4 out of 16
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0600000.txt


KeyboardInterrupt: 