# Data retrieval from the EP full-text database

## Data retrieval steps:
* Retrieve all patent publication numbers of interest
* Retrieve the data from the EP-full-text database
* Store it in a single CSV data file

**Note:** the full retrieval may take about an hour at the current processing speed.

In [1]:
import pandas as pd
import glob
import numpy as np

## Load PATSTAT data

In [2]:
# Where the CSV files written by the data_extraction_from_PATSTAT.ipynb notebook live.
# File names used in this notebook are built as: pre + '<table name>' + suf.
suf = '.csv'
output_files_prefix = "wind_tech_1990_2020_with_publications"
pre = '../data/raw/{}'.format(output_files_prefix)

In [3]:
# Load the main PATSTAT table produced by the extraction notebook.
# 'publn_nr' is forced to text: with dtype inference, a column that happens to hold
# only digits would be parsed as integers and lose its leading zeros, which the
# seven-character check and the two-character bucketing further down rely on.
table_main_patent_infos = pd.read_csv(
    pre + '_table_main_patent_infos' + suf,
    low_memory=False,
    dtype={'publn_nr': str},
)

## Filter and bucket publication numbers

In [4]:
# Exclusive upper bound: publication numbers at or above this value are discarded
# by the filter below. NOTE(review): presumably the first number not covered by the
# 2020 edition of the EP full-text files - confirm.
limit_EP_publication_numbers = 3_600_000

In [5]:
# Distinct publication numbers, as text (str() also covers a numerically-typed
# column and turns missing values into 'nan', which the filter below rejects).
list_pubs_numbers = [str(e) for e in table_main_patent_infos['publn_nr'].unique().tolist()]

# A publication number can appear in the EP full-text data only if:
#   - it has exactly seven characters,
#   - all of them are ASCII digits (no letters, punctuation or whitespace),
#   - its value is strictly below limit_EP_publication_numbers.
# The digit test runs before int() so that malformed values are skipped instead of
# raising ValueError.
list_pubs_numbers = [e for e in list_pubs_numbers
                     if len(e) == 7
                     and e.isascii() and e.isdigit()
                     and int(e) < limit_EP_publication_numbers]

In [6]:
# Group the publication numbers in a dictionary keyed by their first two characters,
# with the keys in sorted order.
prefixes = {number[:2] for number in list_pubs_numbers}
buckets = list(np.sort(list(prefixes)))

pub_data = {
    bucket: [number for number in list_pubs_numbers if number[:2] == bucket]
    for bucket in buckets
}

## Retrieve corresponding full-text data

In [7]:
def get_df(list_publication_numbers, path=r'../data/ep_full_text_database/2020_edition/EP'):
    """Return the EP full-text rows matching the given publication numbers.

    The full-text files are looked up as ``path + <two-character prefix> + '*.txt'``,
    where the prefix is the first two characters of each publication number once it
    is zero-padded to seven characters. One file is read per distinct prefix.

    Parameters
    ----------
    list_publication_numbers : iterable of str or int
        Publication numbers to look for. Strings ('0123456') and integers (123456)
        are both accepted; matching is done on the numeric value, so leading zeros
        do not matter.
    path : str, optional
        Common beginning of the full-text file paths (directory plus the 'EP' file
        name prefix). Defaults to the 2020 edition location.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the columns renamed to the names listed in
        ``new_col_names``. An empty frame with those columns is returned when
        ``list_publication_numbers`` is empty.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern for one of the prefixes.
    """

    # EP full text data characteristics
    regex = "*.txt"
    sep = '\t'
    new_col_names = ['publication_authority', # will always have the value "EP"
                    'publication_number', # a seven-digit number
                    'publication_kind', # see https://www.epo.org/searching-for-patents/helpful-resources/first-time-here/definitions.html for help.
                    'publication_date', # in format YYYY-MM-DD
                    'language_text_component', # de, en, fr; xx means unknown
                    'text_type', # TITLE, ABSTR, DESCR, CLAIM, AMEND, ACSTM, SREPT, PDFEP
                    'text' # it contains, where appropriate, XML tags for better structure. You will find the DTD applicable to all parts of the publication at: http://docs.epoline.org/ebd/doc/ep-patent-document-v1-5.dtd
                   ]

    # zero-padded text form, so that ints and strings select the same file
    wanted = [str(e).strip().zfill(7) for e in list_publication_numbers]
    if not wanted:
        return pd.DataFrame(columns=new_col_names)

    # numeric form used for matching: read_csv infers the dtype of the publication
    # number column, so a text-vs-integer comparison would never match
    wanted_numeric = pd.to_numeric(pd.Series(wanted, dtype=object), errors='coerce').dropna()

    frames = []
    for prefix in sorted(set(e[:2] for e in wanted)):
        # looking for all corresponding files in the path folder
        matches = sorted(glob.glob(path + prefix + regex))
        if not matches:
            raise FileNotFoundError(
                'No EP full-text file matches {}'.format(path + prefix + regex))
        file = matches[0]

        # print info
        print('Retrieving data from ', file)

        # opening the right file
        # NOTE(review): the first line of the file is consumed as a header row before
        # the columns are renamed; if the EP files carry no header line, the first
        # record is lost - confirm, and read with header=None if so.
        data_sample = pd.read_csv(file, sep = sep)

        # renaming the columns
        data_sample.columns = new_col_names

        # filtering the dataset with the list of publication numbers
        numbers = pd.to_numeric(data_sample.publication_number, errors='coerce')
        frames.append(data_sample[numbers.isin(wanted_numeric)])

    return frames[0] if len(frames) == 1 else pd.concat(frames)

In [8]:
%%time

# Fetch the matching full-text rows one bucket at a time (each bucket maps to one
# input file), then stack the per-bucket frames into a single table.
n_buckets = len(pub_data.keys())
list_df = []
for position, (bucket, numbers) in enumerate(pub_data.items(), start=1):
    print('Bucket {} out of {}'.format(position, n_buckets))
    list_df.append(get_df(numbers))

result_df = pd.concat(list_df)

Bucket 1 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0000000.txt
Bucket 2 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0100000.txt
Bucket 5 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0400000.txt
Bucket 6 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0500000.txt
Bucket 7 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0600000.txt
Bucket 8 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0700000.txt
Bucket 9 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0800000.txt
Bucket 10 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP0900000.txt
Bucket 11 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP1000000.txt
Bucket 12 out of 36
Retrieving data from  ../data/ep_full_text_database/2020_edition/EP1100000.txt
Bucket 13 out of 

## Store the result in a csv file

In [9]:
# Destination of the combined full-text table: same location and prefix as the
# PATSTAT inputs, with a '_full_text' marker before the extension.
file = '{}_full_text{}'.format(pre, suf)
print('Saving results in {}'.format(file))
result_df.to_csv(file)

Saving results in ../data/raw/wind_tech_1990_2020_with_publications_full_text.csv
