# Data extraction from PATSTAT
We create a custom engine, built on SQLAlchemy, to query the PATSTAT database hosted on PostgreSQL.

## Introduction 

In [1]:
# Imports
import os
import sys

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy import text

In [2]:
# Model modules
sys.path.append("../models")
import data_variables as var # Names of the PATSTAT data variables
import read_sql_tmpfile as rSQLtemp # Snippet to speed up large SQL queries by loading them in a temporary file

## Configuration

In [3]:
class Config:
    """Configuration of the data-retrieval engine.

    Holds the database URL, the SQL query templates and the names of the
    helper tables that the retrieval model uploads to the database.
    """

    # Database URL handed to SQLAlchemy's create_engine. It can be overridden
    # with the PATSTAT_DB_URL environment variable so that credentials do not
    # have to live in the source; the historical value remains the default.
    PASTAT_location = os.environ.get(
        'PATSTAT_DB_URL',
        'postgresql://postgres:postgres2020@127.0.0.1:5432/patstat2018a')

    # Template with three positional str.format slots, in order:
    # CPC symbol prefix, first filing year, last filing year.
    # NOTE(review): the doubled '%%' presumably survives a later %-style
    # parameter layer as a single LIKE wildcard -- confirm in read_sql_tmpfile.
    sql_query_PATENT_PRIMARY_INFO = """
            SELECT tls201_appln.appln_id, tls201_appln.DOCDB_FAMILY_ID,tls201_appln.EARLIEST_FILING_DATE, tls201_appln.EARLIEST_FILING_YEAR, tls201_appln.NB_CITING_DOCDB_FAM, cpc_class_symbol
            FROM tls201_appln JOIN tls224_appln_cpc ON tls201_appln.appln_id = tls224_appln_cpc.appln_id
            WHERE cpc_class_symbol like '{}%%'
            AND appln_filing_year between {} and {}
            ORDER BY tls201_appln.appln_id
            """

    # Main information about the patents listed in the uploaded id table.
    # The query also joins TLS211_PAT_PUBLN to bring in the publication data,
    # which may slow it down.
    # TODO: the publication join might be better placed in a separate query.
    sql_query_PATENT_MAIN_INFO = """
            SELECT * 
            FROM temporary_table_patent_ids
            LEFT JOIN tls201_appln ON temporary_table_patent_ids.appln_id = tls201_appln.appln_id
            LEFT JOIN TLS202_APPLN_TITLE ON temporary_table_patent_ids.appln_id = TLS202_APPLN_TITLE.appln_id
            LEFT JOIN TLS203_APPLN_ABSTR ON temporary_table_patent_ids.appln_id = TLS203_APPLN_ABSTR.appln_id
            LEFT JOIN TLS209_APPLN_IPC ON temporary_table_patent_ids.appln_id = TLS209_APPLN_IPC.appln_id
            LEFT JOIN TLS229_APPLN_NACE2 ON temporary_table_patent_ids.appln_id = TLS229_APPLN_NACE2.appln_id
            LEFT JOIN TLS211_PAT_PUBLN ON temporary_table_patent_ids.appln_id = TLS211_PAT_PUBLN.appln_id
            """

    # CPC classification rows of the patents listed in the uploaded id table.
    sql_query_CPC_INFO = """
            SELECT * 
            FROM temporary_table_patent_ids
            LEFT JOIN TLS224_APPLN_CPC ON temporary_table_patent_ids.appln_id = TLS224_APPLN_CPC.appln_id 
            """

    # Persons linked to the patents listed in the uploaded id table.
    sql_query_PATENTEES_INFO = """
            SELECT * 
            FROM temporary_table_patent_ids
            LEFT JOIN TLS207_PERS_APPLN ON temporary_table_patent_ids.appln_id = TLS207_PERS_APPLN.appln_id
            LEFT JOIN TLS206_PERSON ON TLS207_PERS_APPLN.PERSON_ID = TLS206_PERSON.PERSON_ID
            LEFT JOIN TLS226_PERSON_ORIG ON TLS206_PERSON.PERSON_ID = TLS226_PERSON_ORIG.PERSON_ID
            --LEFT JOIN TLS228_DOCDB_FAM_CITN ON tls201_appln.DOCDB_FAMILY_ID = TLS228_DOCDB_FAM_CITN.DOCDB_FAMILY_ID
            """

    # Citation rows whose citing family is one of the uploaded family ids.
    sql_query_DOCDB_backwards_citations = """
            SELECT * 
            FROM docdb_family_ids
            LEFT JOIN TLS228_DOCDB_FAM_CITN ON docdb_family_ids.docdb_family_id = TLS228_DOCDB_FAM_CITN.docdb_family_id
            """

    # Citation rows whose cited family is one of the uploaded family ids.
    sql_query_FORWARD_CITATIONS = """
            SELECT docdb_family_ids.DOCDB_FAMILY_ID, TLS228_DOCDB_FAM_CITN.DOCDB_FAMILY_ID,
            TLS228_DOCDB_FAM_CITN.CITED_DOCDB_FAMILY_ID
            FROM docdb_family_ids JOIN TLS228_DOCDB_FAM_CITN 
            ON docdb_family_ids.DOCDB_FAMILY_ID = TLS228_DOCDB_FAM_CITN.CITED_DOCDB_FAMILY_ID
            """

    # Names of the helper tables; they must match the table names hard-coded
    # in the SQL queries above.
    temp_table_patent_ids = 'temporary_table_patent_ids'
    temp_table_fam_ids = 'docdb_family_ids'

## Data retrieval model

In [4]:
class Query:
    """User-chosen parameters of one PATSTAT extraction.

    Attributes:
        technology_classes: technology class symbols to extract.
        start_year: first year of the requested period.
        end_year: last year of the requested period.
        output_files_prefix: prefix used for the names of the output files.
    """

    # Selection criteria
    technology_classes: list
    start_year: int
    end_year: int

    # Naming of the output
    output_files_prefix: str

    def __init__(self, technology_classes, start_year, end_year, output_files_prefix):
        # Store the selection criteria exactly as given, without validation
        self.technology_classes, self.start_year, self.end_year = (
            technology_classes,
            start_year,
            end_year,
        )
        self.output_files_prefix = output_files_prefix

In [5]:
class DataRetrievalModel(Config):
    """Engine that extracts patent data from the PATSTAT database.

    The methods are meant to be called in sequence: connect, fetch the
    primary information, select the patents of interest, upload the id
    tables, run the detail queries, export the results and finally close
    the connection.
    """

    # Model variables

    # Defined with the initialisation
    query: Query

    # Defined by _select_patents_of_interest
    patent_ids: list
    docdb_ids: list

    # Result datasets, each one filled by its retrieval method
    TABLE_MAIN_PATENT_INFOS: pd.DataFrame
    TABLE_CPC: pd.DataFrame
    TABLE_PATENTEES_INFO: pd.DataFrame
    TABLE_DOCDB_BACKWARD_CITATIONS: pd.DataFrame
    TABLE_DOCDB_FORWARD_CITATIONS: pd.DataFrame

    def __init__(self, query: Query):
        """Store the query; no database access happens here."""
        self.query = query

    def _connect_engine(self):
        """Create the SQLAlchemy engine and open a connection to the database."""
        self.engine = create_engine(Config.PASTAT_location)
        self.connection = self.engine.connect()

    def _create_temporary_table(self, df, table_name):
        """Upload `df` as table `table_name`, replacing any existing table."""
        df.to_sql(table_name, self.engine, if_exists="replace")

    def _clear_temporary_table(self, table):
        """Drop the helper table `table` from the database if it exists.

        `table` is concatenated into the statement, so it must be a trusted
        name (the callers in this class pass the names held by Config).
        """
        # text() is required for textual SQL by recent SQLAlchemy versions,
        # and engine.begin() commits the DDL on exit in every version.
        with self.engine.begin() as transaction:
            transaction.execute(text('drop table if exists ' + table))

    def _inspect_database_content(self):
        """Return the names of the tables in the database."""
        # Engine.table_names() was deprecated and later removed from
        # SQLAlchemy; the inspector API is the supported replacement.
        return inspect(self.engine).get_table_names()

    def _close_connection(self):
        """Drop the helper tables and close the connection to the database."""
        try:
            self._clear_temporary_table(Config.temp_table_patent_ids)
            self._clear_temporary_table(Config.temp_table_fam_ids)
        finally:
            # Release the connection even when dropping a table fails
            self.connection.close()
            self.engine.dispose()

    def _get_primary_info(self):
        """Retrieve the primary information about the patents.

        One query is run per technology class of the query and the results
        are concatenated into TABLE_MAIN_PATENT_INFOS.

        Raises:
            ValueError: if the query lists no technology class.
        """
        sql = Config.sql_query_PATENT_PRIMARY_INFO

        # We insert the chosen parameters in the standard query and retrieve
        # the data via a temporary file for performance, one class at a time
        frames = [
            rSQLtemp.read_sql_tmpfile(
                sql.format(technology_class,
                           self.query.start_year,
                           self.query.end_year),
                self.engine)
            for technology_class in self.query.technology_classes
        ]

        if not frames:
            # pd.concat would fail with an opaque "No objects to concatenate"
            raise ValueError(
                "The query lists no technology class: nothing to retrieve")

        # Update the variable
        self.TABLE_MAIN_PATENT_INFOS = pd.concat(frames)

    def _select_patents_of_interest(self):
        """Update the lists of patent ids and DOCDB family ids.

        After the first retrieval of data, one could prune the list of
        selected patents here in case the query would be too large - for
        instance to eliminate patents with no value (citation count).
        """
        primary = self.TABLE_MAIN_PATENT_INFOS
        self.patent_ids = primary[var.PATSTAT_APPLN_ID].unique().tolist()
        self.docdb_ids = primary[var.PATSTAT_DOCDB_FAMILY_ID].unique().tolist()

    def _upload_id_table(self, ids, column_name, table_name):
        """Upload `ids` as a one-column table named `table_name`.

        Raises:
            ValueError: if `ids` is empty, since there is nothing to join on.
        """
        ids = list(ids)
        if not ids:
            raise ValueError(
                "No id to upload to table '{}': the selection is empty"
                .format(table_name))
        self._create_temporary_table(pd.DataFrame({column_name: ids}),
                                     table_name)

    def _create_temp_table_with_patent_ids(self):
        """Create a table in the SQL database containing the patent ids.

        Allows later SQL queries to join on this table.
        """
        self._upload_id_table(self.patent_ids,
                              var.PATSTAT_APPLN_ID,
                              Config.temp_table_patent_ids)

    def _get_general_info(self):
        """Retrieve general information about the selected patents."""
        self.TABLE_MAIN_PATENT_INFOS = rSQLtemp.read_sql_tmpfile(
            Config.sql_query_PATENT_MAIN_INFO,
            self.engine)

    def _get_CPC_classes(self):
        """Retrieve the CPC technology classes of the selected patents."""
        self.TABLE_CPC = rSQLtemp.read_sql_tmpfile(
            Config.sql_query_CPC_INFO,
            self.engine)

    def _get_patentees_info(self):
        """Retrieve information about the patentees of the selected patents."""
        self.TABLE_PATENTEES_INFO = rSQLtemp.read_sql_tmpfile(
            Config.sql_query_PATENTEES_INFO,
            self.engine)

    def _create_temp_table_with_DOCDB_ids(self):
        """Create a table in the SQL database containing the DOCDB family ids.

        Allows later SQL queries to join on this table.
        """
        self._upload_id_table(self.docdb_ids,
                              var.PATSTAT_DOCDB_FAMILY_ID,
                              Config.temp_table_fam_ids)

    def _retrieve_backward_docdb_citations(self):
        """Retrieve the backward citations of the selected families."""
        self.TABLE_DOCDB_BACKWARD_CITATIONS = rSQLtemp.read_sql_tmpfile(
            Config.sql_query_DOCDB_backwards_citations, self.engine)

    def _retrieve_forward_docdb_citations(self):
        """Retrieve the forward citations of the selected families."""
        self.TABLE_DOCDB_FORWARD_CITATIONS = rSQLtemp.read_sql_tmpfile(
            Config.sql_query_FORWARD_CITATIONS, self.engine)

    def _export_result_datasets(self):
        """Export the result datasets as CSV files in the data/raw folder.

        Each file is named after the query's output prefix followed by the
        name of the dataset.
        """
        pre = '../data/raw/' + self.query.output_files_prefix
        suf = '.csv'

        storage_scheme = {'_table_main_patent_infos': self.TABLE_MAIN_PATENT_INFOS,
                          '_table_cpc': self.TABLE_CPC,
                          '_table_patentees_info': self.TABLE_PATENTEES_INFO,
                          '_table_backward_docdb_citations': self.TABLE_DOCDB_BACKWARD_CITATIONS,
                          '_table_forward_docdb_citations': self.TABLE_DOCDB_FORWARD_CITATIONS}

        for dataset_name, df in storage_scheme.items():
            df.to_csv(pre + dataset_name + suf, index=False)

## Running the model

In [6]:
# Query definition: wind technologies, years 1990 to 2020
query_wind_technologies = Query(
    # Matched as a prefix of the CPC class symbol by the primary query
    technology_classes=['Y02E  10/7'],
    start_year=1990,
    end_year=2020,
    output_files_prefix="wind_tech_1990_2020_with_publications",
)

In [7]:
# Build the retrieval model around the wind-technology query
model = DataRetrievalModel(query=query_wind_technologies)

In [8]:
%%time

# Run the model
print('Connecting_engine')
model._connect_engine()
try:
    print('Get primary info')
    model._get_primary_info()
    print('Selection of patents of interest')
    model._select_patents_of_interest()
    print('Creating a temporary table with patent ids')
    model._create_temp_table_with_patent_ids()
    print('Get general info')
    model._get_general_info()
    print('Get the CPC classes of the patents')
    model._get_CPC_classes()
    print('Get info about the patentees')
    model._get_patentees_info()
    print('Create a temporary table with DOCDB family ids')
    model._create_temp_table_with_DOCDB_ids()
    print('Get backwards cites')
    model._retrieve_backward_docdb_citations()
    print('Get forward cites')
    model._retrieve_forward_docdb_citations()
    print('Export the result datasets in data/raw')
    model._export_result_datasets()
finally:
    # Always drop the helper tables and release the connection, even when
    # one of the steps above fails part-way through the long-running job
    print('Closing the conenction to the PostgreSQL database')
    model._close_connection()

Connecting_engine
Get primary info
Selection of patents of interest
Creating a temporary table with patent ids
Get general info
Get the CPC classes of the patents
Get info about the patentees
Create a temporary table with DOCDB family ids
Get backwards cites
Get forward cites
Export the result datasets in data/raw
Closing the conenction to the PostgreSQL database
CPU times: user 58.1 s, sys: 8.88 s, total: 1min 7s
Wall time: 11min 2s
