# Identification of emerging technologies using NLP-powered patent networks

## Robustness checks

> Author: **Antoine MATHIEU COLLIN**
* Department of Management, Strategy and Innovation (MSI) of the Faculty of Economics and Business (FEB), KU Leuven
* Department of Computer Science of the Faculty of Engineering Science, KU Leuven
* Leuven.AI, KU Leuven Institute for Artificial Intelligence

# 1. Setup

In [1]:
import pandas as pd

## 1.1. Loading the input data

In [2]:
# Location of the PATSTAT data previously retrieved with the
# data_extraction_from_PATSTAT.ipynb notebook. The table files loaded below are
# addressed as <pre> + <table suffix> + <suf>.
output_files_prefix = "wind_tech_1990_2020_with_publications"
suf = '.csv'
pre = f'../data/raw/{output_files_prefix}'

In [3]:
# For convenience, all the retrieved tables are stored in a single dict keyed by
# the table suffix used in the file names.
_patstat_table_names = ['_table_main_patent_infos',
                        '_table_cpc',
                        '_table_patentees_info',
                        '_table_backward_docdb_citations',
                        '_table_forward_docdb_citations']
# low_memory=False makes pandas read each file in one pass so that column dtypes
# are inferred consistently.
data = {name: pd.read_csv(pre + name + suf, low_memory=False)
        for name in _patstat_table_names}
# The full-text file shares the same prefix as the PATSTAT tables; building its
# path from `pre`/`suf` keeps it in sync when `output_files_prefix` changes.
data['_text_data'] = pd.read_csv(pre + '_full_text' + suf, sep=',')

In [4]:
# Preview the first rows of the main patent table (notebook display only).
data['_table_main_patent_infos'].head()

Unnamed: 0,index,appln_id,appln_id.1,appln_auth,appln_nr,appln_kind,appln_filing_date,appln_filing_year,appln_nr_epodoc,appln_nr_original,...,pat_publn_id,publn_auth,publn_nr,publn_nr_original,publn_kind,appln_id.6,publn_date,publn_lg,publn_first_grant,publn_claims
0,0,146,146,EP,7015148,A,2007-08-02,2007,EP20070015148,7015148,...,278556884,EP,1892412,,A1,146,2008-02-27,de,0,7
1,0,146,146,EP,7015148,A,2007-08-02,2007,EP20070015148,7015148,...,278556884,EP,1892412,,A1,146,2008-02-27,de,0,7
2,0,146,146,EP,7015148,A,2007-08-02,2007,EP20070015148,7015148,...,278556884,EP,1892412,,A1,146,2008-02-27,de,0,7
3,0,146,146,EP,7015148,A,2007-08-02,2007,EP20070015148,7015148,...,335943971,EP,1892412,,B1,146,2011-07-27,de,1,6
4,0,146,146,EP,7015148,A,2007-08-02,2007,EP20070015148,7015148,...,335943971,EP,1892412,,B1,146,2011-07-27,de,1,6


## 1.2. Loading the model

In [5]:
# Execute model_API.ipynb in this notebook's namespace. The cells below use Model,
# DataCleaning, Config and Visualisation, none of which is defined in this
# notebook; presumably they are defined there (verify against model_API.ipynb).
%run model_API.ipynb

# 2. Robustness checks

## 2.1. Data cleaning statistics

In [6]:
# Instantiate the model (Model is not defined in this notebook; presumably it
# comes from model_API.ipynb).
model = Model()
# Hand the dict of raw tables loaded above (`data`) to the model.
model._input_data(data)
# Compute the derived metrics. Presumably this includes the per-year family
# citation count read later via Config.NEW_VAR_NB_CITING_DOCDB_FAM_BY_YEAR
# (TODO confirm against model_API.ipynb).
model._compute_new_metrics()

In [7]:
class wrap:
    """
    Namespace of helpers that build the data-cleaning summary table.

    For each data cleaning step, the table reports:
    - Nb of patents
    - Nb of patent families
    - Average count of family citations per year
    - Proportion of patents for which the claim text data is available

    All helpers are static methods, so they can be called either as
    ``wrap.helper(...)`` or on an instance.
    """

    # Columns kept from the main patent table by select_columns().
    _MAIN_PATENT_COLUMNS = [
        'appln_id',
        'appln_auth',
        'appln_nr',
        'appln_kind',
        'appln_filing_date',
        'appln_filing_year',
        'appln_nr_epodoc',
        'appln_nr_original',
        'ipr_type',
        'receiving_office',
        'internat_appln_id',
        'int_phase',
        'reg_phase',
        'nat_phase',
        'earliest_filing_date',
        'earliest_filing_year',
        'earliest_filing_id',
        'earliest_publn_date',
        'earliest_publn_year',
        'earliest_pat_publn_id',
        'granted',
        'docdb_family_id',
        'inpadoc_family_id',
        'docdb_family_size',
        'nb_citing_docdb_fam',
        'nb_applicants',
        'nb_inventors',
        'appln_title_lg',
        'appln_title',
        'appln_abstract_lg',
        'appln_abstract',
        'ipc_class_symbol',
        'ipc_class_level',
        'ipc_version',
        'ipc_value',
        'ipc_position',
        'ipc_gener_auth',
        'nace2_code',
        'weight',
        'pat_publn_id',
        'publn_auth',
        'publn_nr',
        'publn_nr_original',
        'publn_kind',
        'publn_date',
        'publn_lg',
        'publn_first_grant',
        'publn_claims',
    ]

    # Columns kept from the patentees table by select_columns().
    _PATENTEE_COLUMNS = [
        'appln_id',
        'person_id',
        'applt_seq_nr',
        'invt_seq_nr',
        'person_name',
        'person_address',
        'person_ctry_code',
        'doc_std_name_id',
        'doc_std_name',
        'psn_id',
        'psn_name',
        'psn_level',
        'psn_sector',
        'person_orig_id',
        'source',
        'source_version',
        'name_freeform',
        'last_name',
        'first_name',
        'middle_name',
        'address_freeform',
        'address_1',
        'address_2',
        'address_3',
        'address_4',
        'address_5',
        'street',
        'city',
        'zip_code',
        'state',
        'residence_ctry_code',
        'role',
    ]

    @staticmethod
    def select_columns(data):
        """Restrict the main patent and patentee tables to a fixed set of columns.

        Columns such as 'index' are removed because they are not useful and
        perturb the merge.

        Args:
            data: dict holding dataframes under '_table_main_patent_infos' and
                '_table_patentees_info'. Both entries are replaced in this dict.

        Returns:
            The same dict, with the two tables reduced to the selected columns.

        Raises:
            KeyError: if one of the expected columns is missing from a table.
        """
        data['_table_main_patent_infos'] = (
            data['_table_main_patent_infos'][list(wrap._MAIN_PATENT_COLUMNS)])
        data['_table_patentees_info'] = (
            data['_table_patentees_info'][list(wrap._PATENTEE_COLUMNS)])
        return data

    @staticmethod
    def reshape_PATSTAT_data(data):
        """Stack the two publication-number columns of the main table into one.

        'publn_nr' and 'publn_nr_original' are melted into a single 'publn_nr'
        column (one row per original row and per publication-number type); the
        column 'type_publication_nb' records which of the two each value came
        from. All other columns are kept as identifiers.

        Args:
            data: dict holding the main patent table under
                '_table_main_patent_infos'. The table is not modified.

        Returns:
            The melted dataframe.
        """
        main = data['_table_main_patent_infos']
        id_cols = [col for col in main.columns
                   if col not in ('publn_nr', 'publn_nr_original')]

        # pandas >= 2.0 raises a ValueError when `value_name` matches an existing
        # column, so melt into a temporary column name and rename afterwards.
        value_col = '_publn_nr_value'
        while value_col in main.columns:
            value_col += '_'

        melted = pd.melt(main,
                         id_vars=id_cols,
                         var_name='type_publication_nb',
                         value_name=value_col)
        return melted.rename(columns={value_col: 'publn_nr'})

    @staticmethod
    def reshape_EP_full_text_data(data):
        """List the publication numbers for which claim text is available.

        Reshapes the EP full text data so that it can be used to assess whether
        a PATSTAT patent is present in the full-text database.

        Args:
            data: dict holding the full-text dataframe under '_text_data', with a
                'text_type' column and a 'publication_number' (or already
                renamed 'publn_nr') column. The dataframe is not modified.

        Returns:
            A dataframe with the columns 'publn_nr' (as str) and 'text_type',
            restricted to rows whose text type is 'CLAIM', without duplicates.
        """
        # Rename on a copy: the input dataframe is shared with the model, and an
        # in-place rename would change its column name for every later user.
        full_text = data['_text_data'].rename(
            columns={'publication_number': 'publn_nr'})
        # keep only the patents which contain claims, and only the 2 useful variables
        has_claims = full_text['text_type'] == 'CLAIM'
        claims = full_text.loc[has_claims, ['publn_nr', 'text_type']].drop_duplicates()
        # store the ids as str
        claims['publn_nr'] = claims['publn_nr'].astype(str)
        return claims

    @staticmethod
    def add_citations(data, table_PATSTAT):
        """Add the family citation count to the reshaped PATSTAT table.

        Args:
            data: dict holding the main patent table under
                '_table_main_patent_infos'.
            table_PATSTAT: dataframe with an 'appln_id' column; its
                'nb_citing_docdb_fam' column is (over)written in place.

        Returns:
            table_PATSTAT, with 'nb_citing_docdb_fam' looked up by 'appln_id'.
        """
        main = data['_table_main_patent_infos']
        citations_by_appln = dict(zip(main['appln_id'], main['nb_citing_docdb_fam']))
        table_PATSTAT['nb_citing_docdb_fam'] = (
            table_PATSTAT['appln_id'].map(citations_by_appln))
        return table_PATSTAT

    @staticmethod
    def merge_data(data):
        """Join the main patent table with the patentees table on 'appln_id'.

        Args:
            data: dict holding '_table_main_patent_infos' and
                '_table_patentees_info'.

        Returns:
            The merged dataframe (pandas' default inner join).
        """
        return pd.merge(left=data['_table_main_patent_infos'],
                        right=data['_table_patentees_info'],
                        on='appln_id')

    @staticmethod
    def merge_final_datasets(table_EP_full_text, table_PATSTAT):
        """Flag each application according to the availability of its claim text.

        Args:
            table_EP_full_text: dataframe with 'publn_nr' and 'text_type' columns.
            table_PATSTAT: dataframe with 'appln_id' and 'publn_nr' columns. It is
                modified in place (new column, sorting, duplicate removal).

        Returns:
            table_PATSTAT with one row per 'appln_id' and a 'claims_availability'
            column holding 'Yes' when a publication number is matched to a
            'CLAIM' entry and 'No' when there is no match.
        """
        # look up the text type of each publication number ('No' when unmatched)
        text_type_by_publn = dict(zip(table_EP_full_text['publn_nr'],
                                      table_EP_full_text['text_type']))
        availability = table_PATSTAT['publn_nr'].map(text_type_by_publn).fillna('No')
        # relabel 'CLAIM' as 'Yes'; any other value is left unchanged
        table_PATSTAT['claims_availability'] = (
            availability.map({'CLAIM': 'Yes'}).fillna(availability))
        # if we have a claim, we drop duplicates for the other (unmatched)
        # publication numbers: 'Yes' sorts before 'No' in descending order, so the
        # first row kept for each application is a matched one whenever it exists
        table_PATSTAT.sort_values(by='claims_availability', ascending=False, inplace=True)
        table_PATSTAT.drop_duplicates(subset='appln_id', inplace=True)
        return table_PATSTAT

    @staticmethod
    def set_plotting_style():
        """Apply the seaborn 'paper' matplotlib style and a 16x4 default figure size."""
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
        # later removed the old names; use whichever name this version provides.
        new_style_name = 'seaborn-v0_8-paper'
        if new_style_name in plt.style.available:
            plt.style.use(new_style_name)
        else:
            plt.style.use('seaborn-paper')
        sns.set(rc={'figure.figsize': (16, 4)})

    @staticmethod
    def plot_claim_data_availability(table_PATSTAT):
        """Compute the percentage of rows for which the claim text is available.

        Despite its name, this function does not draw anything.

        Args:
            table_PATSTAT: dataframe with a 'claims_availability' column.

        Returns:
            The percentage (0-100) of rows flagged 'Yes', or NaN when the table
            is empty (the proportion is then undefined).
        """
        # NOTE(review): the proportion is computed over every row passed in, not
        # over EP applications only; filter upstream if that is what is wanted.
        nb_rows = len(table_PATSTAT)
        if nb_rows == 0:
            return float('nan')
        nb_claims_available = int((table_PATSTAT['claims_availability'] == 'Yes').sum())
        return nb_claims_available / nb_rows * 100

    @staticmethod
    def display_claims_availability(data):
        """Return the percentage of patents whose claim text is available.

        Args:
            data: dict holding '_table_main_patent_infos', '_table_patentees_info'
                and '_text_data'. The first two entries are replaced in this dict
                by their column-restricted versions (see select_columns).

        Returns:
            The percentage computed by plot_claim_data_availability.
        """
        data = wrap.select_columns(data)

        table_PATSTAT = wrap.reshape_PATSTAT_data(data)
        table_PATSTAT = wrap.add_citations(data, table_PATSTAT)
        table_EP_full_text = wrap.reshape_EP_full_text_data(data)
        table_PATSTAT = wrap.merge_final_datasets(table_EP_full_text, table_PATSTAT)

        wrap.set_plotting_style()
        return wrap.plot_claim_data_availability(table_PATSTAT)

    @staticmethod
    def get_model_stats(step, mod):
        """Compute summary statistics about the patents remaining at a cleaning step.

        Args:
            step: label of the cleaning step, copied as is into the result.
            mod: model object exposing a `data` dict of dataframes.

        Returns:
            A tuple (step, number of distinct patents, number of distinct patent
            families, mean family citation count per year rounded to 2 decimals,
            claim text availability formatted as a percentage string).
        """
        # Column names come from Config, which is defined outside this notebook
        # (identifier spellings such as VAR_DOCDC_FAMILY_ID are kept as found).
        main = mod.data['_table_main_patent_infos']

        # number of patents in the dataset
        nb_patents = len(main[Config.VAR_APPLN_ID].unique())
        # number of patent families in the dataset
        nb_families = len(main[Config.VAR_DOCDC_FAMILY_ID].unique())
        # average family citation per year count
        mean_citations = main[Config.NEW_VAR_NB_CITING_DOCDB_FAM_BY_YEAR].mean()
        # availability of the patent claim data; a shallow copy of the dict is
        # passed so that the entries replaced downstream do not affect the model
        availability = wrap.display_claims_availability(mod.data.copy())

        return (step, nb_patents, nb_families,
                round(mean_citations, 2), str(round(availability, 2)) + '%')

    @staticmethod
    def create_table_data_cleaning(model):
        """Build the summary table, with one row per data cleaning step.

        The cleaning steps are applied cumulatively, in order, and the
        statistics are recorded after each of them.

        NOTE(review): the steps are applied to the model passed in; whether
        they modify it in place depends on DataCleaning - confirm before
        reusing the model afterwards.

        Args:
            model: model whose data has been loaded and for which the new
                metrics have been computed.

        Returns:
            A dataframe with the columns 'step', 'nb_patents', 'nb_patent_fam',
            'av_count_fam_cit_per_year' and 'claim_text_availability'.
        """
        cleaning_steps = [
            ('EP patents only', DataCleaning._keep_only_EP_patents),
            ('Select time range', DataCleaning._select_time_range),
            ('Granted patents only', DataCleaning._keep_only_granted_patents),
            ('One patent per family', DataCleaning._select_one_patent_per_family),
            ('Breakthrough patents', DataCleaning._select_breakthrough_patents),
        ]

        rows = [wrap.get_model_stats("Input dataset", model)]
        for label, apply_step in cleaning_steps:
            model = apply_step(model)
            rows.append(wrap.get_model_stats(label, model))

        return pd.DataFrame(rows, columns=["step", "nb_patents", "nb_patent_fam",
                                           "av_count_fam_cit_per_year",
                                           "claim_text_availability"])

In [8]:
from IPython.display import HTML
#HTML(wrap.create_table_data_cleaning(model).to_html(index=False))

> **Add to the table above:**
- Nb of clusters identified
- Modularity
- Content of the clusters

In [9]:
# Apply the data cleaning steps one by one. Only step 1 is currently enabled;
# steps 2 to 5 are commented out and are therefore not applied to the model.
# step 1
model = DataCleaning._keep_only_EP_patents(model)
# step 2
#model = DataCleaning._select_time_range(model)
# step 3
#model = DataCleaning._keep_only_granted_patents(model)
# step 4
#model = DataCleaning._select_one_patent_per_family(model)
# step 5
#model = DataCleaning._select_breakthrough_patents(model)

In [10]:
# data cleaning
#model._data_cleaning()

In [None]:
# Reshape the data in an object-oriented manner before building the network.
model._fit_to_object_oriented_design()

In [None]:
# Retrieve all types of citations.
model._get_citations()

In [None]:
# Search the full-text database for the patents of interest.
model._get_full_text()

In [None]:
# Text preprocessing, construction of the feature space, and computation of all
# pairwise similarities.
model._text_preprocessing()

In [None]:
# Build the NLP-based patent network.
model._build_patent_network()

In [None]:
# Presumably draws the patent network with its communities (Visualisation is
# defined outside this notebook). The trailing ';' keeps IPython from echoing the
# call's return value.
Visualisation._draw_graph_with_communities(model);   

In [None]:
# Presumably displays one word cloud per cluster (Visualisation is defined
# outside this notebook). The trailing ';' keeps IPython from echoing the call's
# return value.
Visualisation._display_cluster_word_clouds(model);