# Main model

In [1]:
# data
import pandas as pd
import math

## Configuration

In [2]:
class Config:
    """Central place for the model parameters and the PATSTAT column names."""

    # --- Model parameters -------------------------------------------------
    # Reference year subtracted from the filing year when computing the
    # yearly citation rate
    LAST_YEAR_TO_RECEIVE_CITAITONS = 2018
    # Share of patents kept per filing year when selecting the top patents
    PERCENTAGE_TOP_PATENTS = 0.01

    # --- Column names of the PATSTAT extracts -----------------------------
    VAR_APPLN_ID = "appln_id"
    VAR_DOCDC_FAMILY_ID = "docdb_family_id"
    VAR_CITED_DOCDB_FAM_ID = "cited_docdb_family_id"
    VAR_APPLN_FILLING_YEAR = "appln_filing_year"
    VAR_NB_CITING_DOCDB_FAM = "nb_citing_docdb_fam"
    VAR_EARLIEST_FILLING_DATE = "earliest_filing_date"
    VAR_EARLIEST_FILING_YEAR = "earliest_filing_year"

    # --- Names of the variables computed by the model ---------------------
    NEW_VAR_CITING_DOCDB_FAM_IDS = "citing_docdb_families_ids"
    NEW_VAR_NB_CITING_DOCDB_FAM_BY_YEAR = "nb_citing_docdb_fam_by_year"

## Load data

### PATSTAT data

In [3]:
# Path prefix and extension shared by all the CSV files previously retrieved
# with the data_extraction_from_PATSTAT.ipynb notebook
output_files_prefix = "wind_tech_1990_2020_with_publications"
pre = f"../data/raw/{output_files_prefix}"
suf = ".csv"

In [4]:
# All the retrieved tables are kept together in one dictionary, keyed by the
# table name (which is also the middle part of the CSV file name).
data = {
    table_name: pd.read_csv(pre + table_name + suf, low_memory=False)
    for table_name in (
        '_table_main_patent_infos',
        '_table_cpc',
        '_table_patentees_info',
        '_table_backward_docdb_citations',
        '_table_forward_docdb_citations',
    )
}

## Data cleaning

In [5]:
class DataCleaning:
    """Data cleaning methods.

    Every method reads and writes ``self.data``, a dictionary mapping the
    table names (``'_table_main_patent_infos'``, ``'_table_cpc'``, ...) to
    pandas DataFrames, and returns ``self``.  The tables are never modified
    in place: each step stores a new DataFrame under the same key.
    """

    def __init__(self):
        pass


    def _correct_JP_data(self):
        """Correction for Japanese patent data, in line with the literature.

        The correction itself is not implemented yet; the method currently
        only refreshes the id lists and re-filters the tables.
        """
        # TODO: implement the correction before refreshing the ids
        return self.__update_patent_fam_ids()


    # TODO: add a `_keep_only_EP_patents` step filtering the main table on a
    # list of European authorities (needs the authority column name and an
    # `EU_authorities` entry in Config, neither of which exists yet).

    def _normalise(self):
        """Normalisation of the data across years and sectors, to cater for
        **patent explosion**.

        The normalisation itself is not implemented yet; the method currently
        only refreshes the id lists and re-filters the tables.
        """
        # TODO: implement the normalisation before refreshing the ids
        return self.__update_patent_fam_ids()


    def _select_one_patent_per_family(self):
        """Keep a single row per DOCDB family in the main table.

        The main table is sorted on the earliest filing date and the first
        row of each family is kept, which selects the patents of interest
        and saves computational power.  The other tables are then filtered
        accordingly.
        """
        df_main = self.data['_table_main_patent_infos']

        # Non in-place operations: the DataFrame handed to the model is left
        # untouched and no SettingWithCopyWarning is raised when the table is
        # itself a filtered slice.
        self.data['_table_main_patent_infos'] = (
            df_main
            .sort_values(by=Config.VAR_EARLIEST_FILLING_DATE)
            .drop_duplicates(subset=[Config.VAR_DOCDC_FAMILY_ID], keep='first')
        )

        # Storing ids and filtering datasets
        return self.__update_patent_fam_ids()


    def _select_breakthrough_patents(self):
        """Keep only the breakthrough patents.

        For each earliest filing year, the patents are ranked on their number
        of citing DOCDB families and the top ``Config.PERCENTAGE_TOP_PATENTS``
        share (rounded up, so at least one patent per year) is kept.
        """
        share = Config.PERCENTAGE_TOP_PATENTS
        year_col = Config.VAR_EARLIEST_FILING_YEAR
        df = self.data['_table_main_patent_infos']

        # Collect the yearly selections and concatenate once at the end
        top_patents_by_year = []
        for year in df[year_col].unique().tolist():
            df_year = df[df[year_col] == year].sort_values(
                by=Config.VAR_NB_CITING_DOCDB_FAM, ascending=False)
            nb_top_patent_given_year = int(math.ceil(share * len(df_year)))  # Needs rounding up
            top_patents_by_year.append(df_year.head(nb_top_patent_given_year))

        if top_patents_by_year:
            filtered_df = pd.concat(top_patents_by_year)
        else:
            # Empty input: keep the columns so that the id refresh below works
            filtered_df = df.iloc[0:0]

        self.data['_table_main_patent_infos'] = filtered_df

        # Storing ids and filtering datasets
        return self.__update_patent_fam_ids()


    def __update_patent_fam_ids(self):
        """Refresh the id lists and filter every table on them.

        (1) ``self.patent_ids`` and ``self.patent_family_ids`` are rebuilt
            from the main table.
        (2) The main, CPC and patentee tables are filtered on the patent ids;
            the backward and forward citation tables are filtered on the
            family ids.  Tables absent from ``self.data`` are skipped.
        """
        # (1) Update the list of ids (patent ids and family ids)
        df_main = self.data['_table_main_patent_infos']
        self.patent_ids = df_main[Config.VAR_APPLN_ID].unique().tolist()
        self.patent_family_ids = df_main[Config.VAR_DOCDC_FAMILY_ID].unique().tolist()

        # (2) Filter the tables according to the new lists of ids
        def _keep_rows(df, var, list_ids):
            """Rows of `df` whose column `var` takes a value of `list_ids`"""
            return df[df[var].isin(list_ids)]

        tables_by_patent_id = ('_table_main_patent_infos', '_table_cpc', '_table_patentees_info')
        tables_by_family_id = ('_table_backward_docdb_citations', '_table_forward_docdb_citations')

        for key in self.data:
            if key in tables_by_patent_id:
                self.data[key] = _keep_rows(self.data[key], Config.VAR_APPLN_ID, self.patent_ids)
            elif key in tables_by_family_id:
                self.data[key] = _keep_rows(self.data[key], Config.VAR_DOCDC_FAMILY_ID,
                                            self.patent_family_ids)

        return self

## New metrics

In [6]:
class NewMetrics:
    """Methods to derive new metrics from the data"""

    def __init__(self):
        pass


    def _get_DOCDB_fam_cites_per_year(self):
        """Add the yearly citation rate of each patent to the main table.

        The rate is the number of citing DOCDB families divided by the number
        of years between the application filing year and
        ``Config.LAST_YEAR_TO_RECEIVE_CITAITONS``.  When that number of years
        is zero or negative (filing in or after the reference year) the rate
        is undefined and stored as NaN rather than as an infinite or negative
        value.

        The column is added to ``self.data['_table_main_patent_infos']`` and
        the table is also exposed as ``self.TABLE_ALL_PATENTS_INFO``.
        Returns ``self``.
        """
        # Unpacking some variables for clarity
        df = self.data['_table_main_patent_infos']
        citations_by_year = Config.NEW_VAR_NB_CITING_DOCDB_FAM_BY_YEAR
        citations_docdb_fam = Config.VAR_NB_CITING_DOCDB_FAM
        year = Config.VAR_APPLN_FILLING_YEAR
        ref_year = Config.LAST_YEAR_TO_RECEIVE_CITAITONS

        # Number of years during which the patent could be cited;
        # `where` replaces the non-positive spans by NaN
        years_exposed = ref_year - df[year]
        years_exposed = years_exposed.where(years_exposed > 0)

        # Compute the metric
        df[citations_by_year] = df[citations_docdb_fam] / years_exposed

        # Updating the table
        self.TABLE_ALL_PATENTS_INFO = df
        return self

## Patent object

In [7]:
# We create a patent object. Since a patent has a long list of attributes,
# we store them in a dictionary. As a shortcut, the main patent key
# appln_id is also stored as an attribute directly accessible with patent.appln_id.


class Patent:
    """A patent application identified by its `appln_id`."""

    def __init__(self, appln_id):
        """Create the patent and register its id as its first attribute"""

        # Dictionary of the patent's attributes, seeded with the main key
        self.patent_attributes = {}
        self.patent_attributes[Config.VAR_APPLN_ID] = appln_id

        # Shortcut to the main patent key
        self.appln_id: int = appln_id

## Reshaping to OOP

In [8]:
# We define a set of methods to reshape the data from the tabular form (as extracted from PATSTAT)
# to an object oriented form, where patents are identified and attributes attributed to them.

class ReshapingToOOP:
    """Methods to assign the data to patent objects"""

    def __init__(self):
        pass


    def _create_patent_objects(self):
        """
        Create a Patent object for each id of `self.patent_ids` and store
        them in the list `self.patent_list`. Returns `self`.
        """
        self.patent_list = [Patent(patent_id) for patent_id in self.patent_ids]
        return self


    def _assign_data_to_patent_obj(self):
        """
        Once the data has been retrieved from PATSTAT and the patent objects
        have been created, we assign the data to the Patent objects.

        (1) Every column of the main table (left-merged with the backward
            citations on the family id), of the CPC table and of the patentee
            table becomes an entry of `patent.patent_attributes`.
        (2) The families citing the patent's family are stored under
            `Config.NEW_VAR_CITING_DOCDB_FAM_IDS`.

        The tables of `self.data` are only read, never modified.
        Returns `self`.
        """

        def _collect_attributes(table):
            """
            Code snippet to dynamically store the columns of a Pandas table
            in a dictionary (column name -> value).
            # Missing values (NaN) are dropped
            # A single remaining value is stored as a scalar; several values
            # (or none at all) are stored as a list
            """
            attributes = {}
            for col in list(table):
                values = [x for x in table[col].unique().tolist() if x == x]  # NaN != NaN
                attributes[col] = values[0] if len(values) == 1 else values
            return attributes

        # Unpacking some variables
        df_main = self.data['_table_main_patent_infos']
        df_cpc = self.data['_table_cpc']
        df_patentee = self.data['_table_patentees_info']
        df_bwd = self.data['_table_backward_docdb_citations']
        df_fwd = self.data['_table_forward_docdb_citations']

        # (1) Assigning the data contained in the main table to the patent
        # We merge backward citation data to the main table (on family id)
        df_main = pd.merge(df_main, df_bwd, how='left', on=Config.VAR_DOCDC_FAMILY_ID)

        for patent in self.patent_list:
            for df in [df_main, df_cpc, df_patentee]:
                patent_table = df[df[Config.VAR_APPLN_ID] == patent.appln_id]
                patent.patent_attributes.update(_collect_attributes(table=patent_table))

        # (2) Assigning forward citations to the patents
        # The columns are accessed by position instead of being renamed: a
        # rename would alter the table shared through `self.data` and break
        # any later filtering on the family id column.
        # NOTE(review): assumes column 0 holds the cited family id and
        # column 1 the citing family id - confirm against the extraction.
        fwd_cited_family = df_fwd.iloc[:, 0]
        fwd_citing_family = df_fwd.iloc[:, 1]
        for patent in self.patent_list:
            family_id = patent.patent_attributes[Config.VAR_DOCDC_FAMILY_ID]
            citing_fam = fwd_citing_family[fwd_cited_family == family_id].unique().tolist()
            patent.patent_attributes.update({Config.NEW_VAR_CITING_DOCDB_FAM_IDS: citing_fam})

        return self

## Get citations
We use the similarity measure to link the patents in the network. We use direct and indirect citation links:
* Direct backward citations (at the patent family level);
* Co-citations (CC);
* Bibliographic coupling (BC);
* Longitudinal coupling (LC).

In [9]:
class GetCitations:
    """Methods to compute direct and indirect (BC, CC, LC) citations between the patents.

    Every method reads `self.patent_list` (objects exposing a
    `patent_attributes` dictionary), stores its result as a list of
    `(patent, patent)` tuples on `self` and returns `self`.
    """

    @staticmethod
    def _cited_families(patent):
        """Families cited by `patent`, always as a list.

        The attribute is stored as a list when the patent cites several
        families (or none), and as a scalar when it cites a single one; a
        missing value (NaN) gives an empty list.
        """
        cited = patent.patent_attributes[Config.VAR_CITED_DOCDB_FAM_ID]
        if isinstance(cited, list):
            return cited
        if cited != cited:  # NaN is the only value different from itself
            return []
        return [cited]


    @staticmethod
    def _patents_by_family(patents):
        """Dictionary family id -> patents of that family (in list order)."""
        index = {}
        for patent in patents:
            family = patent.patent_attributes[Config.VAR_DOCDC_FAMILY_ID]
            if isinstance(family, list):
                # Not a single family id: can never match a cited family
                continue
            index.setdefault(family, []).append(patent)
        return index


    def _get_direct_citations(self):
        """Get direct backward citations (at the family level).

        `self.direct_citations` is a directed list of `(citing, cited)`
        pairs: first the pairs whose citing patent cites a single family,
        then the pairs whose citing patent cites several families.
        """
        # Unpacking some variables for clarity
        fam = Config.VAR_DOCDC_FAMILY_ID
        cited_fam = Config.VAR_CITED_DOCDB_FAM_ID

        single_family_links = []    # (1) the patent cites only one family
        several_family_links = []   # (2) the patent cites several families (stored as list)
        for citing in self.patent_list:
            cited = citing.patent_attributes[cited_fam]
            cites_several = isinstance(cited, list)
            for candidate in self.patent_list:
                family = candidate.patent_attributes[fam]
                if family == cited:
                    single_family_links.append((citing, candidate))
                if cites_several and family in cited:
                    several_family_links.append((citing, candidate))

        # Concatenating the two lists to have the direct citations
        self.direct_citations = single_family_links + several_family_links
        return self


    def _get_BC_citations(self):
        """
        # (1) Two patents are linked when the lists of families citing them
        (`Config.NEW_VAR_CITING_DOCDB_FAM_IDS`) share at least one element
        # (2) The produced list is non directed.

        NOTE(review): bibliographic coupling is usually defined as two works
        *referencing* a common third work, whereas this method compares the
        families *citing* the two patents - confirm which one is intended.
        """
        patents = self.patent_list
        citing_sets = [set(patent.patent_attributes[Config.NEW_VAR_CITING_DOCDB_FAM_IDS])
                       for patent in patents]

        # Looping over all pairs of patents
        BC = [(patents[p1], patents[p2])
              for p1 in range(len(patents))
              for p2 in range(p1 + 1, len(patents))
              if not citing_sets[p1].isdisjoint(citing_sets[p2])]

        # Removing duplicated items, keeping a deterministic order
        self.BC = list(dict.fromkeys(BC))
        return self


    def _get_CC_citations(self):
        """
        # (1) Co-citation is defined as the frequency with which two documents are cited together
        by other documents. If at least one other document cites two documents in common these documents
        are said to be co-cited
        # (2) The produced list is non directed
        # (3) Only the pairs whose two families both have a patent in the model are kept
        """
        # Pairs of families cited together by a patent of the model
        cited_family_pairs = []
        for patent in self.patent_list:
            cited = GetCitations._cited_families(patent)
            cited_family_pairs.extend((cited[p1], cited[p2])
                                      for p1 in range(len(cited))
                                      for p2 in range(p1 + 1, len(cited)))

        # Translating the pairs of families into pairs of patents
        by_family = GetCitations._patents_by_family(self.patent_list)
        CC = []
        for family_1, family_2 in dict.fromkeys(cited_family_pairs):
            patents_1 = by_family.get(family_1)
            patents_2 = by_family.get(family_2)
            if patents_1 and patents_2:
                # First patent of each family, in `patent_list` order
                CC.append((patents_1[0], patents_2[0]))

        self.CC = CC
        return self


    def _get_LC_citations(self):
        """
        # (1) LC (longitudinal coupling). A cites a document that cites B
        # (2) The produced list IS directed
        """
        by_family = GetCitations._patents_by_family(self.patent_list)

        def cited_patents(patent):
            """Patents of the model whose family is cited by `patent`"""
            return [cited
                    for family in GetCitations._cited_families(patent)
                    for cited in by_family.get(family, [])]

        LC = []
        for patent_A in self.patent_list:
            # Patents cited by A, then patents cited by those
            for cited_patent in cited_patents(patent_A):
                for patent_B in cited_patents(cited_patent):
                    LC.append((patent_A, patent_B))

        # Removing duplicated items, keeping a deterministic order
        self.LC = list(dict.fromkeys(LC))
        return self

## Text processing

- stemming
- vectorisation with TF-IDF
- measure cosine similarity

In [10]:
class TextProcessing:
    """
    Methods for text analysis and similarity measures
    For the sake of computational power, we use these methods for patents in a citations pair only

    The methods do not use any instance state and are declared static, so
    that they can be called both on the class and on an instance (including
    instances of classes inheriting from TextProcessing).
    """

    @staticmethod
    def _stemming():
        """Reducing words to their stem word (semantic root). Not implemented yet."""
        pass

    @staticmethod
    def _vectorize():
        """Vectorise the patents in a high dimension space. Not implemented yet."""
        pass

    @staticmethod
    def _similarity(p1, p2):
        """Measure the similarity between a pair of linked patents pair = (p1, p2)

        Placeholder: always returns 1, whatever the patents.
        """
        return 1 # for testing

## Building the nlp-based patent network

In [11]:
class BuildNetwork():
    """Builds a weighted network based on backwards citations and text similarity"""

    @staticmethod
    def _filter_symmetric_duplicates(pairs):
        """Keep the first occurrence of each unordered pair.

        Eg [(1,2), (2,1), (1,2)] -> [(1,2)]

        A new list is returned; `pairs` is left untouched.  The elements of
        the pairs must be hashable.
        """
        seen = set()
        unique_pairs = []
        for pair in pairs:
            unordered = frozenset(pair)  # (1,2) and (2,1) share the same key
            if unordered in seen:
                continue
            seen.add(unordered)
            unique_pairs.append(pair)
        return unique_pairs


    def _create_network(self):
        """Create the weighted and undirected network with igraph

        Reads `self.direct_citations`, `self.CC`, `self.BC` and `self.LC`,
        stores the graph in `self.graph` and returns `self`.
        """

        # defining all possible links between any pair of patents
        links = self.direct_citations + self.CC + self.BC + self.LC

        # definition of the links: one link per pair of patents, weighted
        # with the text similarity of the two patents
        links = BuildNetwork._filter_symmetric_duplicates(links)
        weighted_links = [(p1, p2, TextProcessing._similarity(p1, p2)) for (p1, p2) in links]

        # creation of the graph
        # NOTE(review): `Graph` is not imported in the visible part of the
        # notebook - presumably `from igraph import Graph`; confirm and add
        # it to the import cell.
        self.graph = Graph.TupleList(weighted_links, weights=True)

        return self

## Summary statistics (work in progress)

In [12]:
class SummaryStatistics:
    """Summary statistics for the data section"""
    # Can also help comparing before data cleaning and after!

    def _print_nb_patents(self):
        """Print the number of distinct application ids of the main table"""
        main_table = self.data['_table_main_patent_infos']
        distinct_ids = main_table[Config.VAR_APPLN_ID].unique()
        print('..Nb of patents:', len(distinct_ids))

## Visualisation (work in progress)

In [13]:
class Visualisation:
    """Visualisation methods"""

    def _draw_graph_with_communities(self):
        """Plot the graph with its communities highlighted.

        The communities are detected on `self.graph` with igraph's
        `community_multilevel`.  The graph of the object the method is
        called on is used, not the one of a notebook-level variable.
        """
        comms = self.graph.community_multilevel()
        # NOTE(review): `plot` is not imported in the visible part of the
        # notebook - presumably `from igraph import plot`; confirm and add
        # it to the import cell.
        plot(comms, mark_groups = True)

## Core modelling

In [14]:
class Model(Config, DataCleaning, NewMetrics, ReshapingToOOP, GetCitations,
            TextProcessing, BuildNetwork, SummaryStatistics, Visualisation):
    """Creation of a model which inherits several building blocks.

    The steps are meant to be run in this order: `_input_data`,
    `_compute_new_metrics`, `_data_cleaning`,
    `_fit_to_object_oriented_design`, `_get_citations`,
    `_build_patent_network`, `visualise`.  Each step stores its results as
    attributes of the model and returns None.
    """

    def __init__(self):
        # Attributes of the model; they are set by the steps of the pipeline
        # (the annotations are documentation only, nothing is assigned here)
        self.data: dict                 # datasets
        self.patent_list: list          # patent objects
        self.patent_ids: list           # list of patent ids contained in the model
        self.patent_family_ids: list    # list of DOCDB family ids contained in the model
        self.direct_citations: list     # directed list of simple citations
        self.CC: list                   # undirected list of co-citations
        self.BC: list                   # undirected list of bibliographical coupling
        self.LC: list                   # directed list of longitudinal citations
        self.graph: "igraph.Graph"      # Igraph network


    def _input_data(self, data):
        """Getting the data in the model"""
        self.data = data


    def _compute_new_metrics(self):
        """Adding new variables in the dataset"""
        NewMetrics._get_DOCDB_fam_cites_per_year(self)


    def _data_cleaning(self):
        """Data cleaning using the DataCleaning class methods"""
        DataCleaning._correct_JP_data(self)
        DataCleaning._normalise(self)
        DataCleaning._select_one_patent_per_family(self)
        DataCleaning._select_breakthrough_patents(self)


    def _fit_to_object_oriented_design(self):
        """We reshape the data from a tabular form to an object oriented form"""
        ReshapingToOOP._create_patent_objects(self)
        ReshapingToOOP._assign_data_to_patent_obj(self)


    def _get_citations(self):
        """Identify direct and indirect citations that link the patents"""
        GetCitations._get_direct_citations(self)
        GetCitations._get_CC_citations(self)
        GetCitations._get_BC_citations(self)
        GetCitations._get_LC_citations(self)


    def _compute_text_similarity(self):
        """Computing text similarities between linked patents. Not implemented yet."""
        pass


    def _build_patent_network(self):
        """We build the patent network (weighted undirected graph)"""
        BuildNetwork._create_network(self)


    def visualise(self):
        """Plot the patent network with its communities"""
        Visualisation._draw_graph_with_communities(self)


    def _visualise(self):
        """Same as `visualise`, named like the other steps of the pipeline"""
        self.visualise()

## Results

In [15]:
# Instantiation of the model (no data attached yet)
model = Model()
# Attaching the dictionary of PATSTAT tables loaded above to the model
model._input_data(data)
# New metrics: adds the yearly citation rate to the main table
model._compute_new_metrics()
# Data cleaning: one patent per family, then selection of the top patents
model._data_cleaning()
# Reshape in an OOP manner (one Patent object per patent id) before building the network
model._fit_to_object_oriented_design()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Retrieving all types of citations (direct, CC, BC and LC)
model._get_citations()
# Building the NLP-based patent network
model._build_patent_network()
# Plot a visualisation (the Model method is named `visualise`)
model.visualise()

In [33]:
# Publication numbers of the patents whose application authority is 'DK'
for entry in model.patent_list:
    attributes = entry.patent_attributes
    if attributes['appln_auth'] != 'DK':
        continue
    print(attributes['publn_nr'])
    print('----')

1132614
----
200101745
----
1417409
----
1314885
----
177769
----
2025929
----
1995860
----
2129908
----
2163761
----
2283233
----
2345811
----
178197
----
201270045
----
201200554
----
201470474
----
2799709
----
2801720
----


In [30]:
# Inspect the attribute dictionary of the first patent of the model
model.patent_list[0].patent_attributes

{'appln_id': 29571917,
 'index_x': 13412,
 'appln_id.1': [],
 'appln_auth': 'JP',
 'appln_nr': '8680890',
 'appln_kind': 'U ',
 'appln_filing_date': '1990-08-21',
 'appln_filing_year': 1990,
 'appln_nr_epodoc': 'JP19900086808U',
 'appln_nr_original': '1990086808',
 'ipr_type': 'UM',
 'receiving_office': '  ',
 'internat_appln_id': 0,
 'int_phase': 'N',
 'reg_phase': 'N',
 'nat_phase': 'Y',
 'earliest_filing_date': '1981-09-14',
 'earliest_filing_year': 1981,
 'earliest_filing_id': 18446242,
 'earliest_publn_date': '1991-04-11',
 'earliest_publn_year': 1991,
 'earliest_pat_publn_id': 394335074,
 'granted': 0,
 'docdb_family_id': 8514692,
 'inpadoc_family_id': 779343,
 'docdb_family_size': 5,
 'nb_citing_docdb_fam': 15,
 'nb_applicants': 0,
 'nb_inventors': 0,
 'appln_id.2': [],
 'appln_title_lg': [],
 'appln_title': [],
 'appln_id.3': [],
 'appln_abstract_lg': [],
 'appln_abstract': [],
 'appln_id.4': 29571917.0,
 'ipc_class_symbol': 'B63H  13/00',
 'ipc_class_level': 'A',
 'ipc_version