### Notebook 3 - technology graph construction. Solution 1

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
import json
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the two raw inputs in one pass: `pat` is the patent table (the cells
# below read its PATENT, GYEAR and COUNTRY_ISO3 columns) and `cite` is the
# citation table (the cells below read its CITING and CITED columns).
pat, cite = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/technology_raw/apat63_99_iso3.csv',
        'data/technology_raw/cite75_99.csv',
    )
)

In [3]:
# Copy of the patent table indexed by patent id; drop=False keeps PATENT
# available as a regular column as well.
pat_idx = pat.set_index('PATENT', drop=False)

In [4]:
# Build one directed country-to-country citation graph per grant year and save
# it as node-link JSON.
#
# For every grant year found in `pat`, the rows of `cite` whose CITED patent
# was granted in that year are collected, both ends of each citation are mapped
# to their COUNTRY_ISO3 code, and an edge cited_country -> citing_country is
# added whose 'Number Citations' attribute holds the number of such citations.

# patent id -> country code lookup; hoisted because it does not depend on the year
country_by_patent = pat_idx['COUNTRY_ISO3']

for y in pd.unique(pat['GYEAR']):
    print(y)
    # ids of the patents granted in the selected year
    y_patents = pat.loc[pat['GYEAR'] == y, 'PATENT']
    # all citations of patents granted in this year
    y_cited = cite.loc[cite['CITED'].isin(y_patents)].reset_index(drop=True)
    if y_cited.empty:
        # nobody cites this year's patents: no graph file is written
        continue

    # Keep only citations whose citing patent is present in the patent table
    # (filtering step). copy() makes the result an independent frame so the
    # column assignments below do not write into a slice of another frame.
    y_cited = y_cited.loc[y_cited['CITING'].isin(pat_idx.index)].copy()
    # append citing_country and cited_country
    y_cited['citing_country'] = country_by_patent.loc[y_cited['CITING']].values
    y_cited['cited_country'] = country_by_patent.loc[y_cited['CITED']].values

    # determine unique citing countries for this year
    unique_citing = pd.unique(y_cited['citing_country'])
    # initialize a directed graph
    graph = nx.DiGraph(year=int(y))
    # Each (cited, citing) pair is visited exactly once, so edges are added
    # directly with the two country codes as separate values instead of
    # joining them into an 'A-B' string and splitting it again.
    for citing in tqdm(unique_citing):
        cited_countries = y_cited.loc[y_cited['citing_country'] == citing, 'cited_country']
        # sort=False keeps cited countries in first-seen order; rows with a
        # missing country code are left out by groupby's default NaN handling.
        per_cited = cited_countries.groupby(cited_countries, sort=False).size()
        for country_cited, number_citations in per_cited.items():
            graph.add_edge(country_cited, citing, **{'Number Citations': int(number_citations)})

    with open(f'data/preprocessed/technology_graphs_1/technology_graph_{int(y)}.json', 'w') as file:
        json.dump(json_graph.node_link_data(graph), file, indent=4)

    # release the per-year objects before the next (potentially large) year
    del graph, y_patents, y_cited, unique_citing

1963


  0%|          | 0/93 [00:00<?, ?it/s]

1964


  0%|          | 0/94 [00:00<?, ?it/s]

1965


  0%|          | 0/103 [00:00<?, ?it/s]

1966


  0%|          | 0/100 [00:00<?, ?it/s]

1967


  0%|          | 0/102 [00:00<?, ?it/s]

1968


  0%|          | 0/102 [00:00<?, ?it/s]

1969


  0%|          | 0/106 [00:00<?, ?it/s]

1970


  0%|          | 0/101 [00:00<?, ?it/s]

1971


  0%|          | 0/109 [00:00<?, ?it/s]

1972


  0%|          | 0/110 [00:00<?, ?it/s]

1973


  0%|          | 0/112 [00:00<?, ?it/s]

1974


  0%|          | 0/111 [00:00<?, ?it/s]

1975


  0%|          | 0/110 [00:00<?, ?it/s]

1976


  0%|          | 0/110 [00:00<?, ?it/s]

1977


  0%|          | 0/113 [00:00<?, ?it/s]

1978


  0%|          | 0/109 [00:00<?, ?it/s]

1979


  0%|          | 0/101 [00:00<?, ?it/s]

1980


  0%|          | 0/105 [00:00<?, ?it/s]

1981


  0%|          | 0/105 [00:00<?, ?it/s]

1982


  0%|          | 0/102 [00:00<?, ?it/s]

1983


  0%|          | 0/108 [00:00<?, ?it/s]

1984


  0%|          | 0/108 [00:00<?, ?it/s]

1985


  0%|          | 0/101 [00:00<?, ?it/s]

1986


  0%|          | 0/106 [00:00<?, ?it/s]

1987


  0%|          | 0/106 [00:00<?, ?it/s]

1988


  0%|          | 0/103 [00:00<?, ?it/s]

1989


  0%|          | 0/105 [00:00<?, ?it/s]

1990


  0%|          | 0/104 [00:00<?, ?it/s]

1991


  0%|          | 0/109 [00:00<?, ?it/s]

1992


  0%|          | 0/98 [00:00<?, ?it/s]

1993


  0%|          | 0/102 [00:00<?, ?it/s]

1994


  0%|          | 0/99 [00:00<?, ?it/s]

1995


  0%|          | 0/93 [00:00<?, ?it/s]

1996


  0%|          | 0/86 [00:00<?, ?it/s]

1997


  0%|          | 0/81 [00:00<?, ?it/s]

1998


  0%|          | 0/68 [00:00<?, ?it/s]

1999


  0%|          | 0/36 [00:00<?, ?it/s]