### Notebook 2 - centralities of technology graphs (solution 1)

In [1]:
from networkx.readwrite import json_graph
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
import warnings
import json
import os

# Silence library warnings to keep the cell outputs readable.
# NOTE: this hides *all* warnings, including pandas/networkx deprecation notices.
warnings.filterwarnings('ignore')

# Inclusive range of years; one `technology_graph_<year>.json` file is read per year.
year_start = 1963
year_end = 1999
years = range(year_start, year_end + 1)

# Folder with the yearly graph files. The original machine-specific path stays
# the default, but it can be overridden with the TECHNOLOGY_GRAPH_FOLDER
# environment variable so the notebook can run on another machine.
# os.path.join(..., '') guarantees the trailing separator that the
# `graph_folder + filename` concatenations in later cells rely on.
graph_folder = os.path.join(
    os.environ.get('TECHNOLOGY_GRAPH_FOLDER')
    or '/Users/koshelev/Documents/lmu/thesis/1-graph_construction/data/preprocessed/technology_graphs_1/',
    '',
)
# Folder (relative to the working directory) that receives the CSV results.
output_folder = 'computed_centralities/technology1/'
# Name of the edge attribute used as `weight` by the weighted centralities.
weight = 'Number Citations'
# Seed handed to the betweenness-centrality calls.
seed = 100

In [2]:
# function to read graph in json format
def read_json_file(filename: str) -> nx.Graph:
    """Load a graph stored as node-link JSON.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    nx.Graph
        The graph built by ``json_graph.node_link_graph`` from the file content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.

    Notes
    -----
    NOTE(review): newer networkx releases are changing the default key under
    which node-link data stores its edges ('links' -> 'edges'). If loading
    starts failing with a KeyError after an upgrade, pass the key explicitly
    -- confirm against the installed networkx version.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        js_graph = json.load(f)
    return json_graph.node_link_graph(js_graph)

In [3]:
# collect the name of every node that occurs in any graph file of the dataset
unique_nodes = set()
json_names = [name for name in os.listdir(graph_folder) if name.endswith('.json')]
for filename in json_names:
    g = read_json_file(graph_folder + filename)
    # grow the running set in place with this graph's nodes
    unique_nodes.update(g.nodes())

In [4]:
# pagerank centrality - unweighted and weighted
def _pagerank_by_year(edge_weight):
    """Return a (year x node) DataFrame of PageRank scores.

    Parameters
    ----------
    edge_weight : str or None
        Edge attribute passed to ``nx.pagerank`` as ``weight``;
        ``None`` gives the unweighted variant.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``years`` and one column per node in
        ``unique_nodes``. Nodes missing from a year's graph, and years whose
        computation failed, stay NaN.
    """
    # Use a sorted list rather than the raw set: a set has no stable order
    # (the CSV column order would vary between runs) and newer pandas versions
    # reject a set as `columns`. key=str keeps the sort valid for mixed id types.
    scores_df = pd.DataFrame(index=years, columns=sorted(unique_nodes, key=str), dtype=float)
    for y in tqdm(years):
        y_graph = read_json_file(graph_folder + f'technology_graph_{int(y)}.json')
        try:
            try:
                pagerank_centr = nx.pagerank(G=y_graph, weight=edge_weight, max_iter=10**3)
            except nx.PowerIterationFailedConvergence:
                # Did not converge at the default tolerance: retry with a looser one.
                pagerank_centr = nx.pagerank(G=y_graph, weight=edge_weight, max_iter=10**3, tol=1e-2)
        except Exception as err:
            # Best effort: leave this year as NaN, but report it instead of
            # hiding it. print() is used because warnings are silenced globally.
            print(f'pagerank (weight={edge_weight!r}) skipped for {y}: {err!r}')
            continue
        # A single .loc write aligned on node name. Chained `df.loc[y][col] = v`
        # assigns into an intermediate object and is not guaranteed to reach
        # the frame (it never does under pandas Copy-on-Write).
        scores_df.loc[y] = pd.Series(pagerank_centr, dtype=float).reindex(scores_df.columns)
    return scores_df


pagerank_df_uw = _pagerank_by_year(None)
pagerank_df = _pagerank_by_year(weight)

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

In [5]:
# degree centrality
# One row per year, one column per node. The columns are a sorted list rather
# than the raw set: a set has no stable order and newer pandas versions reject
# a set as `columns`. key=str keeps the sort valid for mixed id types.
degree_centr_df = pd.DataFrame(index=years, columns=sorted(unique_nodes, key=str), dtype=float)
for y in tqdm(years):
    y_graph = read_json_file(graph_folder + f'technology_graph_{int(y)}.json')
    degree_centr = nx.degree_centrality(G=y_graph)
    # A single .loc write aligned on node name; nodes absent from this year's
    # graph stay NaN. Chained `df.loc[y][col] = v` assigns into an intermediate
    # object and is not guaranteed to reach the frame (it never does under
    # pandas Copy-on-Write), and the bare `except: pass` around it hid that.
    degree_centr_df.loc[y] = pd.Series(degree_centr, dtype=float).reindex(degree_centr_df.columns)

  0%|          | 0/37 [00:00<?, ?it/s]

In [6]:
# eigenvector centrality - unweighted and weighted
def _eigenvector_by_year(edge_weight):
    """Return a (year x node) DataFrame of eigenvector centralities.

    Parameters
    ----------
    edge_weight : str or None
        Edge attribute passed to ``nx.eigenvector_centrality`` as ``weight``;
        ``None`` gives the unweighted variant.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``years`` and one column per node in
        ``unique_nodes``. Nodes missing from a year's graph, and years whose
        computation failed, stay NaN.
    """
    # Use a sorted list rather than the raw set: a set has no stable order
    # (the CSV column order would vary between runs) and newer pandas versions
    # reject a set as `columns`. key=str keeps the sort valid for mixed id types.
    scores_df = pd.DataFrame(index=years, columns=sorted(unique_nodes, key=str), dtype=float)
    for y in tqdm(years):
        y_graph = read_json_file(graph_folder + f'technology_graph_{int(y)}.json')
        try:
            try:
                eigenv_centr = nx.eigenvector_centrality(G=y_graph, weight=edge_weight)
            except nx.PowerIterationFailedConvergence:
                # Did not converge at the default tolerance: retry with a looser one.
                eigenv_centr = nx.eigenvector_centrality(G=y_graph, weight=edge_weight, tol=1e-03)
        except Exception as err:
            # Best effort: leave this year as NaN, but report it instead of
            # hiding it. print() is used because warnings are silenced globally.
            print(f'eigenvector centrality (weight={edge_weight!r}) skipped for {y}: {err!r}')
            continue
        # A single .loc write aligned on node name. Chained `df.loc[y][col] = v`
        # assigns into an intermediate object and is not guaranteed to reach
        # the frame (it never does under pandas Copy-on-Write).
        scores_df.loc[y] = pd.Series(eigenv_centr, dtype=float).reindex(scores_df.columns)
    return scores_df


eigenv_centr_df_uw = _eigenvector_by_year(None)
eigenv_centr_df = _eigenvector_by_year(weight)

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

In [7]:
# closeness centrality
# One row per year, one column per node. The columns are a sorted list rather
# than the raw set: a set has no stable order and newer pandas versions reject
# a set as `columns`. key=str keeps the sort valid for mixed id types.
closeness_centr_df = pd.DataFrame(index=years, columns=sorted(unique_nodes, key=str), dtype=float)
for y in tqdm(years):
    y_graph = read_json_file(graph_folder + f'technology_graph_{int(y)}.json')
    closeness_centr = nx.closeness_centrality(G=y_graph)
    # A single .loc write aligned on node name; nodes absent from this year's
    # graph stay NaN. Chained `df.loc[y][col] = v` assigns into an intermediate
    # object and is not guaranteed to reach the frame (it never does under
    # pandas Copy-on-Write), and the bare `except: pass` around it hid that.
    closeness_centr_df.loc[y] = pd.Series(closeness_centr, dtype=float).reindex(closeness_centr_df.columns)

  0%|          | 0/37 [00:00<?, ?it/s]

In [8]:
# betweenness centrality - unweighted and weighted
def _betweenness_by_year(edge_weight):
    """Return a (year x node) DataFrame of betweenness centralities.

    Parameters
    ----------
    edge_weight : str or None
        Edge attribute passed to ``nx.betweenness_centrality`` as ``weight``;
        ``None`` gives the unweighted variant.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``years`` and one column per node in
        ``unique_nodes``. Nodes missing from a year's graph stay NaN.

    Notes
    -----
    NOTE(review): networkx interprets ``weight`` as a distance when it
    searches shortest paths, so a larger value makes an edge "longer".
    Confirm that this is the intended reading of the citation counts.
    """
    # Use a sorted list rather than the raw set: a set has no stable order
    # (the CSV column order would vary between runs) and newer pandas versions
    # reject a set as `columns`. key=str keeps the sort valid for mixed id types.
    scores_df = pd.DataFrame(index=years, columns=sorted(unique_nodes, key=str), dtype=float)
    for y in tqdm(years):
        y_graph = read_json_file(graph_folder + f'technology_graph_{int(y)}.json')
        betweenness_centr = nx.betweenness_centrality(G=y_graph, seed=seed, weight=edge_weight)
        # A single .loc write aligned on node name. Chained `df.loc[y][col] = v`
        # assigns into an intermediate object and is not guaranteed to reach
        # the frame (it never does under pandas Copy-on-Write).
        scores_df.loc[y] = pd.Series(betweenness_centr, dtype=float).reindex(scores_df.columns)
    return scores_df


betweenness_centr_df_uw = _betweenness_by_year(None)
betweenness_centr_df = _betweenness_by_year(weight)

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

Saving output files (one CSV per centrality measure)

In [9]:
# Write every centrality table to `output_folder`, one CSV per table.
# to_csv does not create missing directories, so create the folder first.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# file name -> table, in the order the files are written
centrality_tables = {
    'pagerank_centr.csv': pagerank_df,
    'pagerank_centr_uw.csv': pagerank_df_uw,
    'degree_centr.csv': degree_centr_df,
    'eigenv_centr.csv': eigenv_centr_df,
    'eigenv_centr_uw.csv': eigenv_centr_df_uw,
    'closeness_centr.csv': closeness_centr_df,
    'betweenness_centr.csv': betweenness_centr_df,
    'betweenness_centr_uw.csv': betweenness_centr_df_uw,
}
for csv_name, table in centrality_tables.items():
    # index=True keeps the year index as the first CSV column
    table.to_csv(output_folder + csv_name, index=True)