### Notebook 6 - Summary: correlations of node centralities' ranks in technology graphs and GDPpc growth

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import pearsonr, spearmanr, ttest_ind, mannwhitneyu

# Random seed kept for reproducibility.
# NOTE(review): no active code in the visible cells consumes it — confirm whether it is still needed.
seed = 100

In [2]:
# Lookup from centrality identifier (the CSV file stem) to the label shown in result tables.
# The suffixes presumably mean weighted (W) / unweighted (UW) graphs — confirm against the
# centrality computation notebook.
centrality_names = dict([
    ('betweenness_centr_uw', 'Betweenness (UW)'),
    ('betweenness_centr', 'Betweenness (W)'),
    ('pagerank_centr_uw', 'PageRank (UW)'),
    ('degree_centr', 'Degree (W)'),
    ('pagerank_centr', 'PageRank (W)'),
    ('eigenv_centr', 'Eigenvector (W)'),
    ('eigenv_centr_uw', 'Eigenvector (UW)'),
    ('closeness_centr', 'Closeness (W)'),
])

In [3]:
def add_pvlaue_level(coef, pvalue):
    """Format a coefficient as a LaTeX math string with significance asterisks.

    Parameters
    ----------
    coef : any
        Value to display; it is converted with ``str``.
    pvalue : float
        P-value that determines the number of asterisks.

    Returns
    -------
    str
        ``'$<coef>^{<asterisks>}$'`` where the asterisks are ``'***'`` for
        ``pvalue <= 0.01``, ``'**'`` for ``pvalue <= 0.05``, ``'*'`` for
        ``pvalue <= 0.1`` and empty otherwise (including a NaN p-value, for
        which every comparison is false).
    """
    # Test the strictest threshold first so each branch needs only one comparison.
    if pvalue <= 0.01:
        stars = '***'
    elif pvalue <= 0.05:
        stars = '**'
    elif pvalue <= 0.1:
        stars = '*'
    else:
        stars = ''
    return f'${coef!s}^{{{stars}}}$'

In [4]:
# Root folder holding the per-technology centrality CSVs. Set the CENTRALITIES_DIR
# environment variable to run the notebook on another machine; when it is unset the
# original author's location is used, so existing behaviour is unchanged.
centr_root_dir = os.environ.get(
    'CENTRALITIES_DIR',
    '/Users/koshelev/Documents/lmu/thesis/2-centralities_computation/computed_centralities',
)
# The trailing separator is required: later cells append file names directly to these paths.
tech1_centr_path = os.path.join(centr_root_dir, 'technology1', '')
tech2_centr_path = os.path.join(centr_root_dir, 'technology2', '')

In [5]:
# all computed centrality names (CSV file stems), in reverse alphabetical order
# Only *.csv entries are considered: os.listdir also returns hidden/system files
# (e.g. .DS_Store), which would otherwise end up in the list and have no display label.
centr_names = sorted(
    (
        os.path.splitext(filename)[0]  # strip only the extension, not every '.csv' substring
        for filename in os.listdir(tech1_centr_path)
        if filename.endswith('.csv')
    ),
    reverse=True,
)
print(centr_names)

['pagerank_centr_uw', 'pagerank_centr', 'eigenv_centr_uw', 'eigenv_centr', 'degree_centr', 'closeness_centr', 'betweenness_centr_uw', 'betweenness_centr']


In [6]:
# GDP p.c. data from PWT10
# Built as one chain: compute rgdpo / pop per row, then reshape to a
# year (rows) x country code (columns) table ordered by year.
pwt = (
    pd.read_csv('data/pwt10.csv')[['countrycode', 'year', 'rgdpo', 'pop']]
    .assign(gdppc=lambda frame: frame['rgdpo'] / frame['pop'])
    .drop(columns=['rgdpo', 'pop'])
    .pivot(index='year', columns='countrycode', values='gdppc')
    .sort_values(by='year')
)

In [7]:
# check how many countries we have in different dimension pairs
# Only the column sets are compared, so a single centrality file per technology is
# loaded (index 3 of centr_names); presumably every file has the same columns — TODO confirm.

### technology 1 & GDPpc
gdp_set = set(pwt.columns)
# .ffill() is the supported spelling; fillna(method='ffill') is deprecated in recent pandas
tech1_centr = pd.read_csv(f'{tech1_centr_path}{centr_names[3]}.csv', index_col=0).ffill()
tech1_set = set(tech1_centr.columns)
n_common_tech1 = len(gdp_set.intersection(tech1_set))
print(f'countries in GDPpc dataframe: {len(gdp_set)}')
print(f'countries in tech.1 centr. dataframe: {len(tech1_set)}')
print(f'intersection in GDPpc and tech.1: {n_common_tech1}')
# share of the smaller of the two country sets that is covered by the intersection
print(f'intersection rate: {n_common_tech1 / min(len(gdp_set), len(tech1_set))}')
print(' ')
### technology 2 & GDPpc
tech2_centr = pd.read_csv(f'{tech2_centr_path}{centr_names[3]}.csv', index_col=0).ffill()
tech2_set = set(tech2_centr.columns)
n_common_tech2 = len(gdp_set.intersection(tech2_set))
print(f'countries in GDPpc dataframe: {len(gdp_set)}')
print(f'countries in tech.2 centr. dataframe: {len(tech2_set)}')
print(f'intersection in GDPpc and tech.2: {n_common_tech2}')
print(f'intersection rate: {n_common_tech2 / min(len(gdp_set), len(tech2_set))}')

countries in GDPpc dataframe: 183
countries in tech.1 centr. dataframe: 160
intersection in GDPpc and tech.1: 137
intersection rate: 0.85625
 
countries in GDPpc dataframe: 183
countries in tech.2 centr. dataframe: 160
intersection in GDPpc and tech.2: 137
intersection rate: 0.85625


In [8]:
# Years at which centrality ranks are read for technology 1 and technology 2 respectively
tech1_years, tech2_years = [1963, 1975, 1990], [1975, 1990, 1999]

In [9]:
# Countries present in both the GDP p.c. table and the respective centrality table.
# Sorted on purpose: iterating a set of strings can yield a different order in every
# kernel session, and that order becomes the column order used downstream, where it
# decides how countries with tied centrality ranks are arranged by the sort.
tech1_gdp_interc = sorted(gdp_set.intersection(tech1_set))
tech2_gdp_interc = sorted(gdp_set.intersection(tech2_set))

In [10]:
# technology 1 and GDPpc growth
# For every base year and centrality: difference in mean absolute GDP p.c. change between
# the n highest-ranked and n lowest-ranked countries, with a two-sided Mann-Whitney U test.
n = 20  # size of the top and of the bottom group

# Each centrality file is read and row-ranked once; neither step depends on the base year.
tech1_rank_frames = {
    centr: pd.read_csv(f'{tech1_centr_path}{centr}.csv', index_col=0)[tech1_gdp_interc].rank(axis=1)
    for centr in centr_names
}
pwt_tech1 = pwt[tech1_gdp_interc]

all_corr1 = []  # one list per base year, each holding one formatted cell per centrality
for tech1_year in tech1_years:
    # GDP p.c. change from the year after the centrality snapshot up to 2019;
    # the same for every centrality, so computed once per year.
    gdppc_array = pwt_tech1.loc[2019] - pwt_tech1.loc[tech1_year + 1]
    year_corr = []
    for centr in centr_names:
        tech_array = tech1_rank_frames[centr].loc[tech1_year]
        common_df = pd.concat([tech_array, gdppc_array], axis=1)
        common_df.columns = ['centr_rank', 'gdppc_change']
        # Largest rank value first; countries missing either value are dropped.
        common_df = common_df.sort_values(by='centr_rank', ascending=False).dropna()
        top_n_gdppc_gr = common_df['gdppc_change'].iloc[:n].values
        bottom_n_gdppc_gr = common_df['gdppc_change'].iloc[-n:].values
        diff = round(np.mean(top_n_gdppc_gr) - np.mean(bottom_n_gdppc_gr))
        pvalue = mannwhitneyu(top_n_gdppc_gr,
                              bottom_n_gdppc_gr,
                              nan_policy='omit',
                              alternative='two-sided').pvalue
        # The unrounded p-value is passed on: rounding first could push a value that is
        # marginally above a threshold (e.g. 0.050004) onto the threshold itself.
        year_corr.append(add_pvlaue_level(coef=diff, pvalue=pvalue))
    all_corr1.append(year_corr)

In [11]:
# Result table for technology 1: one row per centrality (display label),
# one column per base year / growth window.
tech1_gdppc = pd.DataFrame(
    all_corr1,
    index=['F-L(1963) - GDP p.c. gr. (1964-2019)',
           'F-L(1975) - GDP p.c. gr. (1976-2019)',
           'F-L(1990) - GDP p.c. gr. (1991-2019)'],
    columns=[centrality_names[centr_key] for centr_key in centr_names],
).T

In [12]:
# technology 2 and GDPpc growth
# For every base year and centrality: difference in mean absolute GDP p.c. change between
# the n highest-ranked and n lowest-ranked countries, with a two-sided Mann-Whitney U test.
n = 20  # size of the top and of the bottom group

# Each centrality file is read and row-ranked once; neither step depends on the base year.
tech2_rank_frames = {
    centr: pd.read_csv(f'{tech2_centr_path}{centr}.csv', index_col=0)[tech2_gdp_interc].rank(axis=1)
    for centr in centr_names
}
pwt_tech2 = pwt[tech2_gdp_interc]

all_corr2 = []  # one list per base year, each holding one formatted cell per centrality
for tech2_year in tech2_years:
    # GDP p.c. change from the year after the centrality snapshot up to 2019;
    # the same for every centrality, so computed once per year.
    gdppc_array = pwt_tech2.loc[2019] - pwt_tech2.loc[tech2_year + 1]
    year_corr = []
    for centr in centr_names:
        tech_array = tech2_rank_frames[centr].loc[tech2_year]
        common_df = pd.concat([tech_array, gdppc_array], axis=1)
        common_df.columns = ['centr_rank', 'gdppc_change']
        # Largest rank value first; countries missing either value are dropped.
        common_df = common_df.sort_values(by='centr_rank', ascending=False).dropna()
        top_n_gdppc_gr = common_df['gdppc_change'].iloc[:n].values
        bottom_n_gdppc_gr = common_df['gdppc_change'].iloc[-n:].values
        diff = round(np.mean(top_n_gdppc_gr) - np.mean(bottom_n_gdppc_gr))
        pvalue = mannwhitneyu(top_n_gdppc_gr,
                              bottom_n_gdppc_gr,
                              nan_policy='omit',
                              alternative='two-sided').pvalue
        # The unrounded p-value is passed on: rounding first could push a value that is
        # marginally above a threshold (e.g. 0.050004) onto the threshold itself.
        year_corr.append(add_pvlaue_level(coef=diff, pvalue=pvalue))
    all_corr2.append(year_corr)

In [13]:
# Result table for technology 2: one row per centrality (display label),
# one column per base year / growth window.
tech2_gdppc = pd.DataFrame(
    all_corr2,
    index=['B-L(1975) - GDP p.c. gr. (1976-2019)',
           'B-L(1990) - GDP p.c. gr. (1991-2019)',
           'B-L(1999) - GDP p.c. gr. (2000-2019)'],
    columns=[centrality_names[centr_key] for centr_key in centr_names],
).T

In [14]:
# Place the technology 1 and technology 2 result tables side by side (shared centrality rows).
tech_gdppc = pd.concat((tech1_gdppc, tech2_gdppc), axis='columns')

In [15]:
# Show the combined table: top-n minus bottom-n mean absolute GDP p.c. change per
# centrality and base year, with asterisks from the Mann-Whitney p-value.
tech_gdppc

Unnamed: 0,F-L(1963) - GDP p.c. gr. (1964-2019),F-L(1975) - GDP p.c. gr. (1976-2019),F-L(1990) - GDP p.c. gr. (1991-2019),B-L(1975) - GDP p.c. gr. (1976-2019),B-L(1990) - GDP p.c. gr. (1991-2019),B-L(1999) - GDP p.c. gr. (2000-2019)
PageRank (UW),$26473^{***}$,$26376^{***}$,$25934^{***}$,$24822^{***}$,$17244^{***}$,$6598^{***}$
PageRank (W),$28784^{***}$,$19029^{***}$,$26510^{***}$,$25051^{***}$,$17844^{***}$,$7267^{***}$
Eigenvector (UW),$26634^{***}$,$26933^{***}$,$29961^{***}$,$25035^{***}$,$17936^{***}$,$6801^{***}$
Eigenvector (W),$29813^{***}$,$19870^{***}$,$29716^{***}$,$25932^{***}$,$16986^{***}$,$7856^{***}$
Degree (W),$25575^{***}$,$26565^{***}$,$14468^{***}$,$38323^{***}$,$17608^{***}$,$5963^{***}$
Closeness (W),$27774^{***}$,$17791^{***}$,$31371^{***}$,$25813^{***}$,$17585^{***}$,$6801^{***}$
Betweenness (UW),$14121^{***}$,$25593^{***}$,$22563^{***}$,$32436^{***}$,$24854^{***}$,$8646^{}$
Betweenness (W),$13451^{***}$,$19029^{***}$,$10454^{***}$,$45264^{***}$,$6892^{***}$,$5766^{**}$


In [16]:
# technology 1 and GDPpc growth - pct change
# Same comparison as the absolute-change version, but on the relative change
# (GDP p.c. in 2019 divided by GDP p.c. in the year after the snapshot, minus 1).
n = 20  # size of the top and of the bottom group

# Each centrality file is read and row-ranked once; neither step depends on the base year.
tech1_rank_frames = {
    centr: pd.read_csv(f'{tech1_centr_path}{centr}.csv', index_col=0)[tech1_gdp_interc].rank(axis=1)
    for centr in centr_names
}
pwt_tech1 = pwt[tech1_gdp_interc]

all_corr1 = []  # one list per base year, each holding one formatted cell per centrality
for tech1_year in tech1_years:
    # Relative GDP p.c. change (a fraction, not multiplied by 100); the same for every
    # centrality, so computed once per year.
    gdppc_array = pwt_tech1.loc[2019] / pwt_tech1.loc[tech1_year + 1] - 1
    year_corr = []
    for centr in centr_names:
        tech_array = tech1_rank_frames[centr].loc[tech1_year]
        common_df = pd.concat([tech_array, gdppc_array], axis=1)
        common_df.columns = ['centr_rank', 'gdppc_change']
        # Largest rank value first; countries missing either value are dropped.
        common_df = common_df.sort_values(by='centr_rank', ascending=False).dropna()
        top_n_gdppc_gr = common_df['gdppc_change'].iloc[:n].values
        bottom_n_gdppc_gr = common_df['gdppc_change'].iloc[-n:].values
        diff = round(np.mean(top_n_gdppc_gr) - np.mean(bottom_n_gdppc_gr), 3)
        pvalue = mannwhitneyu(top_n_gdppc_gr,
                              bottom_n_gdppc_gr,
                              nan_policy='omit',
                              alternative='two-sided').pvalue
        # The unrounded p-value is passed on: rounding first could push a value that is
        # marginally above a threshold (e.g. 0.050004) onto the threshold itself.
        year_corr.append(add_pvlaue_level(coef=diff, pvalue=pvalue))
    all_corr1.append(year_corr)

In [17]:
# Result table for technology 1: one row per centrality (display label),
# one column per base year / growth window.
tech1_gdppc = pd.DataFrame(
    all_corr1,
    index=['F-L(1963) - GDP p.c. gr. (1964-2019)',
           'F-L(1975) - GDP p.c. gr. (1976-2019)',
           'F-L(1990) - GDP p.c. gr. (1991-2019)'],
    columns=[centrality_names[centr_key] for centr_key in centr_names],
).T

In [18]:
# technology 2 and GDPpc growth - pct change
# Same comparison as the absolute-change version, but on the relative change
# (GDP p.c. in 2019 divided by GDP p.c. in the year after the snapshot, minus 1).
n = 20  # size of the top and of the bottom group

# Each centrality file is read and row-ranked once; neither step depends on the base year.
tech2_rank_frames = {
    centr: pd.read_csv(f'{tech2_centr_path}{centr}.csv', index_col=0)[tech2_gdp_interc].rank(axis=1)
    for centr in centr_names
}
pwt_tech2 = pwt[tech2_gdp_interc]

all_corr2 = []  # one list per base year, each holding one formatted cell per centrality
for tech2_year in tech2_years:
    # Relative GDP p.c. change (a fraction, not multiplied by 100); the same for every
    # centrality, so computed once per year.
    gdppc_array = pwt_tech2.loc[2019] / pwt_tech2.loc[tech2_year + 1] - 1
    year_corr = []
    for centr in centr_names:
        tech_array = tech2_rank_frames[centr].loc[tech2_year]
        common_df = pd.concat([tech_array, gdppc_array], axis=1)
        common_df.columns = ['centr_rank', 'gdppc_change']
        # Largest rank value first; countries missing either value are dropped.
        common_df = common_df.sort_values(by='centr_rank', ascending=False).dropna()
        top_n_gdppc_gr = common_df['gdppc_change'].iloc[:n].values
        bottom_n_gdppc_gr = common_df['gdppc_change'].iloc[-n:].values
        diff = round(np.mean(top_n_gdppc_gr) - np.mean(bottom_n_gdppc_gr), 3)
        pvalue = mannwhitneyu(top_n_gdppc_gr,
                              bottom_n_gdppc_gr,
                              nan_policy='omit',
                              alternative='two-sided').pvalue
        # The unrounded p-value is passed on: rounding first could push a value that is
        # marginally above a threshold (e.g. 0.050004) onto the threshold itself.
        year_corr.append(add_pvlaue_level(coef=diff, pvalue=pvalue))
    all_corr2.append(year_corr)

In [19]:
# Result table for technology 2: one row per centrality (display label),
# one column per base year / growth window.
tech2_gdppc = pd.DataFrame(
    all_corr2,
    index=['B-L(1975) - GDP p.c. gr. (1976-2019)',
           'B-L(1990) - GDP p.c. gr. (1991-2019)',
           'B-L(1999) - GDP p.c. gr. (2000-2019)'],
    columns=[centrality_names[centr_key] for centr_key in centr_names],
).T

In [20]:
# Place the technology 1 and technology 2 relative-change tables side by side (shared centrality rows).
tech_gdppc_pct = pd.concat((tech1_gdppc, tech2_gdppc), axis='columns')

In [21]:
# Show the combined table: top-n minus bottom-n mean relative GDP p.c. change per
# centrality and base year, with asterisks from the Mann-Whitney p-value.
tech_gdppc_pct

Unnamed: 0,F-L(1963) - GDP p.c. gr. (1964-2019),F-L(1975) - GDP p.c. gr. (1976-2019),F-L(1990) - GDP p.c. gr. (1991-2019),B-L(1975) - GDP p.c. gr. (1976-2019),B-L(1990) - GDP p.c. gr. (1991-2019),B-L(1999) - GDP p.c. gr. (2000-2019)
PageRank (UW),$1.489^{}$,$0.093^{**}$,$-0.57^{**}$,$-0.061^{}$,$-1.045^{}$,$-0.417^{}$
PageRank (W),$2.448^{*}$,$0.068^{**}$,$-0.415^{}$,$0.028^{}$,$-1.159^{}$,$-0.643^{}$
Eigenvector (UW),$0.652^{}$,$0.07^{**}$,$-0.564^{*}$,$-0.026^{}$,$-1.047^{}$,$-0.514^{}$
Eigenvector (W),$2.586^{**}$,$-0.055^{}$,$-0.33^{}$,$0.142^{}$,$-1.308^{*}$,$-0.634^{}$
Degree (W),$0.195^{}$,$0.005^{*}$,$-0.713^{**}$,$-0.373^{}$,$-0.679^{}$,$-0.886^{***}$
Closeness (W),$1.982^{}$,$-0.376^{}$,$-0.457^{*}$,$0.116^{}$,$-1.057^{}$,$-0.514^{}$
Betweenness (UW),$-2.718^{}$,$-0.425^{}$,$-0.454^{}$,$-0.646^{}$,$-1.233^{*}$,$-0.602^{**}$
Betweenness (W),$-2.629^{}$,$-0.384^{}$,$-0.403^{}$,$0.695^{}$,$-1.096^{**}$,$-0.731^{}$
