In [1]:
notebook_id = "121"

<a id="ID_top"></a>
## Country data organisation

This workflow generates (part1), maintains (part2) and exports (part3)

#### Notebook sections:
    
|| [0| Default imports](#ID_top) || [1|Part1 Reference table generation](#ID_part1) || [2|Part2 Table maintanence](#ID_part2) || [3|Part3 Table export](#ID_part3) || 

#### Import all packages that could be required

In [2]:
# %load s_package_import.py
# package library, use to ensure consistency across notebooks, refresh periodically
# general packages
import os # use with os.listdir(_path_)
import requests
import csv
import time
from datetime import datetime
from shutil import copyfile

#temp check
#from shutil import make_archive
import zipfile #notebook

# data analysis packages
import pandas as pd
pd.options.display.max_columns = None # don't truncate columns
#pd.options.display.max_rows = None

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes
import pycountry
from sklearn import preprocessing

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== network analysis
import networkx as nx
#=== gavity modelling
import gme as gme

#=== distance datasets
import wbdata


  import pandas.util.testing as tm


#### Import module and declare path variables
`import s_filepaths.py`

In [3]:
# import ref file
import s_filepaths

# declare local variables to work with
path_raw = s_filepaths.path_raw
path_raw_dl = s_filepaths.path_raw_dl
path_store = s_filepaths.path_store
path_live = s_filepaths.path_live

In [4]:
#os.listdir(path_livepath_live)

****

<a id="ID_part1"></a>
### Part 1 | Import all countries and set scope
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

**Import master country reference file**

In [4]:
# View all files in live directory
sorted(list(os.listdir(f"{path_live}")))

['.DS_Store',
 '112_input_gme_data.csv.gzip',
 '113_input_bri_members.csv.gzip',
 '120_input_scope_data.csv.gzip',
 '121_input_de_matrix.csv.gzip',
 '121_input_de_min_max.csv.gzip',
 '121_input_di_matrix.csv.gzip',
 '121_input_di_min_max.csv.gzip',
 '122_input_un_com_2016_mini.csv.gzip',
 '125_input_scope_country.csv.gzip',
 '2_raw_explainer_doc.md']

In [5]:
country_file = "120_input_scope_data.csv.gzip"
df_scope = pd.read_csv(f"{path_live}{country_file}",compression="gzip")
df_scope.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0


In [6]:
# get country list
list_scope_countries = list(df_scope.iso3_d.unique())
len(list_scope_countries)

88

Check countries are present in the World Bank database.

In [7]:
def f_wb_iso3_check(p_iso3_list):
    wb_country_list = []
    # Check presence in WB database
    for iso3 in p_iso3_list:
        try:
            wbdata.api.get_country(iso3)
            wb_country_list.append(iso3)
        except:
            print(f"{iso3} gave error")

    print(f"\n{len(wb_country_list)} are included in the refined list.")
    return wb_country_list

In [8]:
list_wb_scope = f_wb_iso3_check(list_scope_countries)

GAZ gave error
VAT gave error

86 are included in the refined list.


In [10]:
# see world bank data sources
#wbdata.get_source()

# logistics at 66

<a id="ID_part2"></a>
### Part 2 | Download data
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

**This section extracts data for:**<br>
**A.** Economic distance (WDI) - source 2<br>
**B.** Institutional distance (WGI) - source 3
<br><br>
Both are sourced from the World Bank. 

In [20]:
# Master variables
target_date = datetime(2016,1,1)

**A | Economic Distance (DE) | Download**

In [54]:
# Economic distance indicators (DE) based on WDI
de_indicator = ["NY.GDP.PCAP.PP.CD"]
target_date = target_date

In [55]:
# Iterate over country and indicator to correctly flag what is available
outputs = []
list_excluded = []
df_wb_de = pd.DataFrame()

# for every country in list of available countries for our desired region
for country in list_wb_scope:
    
    # iterate through the indicators one by one
    try:
        for indicator in de_indicator:
            df_wb_de.loc[country,indicator] = wbdata.api.get_series(indicator = indicator, source = "2",
                                                     country = country,data_date=target_date)[0] 
    except:
        print(f"WARNING | Country: {country} data not available")
        list_excluded.append(country)

**B | Institutional Distance (DI) | Download**

In [56]:
# Insitutional distance indicators (DI) based on WGI
indicator_list = ["CC.EST","GE.EST","PV.EST","RL.EST","RQ.EST","VA.EST"]
target_date = target_date

In [57]:
# Iterate over country and indicator to correctly flag what is available
outputs = []
list_excluded_di = []
df_wb_di = pd.DataFrame()

# for every country in list of available countries for our desired region
for country in list_wb_scope:
    
    # iterate through the indicators one by one
    try:
        for indicator in indicator_list:
            df_wb_di.loc[country,indicator] = wbdata.api.get_series(indicator = indicator, source = "3",
                                                     country = country,data_date=target_date)[0] 
    except:
        print(f"WARNING | Country: {country} data not available")
        list_excluded_di.append(country)



**C | Logistics performance index (LPI) | Download**

In [12]:
# see indicators
wbdata.get_indicator(source = 66)
# all lpi indicators
lpi_indicator_list = ["LP.LPI.CUST.XQ","LP.LPI.INFR.XQ","LP.LPI.ITRN.XQ",
                      "LP.LPI.LOGS.XQ","LP.LPI.TIME.XQ","LP.LPI.TRAC.XQ",
                      "LP.LPI.OVRL.XQ","LP.LPI.OVRL.RK.UB","LP.LPI.OVRL.XQ.LB"]
# selected indicators
lpi_indicator_list = [lpi_indicator_list[6]]

In [50]:
# Get specific year
# Iterate over country and indicator to correctly flag what is available
outputs = []
list_excluded_lpi = []
df_wb_lpi = pd.DataFrame()

# for every country in list of available countries for our desired region
for country in list_wb_scope:
    
    # iterate through the indicators one by one
    try:
        for indicator in lpi_indicator_list:
            df_wb_lpi.loc[country,indicator] = wbdata.api.get_series(indicator = indicator, source = "66",
                                       data_date=target_date,country = country)[0] 
    except:
        print(f"WARNING | Country: {country} data not available")
        list_excluded_lpi.append(country)



In [63]:
# Get all years!
# Iterate over country and indicator to correctly flag what is available
outputs = []
list_excluded_lpi = []
df_wb_lpi_list = []

# for every country in list of available countries for our desired region
for country in list_wb_scope:
    
    # iterate through the indicators one by one
    try:
        for indicator in lpi_indicator_list:
            temp_df = pd.DataFrame(wbdata.api.get_series(indicator = indicator, source = "66",country = country))

            temp_df.columns = [country]
            temp_df = temp_df.transpose().copy()
            df_wb_lpi_list.append(temp_df)   
            
    except:
        print(f"WARNING | Country: {country} data not available")
        list_excluded_lpi.append(country)

# concatenate all columns
df_wb_lpi_all_years = pd.concat(df_wb_lpi_list)
df_wb_lpi_all_years.head()



date,2018,2016,2014,2012,2010,2007
ARE,3.956437,3.941767,3.539098,3.77844,3.63034,3.727581
AZE,,,2.448376,2.481118,2.639554,2.290998
BEL,4.039084,4.108538,4.04466,3.980262,3.942263,3.893764
CHN,3.605147,3.661104,3.531463,3.517017,3.489039,3.321935
CZE,3.68033,3.674309,3.492416,3.141498,3.506553,3.134626


**A+B | Merge dataframes**

In [61]:
# Merge data
df_wb_all = df_wb_di.merge(df_wb_de,left_index = True, right_index= True).copy()
df_wb_all.head()

# only keep entries with data
df_wb_all.dropna(inplace= True)
missing_country = set(list_wb_scope) - set(list(df_wb_all.index))
print(f"Original list = {len(list_wb_scope)} | Economic and Institutional data for = {len(df_wb_all)}")
print(f"Countries with no data are: {missing_country}")

# rename columns
df_wb_all.columns = ["Control of corruption","Government effectiveness","Political Stability and Absence of Violence",
                     "Rule of Law","Regulatory Quality","Voice and Accountability","GDP per capita (PPP)"]

df_wb_all.head()

Original list = 86 | Economic and Institutional data for = 77
Countries with no data are: {'GIB', 'YEM', 'SMR', 'FRO', 'PRK', 'MCO', 'AND', 'LIE', 'SYR'}


Unnamed: 0,Control of corruption,Government effectiveness,Political Stability and Absence of Violence,Rule of Law,Regulatory Quality,Voice and Accountability,GDP per capita (PPP)
ARE,1.171709,1.415928,0.564749,0.849041,0.972596,-1.050421,63968.853871
AZE,-0.838504,-0.165613,-0.803879,-0.519861,-0.282023,-1.563101,14371.022741
BEL,1.638705,1.326213,0.441367,1.390965,1.340684,1.376962,48608.957782
CHN,-0.254059,0.354875,-0.499395,-0.334415,-0.264475,-1.561287,13572.620725
CZE,0.536757,1.036739,0.976116,1.035502,0.986401,1.024769,35876.727438


**(A+B) + C | Second merging**

In [62]:
# Merge data
df_wb_complete = df_wb_all.merge(df_wb_lpi,left_index = True, right_index= True).copy()
#df_wb_all.head()

# only keep entries with data
df_wb_complete.dropna(inplace= True)
missing_country = set(list_wb_scope) - set(list(df_wb_complete.index))
print(f"Original list = {len(list_wb_scope)} | Economic and Institutional data for = {len(df_wb_all)}")
print(f"Countries with no data are: {missing_country}")

# rename columns
df_wb_complete.columns = ["Control of corruption","Government effectiveness","Political Stability and Absence of Violence",
                     "Rule of Law","Regulatory Quality","Voice and Accountability","GDP per capita (PPP)","Logistics Performance Index"]

print(df_wb_complete.shape)
df_wb_complete.head()

Original list = 86 | Economic and Institutional data for = 77
Countries with no data are: {'GIB', 'YEM', 'SMR', 'AZE', 'KSV', 'PSE', 'FRO', 'PRK', 'TLS', 'MCO', 'MAC', 'AND', 'LIE', 'SYR'}
(72, 8)


Unnamed: 0,Control of corruption,Government effectiveness,Political Stability and Absence of Violence,Rule of Law,Regulatory Quality,Voice and Accountability,GDP per capita (PPP),Logistics Performance Index
ARE,1.171709,1.415928,0.564749,0.849041,0.972596,-1.050421,63968.853871,3.941767
BEL,1.638705,1.326213,0.441367,1.390965,1.340684,1.376962,48608.957782,4.108538
CHN,-0.254059,0.354875,-0.499395,-0.334415,-0.264475,-1.561287,13572.620725,3.661104
CZE,0.536757,1.036739,0.976116,1.035502,0.986401,1.024769,35876.727438,3.674309
ESP,0.51563,1.115842,0.413188,0.978825,1.008604,1.041046,37282.442513,3.727412


In [64]:
# Save to live
filename = "wb_distance_2016"
s_file_export.f_df_export(df_wb_all,filename,p_file_id=f"{notebook_id}_",p_loc2=path_raw,p_loc2_pre="raw_")

Export | ../Data/1_raw_processed_backup/121_store_wb_distance_2016_20200811_1935.csv | COMPLETE
COPY   | ../Data/0_raw/121_raw_wb_distance_2016.csv.gzip | COMPLETE


<a id="ID_part3"></a>
### Part 3 | Calculate martices
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

**This section calculates and manipulates data for:**<br>
**A.** Economic distance (WDI)<br>
**B.** Institutional distance (WGI)
<br><br>
Both are sourced from the World Bank. 

1. A matrix of each country vs each country is created in the order of the columns in table `df_wb_all` and stored in object `distance_matrices`

2. Matrices are separated and the final distance is calculated for each kind of measure.

In [20]:
# Basics variables for matrix to determine size and labels
matrix_size = len(df_wb_all)
matrix_index = list(df_wb_all.index)

In [21]:
# storing object for matrices
distance_matrices = []

# for every column
for column in df_wb_all.columns:
    # create zero matrix
    df_matrix_zero = pd.DataFrame(np.zeros(shape = (matrix_size, matrix_size)),columns = matrix_index,index = matrix_index)
    
    # for every country (index)
    for entry in df_wb_all.index:
        
        for entry_l2 in df_wb_all.index:
            # calculate difference
            temp_difference = df_wb_all.loc[entry,column] - df_wb_all.loc[entry_l2,column]
            
            # assign calculated value to correct position in matrix
            df_matrix_zero.loc[entry,entry_l2] = temp_difference
            
    distance_matrices.append(df_matrix_zero)

**A | Economic distance | calculation and normalisation (0-1)**

In [22]:
distance_matrices[6].head()

Unnamed: 0,ARE,AZE,BEL,CHN,CZE,ESP,GEO,HUN,IRQ,ITA,KGZ,KOR,KSV,LTU,OMN,SRB,SWE,THA,TJK,UKR,VNM,AFG,ALB,ARM,AUT,BGD,BGR,BIH,BLR,BRN,BTN,CHE,DEU,DNK,EST,FIN,FRA,GRC,HKG,HRV,IDN,IND,IRL,IRN,ISR,JOR,KAZ,KHM,KWT,LAO,LBN,LUX,LVA,MAC,MDA,MKD,MMR,MNG,MNE,MYS,NLD,NOR,NPL,PAK,POL,PRT,PSE,QAT,ROU,RUS,SAU,SGP,SVK,TKM,TLS,TUR,UZB
ARE,0.0,49597.83113,15359.89609,50396.233146,28092.126434,26686.411358,51110.370471,36269.222934,54071.85243,24045.945388,59287.679138,24401.837247,53906.745607,33043.922928,35042.715678,48234.520983,13543.699244,47350.091682,60871.651844,52820.657325,57395.757533,61839.865845,51965.921973,53264.620463,11335.599583,60119.742884,44083.072004,50978.838063,46243.101692,7672.834104,53511.65207,-1751.292658,13404.602563,11998.104071,32737.382725,19038.945094,21048.587053,36145.492027,6747.440108,39250.724189,53474.293405,58128.955282,-7952.863178,49957.308743,26124.876061,54684.9832,40150.029045,60293.815951,19283.503248,57125.292289,48481.744619,-46680.842747,37564.328185,-51729.205366,53358.376029,48985.687397,59476.83555,53226.200779,45883.824085,38422.844643,11685.697022,5034.898095,61385.910198,59558.759118,35685.152434,32364.30756,58473.748341,-19133.631799,39803.101306,39843.455008,18483.194297,-25417.225529,34317.329737,50102.529153,60827.979926,37639.493702,57515.925934
AZE,-49597.83113,0.0,-34237.935041,798.402016,-21505.704697,-22911.419772,1512.539341,-13328.608196,4474.0213,-25551.885743,9689.848008,-25195.993883,4308.914477,-16553.908202,-14555.115453,-1363.310147,-36054.131887,-2247.739448,11273.820714,3222.826194,7797.926403,12242.034715,2368.090842,3666.789333,-38262.231547,10521.911754,-5514.759126,1381.006932,-3354.729438,-41924.997027,3913.82094,-51349.123788,-36193.228567,-37599.727059,-16860.448405,-30558.886036,-28549.244078,-13452.339103,-42850.391023,-10347.106941,3876.462274,8531.124152,-57550.694308,359.477612,-23472.95507,5087.15207,-9447.802085,10695.98482,-30314.327882,7527.461159,-1116.086511,-96278.673877,-12033.502945,-101327.036497,3760.544898,-612.143734,9879.00442,3628.369649,-3714.007045,-11174.986487,-37912.134108,-44562.933035,11788.079068,9960.927988,-13912.678696,-17233.52357,8875.917211,-68731.462929,-9794.729824,-9754.376122,-31114.636834,-75015.056659,-15280.501394,504.698023,11230.148795,-11958.337428,7918.094803
BEL,-15359.89609,34237.935041,0.0,35036.337056,12732.230344,11326.515268,35750.474381,20909.326844,38711.95634,8686.049298,43927.783049,9041.941157,38546.849518,17684.026838,19682.819588,32874.624893,-1816.196846,31990.195592,45511.755754,37460.761235,42035.861444,46479.969755,36606.025883,37904.724374,-4024.296507,44759.846795,28723.175914,35618.941973,30883.205602,-7687.061986,38151.755981,-17111.188747,-1955.293526,-3361.792019,17377.486635,3679.049005,5688.690963,20785.595937,-8612.455982,23890.828099,38114.397315,42769.059193,-23312.759267,34597.412653,10764.979971,39325.087111,24790.132956,44933.919861,3923.607158,41765.396199,33121.848529,-62040.738837,22204.432096,-67089.101456,37998.479939,33625.791307,44116.939461,37866.304689,30523.927996,23062.948553,-3674.199067,-10324.997994,46026.014108,44198.863028,20325.256344,17004.411471,43113.852252,-34493.527888,24443.205216,24483.558918,3123.298207,-40777.121618,18957.433647,34742.633064,45468.083836,22279.597612,42156.029844
CHN,-50396.233146,-798.402016,-35036.337056,0.0,-22304.106712,-23709.821788,714.137325,-14127.010212,3675.619284,-26350.287758,8891.445993,-25994.395899,3510.512462,-17352.310218,-15353.517468,-2161.712163,-36852.533902,-3046.141464,10475.418698,2424.424179,6999.524387,11443.632699,1569.688827,2868.387317,-39060.633563,9723.509739,-6313.161142,582.604917,-4153.131454,-42723.399042,3115.418925,-52147.525804,-36991.630582,-38398.129075,-17658.850421,-31357.288052,-29347.646093,-14250.741119,-43648.793038,-11145.508957,3078.060259,7732.722136,-58349.096323,-438.924403,-24271.357085,4288.750055,-10246.204101,9897.582805,-31112.729898,6729.059143,-1914.488527,-97077.075893,-12831.904961,-102125.438512,2962.142883,-1410.545749,9080.602404,2829.967633,-4512.409061,-11973.388503,-38710.536124,-45361.33505,10989.677052,9162.525972,-14711.080712,-18031.925586,8077.515196,-69529.864944,-10593.13184,-10552.778138,-31913.038849,-75813.458674,-16078.903409,-293.703993,10431.74678,-12756.739444,7119.692788
CZE,-28092.126434,21505.704697,-12732.230344,22304.106712,0.0,-1405.715076,23018.244037,8177.0965,25979.725996,-4046.181046,31195.552705,-3690.289187,25814.619174,4951.796494,6950.589244,20142.394549,-14548.42719,19257.965248,32779.52541,24728.530891,29303.6311,33747.739411,23873.795539,25172.494029,-16756.526851,32027.616451,15990.94557,22886.711629,18150.975258,-20419.29233,25419.525637,-29843.419092,-14687.52387,-16094.022363,4645.256291,-9053.181339,-7043.539381,8053.365593,-21344.686326,11158.597755,25382.166971,30036.828849,-36044.989611,21865.182309,-1967.250373,26592.856767,12057.902612,32201.689517,-8808.623186,29033.165855,20389.618185,-74772.969181,9472.201751,-79821.3318,25266.249595,20893.560963,31384.709116,25134.074345,17791.697652,10330.718209,-16406.429411,-23057.228338,33293.783764,31466.632684,7593.026,4272.181126,30381.621908,-47225.758232,11710.974872,11751.328574,-9608.932137,-53509.351962,6225.203303,22010.40272,32735.853492,9547.367268,29423.7995


In [104]:
# apply eucledian distance formula for consistency

In [109]:
# light touch final processing
df_de = distance_matrices[6].copy()
matrix_de = df_de.to_numpy() # create array

matrix_de = np.square(matrix_de) # square all values
# no need to divide by the number of dimensions as there is only one
matrix_de = np.sqrt(matrix_de) # square root all values

df_de = pd.DataFrame(matrix_de,index = matrix_index,columns =matrix_index)
df_de.head()

Unnamed: 0,ARE,AZE,BEL,CHN,CZE,ESP,GEO,HUN,IRQ,ITA,KGZ,KOR,KSV,LTU,OMN,SRB,SWE,THA,TJK,UKR,VNM,AFG,ALB,ARM,AUT,BGD,BGR,BIH,BLR,BRN,BTN,CHE,DEU,DNK,EST,FIN,FRA,GRC,HKG,HRV,IDN,IND,IRL,IRN,ISR,JOR,KAZ,KHM,KWT,LAO,LBN,LUX,LVA,MAC,MDA,MKD,MMR,MNG,MNE,MYS,NLD,NOR,NPL,PAK,POL,PRT,PSE,QAT,ROU,RUS,SAU,SGP,SVK,TKM,TLS,TUR,UZB
ARE,0.0,49597.83113,15359.89609,50396.233146,28092.126434,26686.411358,51110.370471,36269.222934,54071.85243,24045.945388,59287.679138,24401.837247,53906.745607,33043.922928,35042.715678,48234.520983,13543.699244,47350.091682,60871.651844,52820.657325,57395.757533,61839.865845,51965.921973,53264.620463,11335.599583,60119.742884,44083.072004,50978.838063,46243.101692,7672.834104,53511.65207,1751.292658,13404.602563,11998.104071,32737.382725,19038.945094,21048.587053,36145.492027,6747.440108,39250.724189,53474.293405,58128.955282,7952.863178,49957.308743,26124.876061,54684.9832,40150.029045,60293.815951,19283.503248,57125.292289,48481.744619,46680.842747,37564.328185,51729.205366,53358.376029,48985.687397,59476.83555,53226.200779,45883.824085,38422.844643,11685.697022,5034.898095,61385.910198,59558.759118,35685.152434,32364.30756,58473.748341,19133.631799,39803.101306,39843.455008,18483.194297,25417.225529,34317.329737,50102.529153,60827.979926,37639.493702,57515.925934
AZE,49597.83113,0.0,34237.935041,798.402016,21505.704697,22911.419772,1512.539341,13328.608196,4474.0213,25551.885743,9689.848008,25195.993883,4308.914477,16553.908202,14555.115453,1363.310147,36054.131887,2247.739448,11273.820714,3222.826194,7797.926403,12242.034715,2368.090842,3666.789333,38262.231547,10521.911754,5514.759126,1381.006932,3354.729438,41924.997027,3913.82094,51349.123788,36193.228567,37599.727059,16860.448405,30558.886036,28549.244078,13452.339103,42850.391023,10347.106941,3876.462274,8531.124152,57550.694308,359.477612,23472.95507,5087.15207,9447.802085,10695.98482,30314.327882,7527.461159,1116.086511,96278.673877,12033.502945,101327.036497,3760.544898,612.143734,9879.00442,3628.369649,3714.007045,11174.986487,37912.134108,44562.933035,11788.079068,9960.927988,13912.678696,17233.52357,8875.917211,68731.462929,9794.729824,9754.376122,31114.636834,75015.056659,15280.501394,504.698023,11230.148795,11958.337428,7918.094803
BEL,15359.89609,34237.935041,0.0,35036.337056,12732.230344,11326.515268,35750.474381,20909.326844,38711.95634,8686.049298,43927.783049,9041.941157,38546.849518,17684.026838,19682.819588,32874.624893,1816.196846,31990.195592,45511.755754,37460.761235,42035.861444,46479.969755,36606.025883,37904.724374,4024.296507,44759.846795,28723.175914,35618.941973,30883.205602,7687.061986,38151.755981,17111.188747,1955.293526,3361.792019,17377.486635,3679.049005,5688.690963,20785.595937,8612.455982,23890.828099,38114.397315,42769.059193,23312.759267,34597.412653,10764.979971,39325.087111,24790.132956,44933.919861,3923.607158,41765.396199,33121.848529,62040.738837,22204.432096,67089.101456,37998.479939,33625.791307,44116.939461,37866.304689,30523.927996,23062.948553,3674.199067,10324.997994,46026.014108,44198.863028,20325.256344,17004.411471,43113.852252,34493.527888,24443.205216,24483.558918,3123.298207,40777.121618,18957.433647,34742.633064,45468.083836,22279.597612,42156.029844
CHN,50396.233146,798.402016,35036.337056,0.0,22304.106712,23709.821788,714.137325,14127.010212,3675.619284,26350.287758,8891.445993,25994.395899,3510.512462,17352.310218,15353.517468,2161.712163,36852.533902,3046.141464,10475.418698,2424.424179,6999.524387,11443.632699,1569.688827,2868.387317,39060.633563,9723.509739,6313.161142,582.604917,4153.131454,42723.399042,3115.418925,52147.525804,36991.630582,38398.129075,17658.850421,31357.288052,29347.646093,14250.741119,43648.793038,11145.508957,3078.060259,7732.722136,58349.096323,438.924403,24271.357085,4288.750055,10246.204101,9897.582805,31112.729898,6729.059143,1914.488527,97077.075893,12831.904961,102125.438512,2962.142883,1410.545749,9080.602404,2829.967633,4512.409061,11973.388503,38710.536124,45361.33505,10989.677052,9162.525972,14711.080712,18031.925586,8077.515196,69529.864944,10593.13184,10552.778138,31913.038849,75813.458674,16078.903409,293.703993,10431.74678,12756.739444,7119.692788
CZE,28092.126434,21505.704697,12732.230344,22304.106712,0.0,1405.715076,23018.244037,8177.0965,25979.725996,4046.181046,31195.552705,3690.289187,25814.619174,4951.796494,6950.589244,20142.394549,14548.42719,19257.965248,32779.52541,24728.530891,29303.6311,33747.739411,23873.795539,25172.494029,16756.526851,32027.616451,15990.94557,22886.711629,18150.975258,20419.29233,25419.525637,29843.419092,14687.52387,16094.022363,4645.256291,9053.181339,7043.539381,8053.365593,21344.686326,11158.597755,25382.166971,30036.828849,36044.989611,21865.182309,1967.250373,26592.856767,12057.902612,32201.689517,8808.623186,29033.165855,20389.618185,74772.969181,9472.201751,79821.3318,25266.249595,20893.560963,31384.709116,25134.074345,17791.697652,10330.718209,16406.429411,23057.228338,33293.783764,31466.632684,7593.026,4272.181126,30381.621908,47225.758232,11710.974872,11751.328574,9608.932137,53509.351962,6225.203303,22010.40272,32735.853492,9547.367268,29423.7995


In [108]:
# rescale (0-1)
scaler=preprocessing.MinMaxScaler() # define scaler
# fit data on reshaped one dimensional array (this preserves matching distances between pairs)
print(scaler.fit(matrix_de.reshape((5929,1)))) 
matrix_de_minmax = scaler.transform(matrix_de.reshape((5929,1))) # save as array

# back to data frame
df_de_minmax = pd.DataFrame(matrix_de_minmax.reshape((77,77)),index = matrix_index,columns =matrix_index)
df_de_minmax.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0,ARE,AZE,BEL,CHN,CZE,ESP,GEO,HUN,IRQ,ITA,KGZ,KOR,KSV,LTU,OMN,SRB,SWE,THA,TJK,UKR,VNM,AFG,ALB,ARM,AUT,BGD,BGR,BIH,BLR,BRN,BTN,CHE,DEU,DNK,EST,FIN,FRA,GRC,HKG,HRV,IDN,IND,IRL,IRN,ISR,JOR,KAZ,KHM,KWT,LAO,LBN,LUX,LVA,MAC,MDA,MKD,MMR,MNG,MNE,MYS,NLD,NOR,NPL,PAK,POL,PRT,PSE,QAT,ROU,RUS,SAU,SGP,SVK,TKM,TLS,TUR,UZB
ARE,0.0,0.43672,0.135247,0.44375,0.247357,0.23498,0.450038,0.319358,0.476114,0.21173,0.522041,0.214863,0.47466,0.290959,0.308559,0.424715,0.119255,0.416928,0.535988,0.465097,0.505382,0.544513,0.457571,0.469006,0.099812,0.529367,0.388161,0.44888,0.40718,0.067561,0.471182,0.015421,0.11803,0.105646,0.28826,0.167642,0.185337,0.318269,0.059413,0.345611,0.470853,0.511838,0.070027,0.439885,0.230035,0.481513,0.35353,0.5309,0.169795,0.503,0.426892,0.411035,0.330762,0.455487,0.469832,0.431329,0.523706,0.468668,0.404017,0.338321,0.102895,0.044333,0.540516,0.524428,0.314215,0.284975,0.514874,0.168476,0.350475,0.35083,0.162748,0.223804,0.302171,0.441164,0.535603,0.331424,0.50644
AZE,0.43672,0.0,0.301472,0.00703,0.189362,0.20174,0.013318,0.117361,0.039395,0.22499,0.085321,0.221856,0.037941,0.145761,0.128161,0.012004,0.317464,0.019792,0.099268,0.028378,0.068662,0.107794,0.020852,0.032287,0.336907,0.092648,0.048559,0.01216,0.029539,0.369159,0.034462,0.45214,0.318689,0.331074,0.14846,0.269078,0.251382,0.118451,0.377307,0.091108,0.034133,0.075118,0.506746,0.003165,0.206684,0.044793,0.08319,0.09418,0.266924,0.066281,0.009827,0.847754,0.105958,0.892206,0.033112,0.00539,0.086987,0.031949,0.032703,0.098398,0.333824,0.392386,0.103797,0.087708,0.122504,0.151745,0.078154,0.605195,0.086245,0.085889,0.273971,0.660524,0.134548,0.004444,0.098884,0.105296,0.069721
BEL,0.135247,0.301472,0.0,0.308502,0.11211,0.099732,0.314791,0.184111,0.340867,0.076483,0.386794,0.079616,0.339413,0.155712,0.173311,0.289468,0.015992,0.281681,0.400741,0.32985,0.370135,0.409266,0.322324,0.333759,0.035435,0.39412,0.252914,0.313632,0.271933,0.067686,0.335934,0.150668,0.017217,0.029601,0.153012,0.032395,0.05009,0.183022,0.075835,0.210364,0.335605,0.376591,0.205274,0.304638,0.094788,0.346266,0.218282,0.395653,0.034548,0.367753,0.291645,0.546282,0.195515,0.590734,0.334585,0.296082,0.388459,0.333421,0.26877,0.203074,0.032352,0.090914,0.405269,0.38918,0.178968,0.149727,0.379627,0.303723,0.215228,0.215583,0.027501,0.359051,0.166924,0.305916,0.400356,0.196177,0.371193
CHN,0.44375,0.00703,0.308502,0.0,0.196392,0.20877,0.006288,0.124391,0.032365,0.23202,0.078291,0.228886,0.030911,0.152791,0.135191,0.019034,0.324494,0.026822,0.092238,0.021348,0.061632,0.100764,0.013821,0.025257,0.343937,0.085618,0.055589,0.00513,0.036569,0.376189,0.027432,0.45917,0.325719,0.338104,0.15549,0.276108,0.258412,0.125481,0.384337,0.098139,0.027103,0.068088,0.513776,0.003865,0.213714,0.037763,0.09022,0.08715,0.273954,0.059251,0.016857,0.854784,0.112988,0.899236,0.026082,0.01242,0.079957,0.024918,0.039733,0.105428,0.340855,0.399416,0.096766,0.080678,0.129534,0.158775,0.071124,0.612225,0.093275,0.092919,0.281001,0.667554,0.141578,0.002586,0.091854,0.112326,0.06269
CZE,0.247357,0.189362,0.11211,0.196392,0.0,0.012378,0.202681,0.072001,0.228757,0.035627,0.274684,0.032494,0.227303,0.043602,0.061201,0.177358,0.128102,0.169571,0.288631,0.21774,0.258025,0.297156,0.210214,0.221649,0.147545,0.28201,0.140804,0.201522,0.159823,0.179796,0.223824,0.262778,0.129327,0.141711,0.040902,0.079715,0.06202,0.070912,0.187945,0.098254,0.223495,0.264481,0.317384,0.192528,0.017322,0.234156,0.106172,0.283543,0.077562,0.255643,0.179535,0.658392,0.083405,0.702844,0.222475,0.183972,0.276349,0.221311,0.15666,0.090964,0.144462,0.203024,0.293159,0.27707,0.066858,0.037617,0.267517,0.415833,0.103118,0.103473,0.084609,0.471161,0.054814,0.193806,0.288246,0.084067,0.259083


**B | Institutional distance | calculation**

Nice explanation of eucledian distance and formulae, [here](https://www.pbarrett.net/techpapers/euclid.pdf)
1. take difference for one dimension / column / matrix
2. square (1)
3. divide (2) by variance (to standardise the data)
4. sum across dimensions
5. **square root the results**



Equation from Wang et al. (2020) (based in KSI)
1. take difference for one variable
2. square (1)
3. divide (2) variance (to standardise the data)
4. sum across dimensions
5. **divide by number of dimensions**

In [100]:
# list of matrices
list_processed_matrix = []
institutional_variables = distance_matrices[0:6]

# we have five spatial weight matrices which need to be treated together
for index,matrix in enumerate(institutional_variables):
    # Step 1 | take matrix which has step 1 done
    
    # Step 2 | square values
    matrix_square = np.square(distance_matrices[0].to_numpy())

    # Step 3a | calculate variance
    matrix_var = distance_matrices[0].to_numpy().var()

    # Step 3b | step 2 divided by var
    matrix_processed = matrix_square / matrix_var
    
    # Step 4a | add all array into a list for easier summing
    list_processed_matrix.append(matrix_processed)

# Step 4b | Sum all
matrix_di = sum(list_processed_matrix)

# Step 5 | Square root
matrix_di = np.sqrt(matrix_di)
# Step 5 | incorrect step(?) from Wang et al (2020) based on the standard practice
#matrix_di = matrix_di / len(institutional_variables) 

# create df
df_di = pd.DataFrame(matrix_di,index = matrix_index,columns=matrix_index)
df_di.head()

Unnamed: 0,ARE,AZE,BEL,CHN,CZE,ESP,GEO,HUN,IRQ,ITA,KGZ,KOR,KSV,LTU,OMN,SRB,SWE,THA,TJK,UKR,VNM,AFG,ALB,ARM,AUT,BGD,BGR,BIH,BLR,BRN,BTN,CHE,DEU,DNK,EST,FIN,FRA,GRC,HKG,HRV,IDN,IND,IRL,IRN,ISR,JOR,KAZ,KHM,KWT,LAO,LBN,LUX,LVA,MAC,MDA,MKD,MMR,MNG,MNE,MYS,NLD,NOR,NPL,PAK,POL,PRT,PSE,QAT,ROU,RUS,SAU,SGP,SVK,TKM,TLS,TUR,UZB
ARE,0.0,3.357364,0.779955,2.381252,1.060467,1.095752,0.812288,1.794522,4.273378,1.815961,3.759542,1.19382,2.64962,0.764718,1.389944,2.495522,1.703775,2.606598,3.871828,3.316606,2.712629,4.506175,2.63379,2.912185,0.629361,3.388405,2.23826,2.719285,2.38628,1.007585,0.075499,1.363927,1.121857,1.768495,0.168078,1.783421,0.38152,2.110731,0.651323,1.61734,2.62387,2.428347,0.687514,3.145061,0.025219,1.513853,3.320976,4.082902,2.404881,3.547666,3.568967,1.545134,1.236867,0.543674,3.546401,2.433394,2.994331,2.770637,2.122648,1.796514,1.238255,1.711697,3.292908,3.419094,0.722515,0.411195,2.206574,0.452495,1.98819,3.330949,1.571288,1.530517,1.574208,4.493272,2.809849,2.27687,3.909161
AZE,3.357364,0.0,4.137319,0.976112,2.296897,2.261612,2.545075,1.562842,0.916014,1.541402,0.402178,2.163544,0.707744,2.592646,1.96742,0.861841,5.061139,0.750766,0.514464,0.040758,0.644734,1.148812,0.723574,0.445178,3.986724,0.031041,1.119104,0.638078,0.971084,2.349778,3.281864,4.721291,4.479221,5.125859,3.525441,5.140785,3.738883,1.246633,4.008687,1.740024,0.733493,0.929017,4.044877,0.212302,3.382583,1.843511,0.036387,0.725538,0.952483,0.190302,0.211604,4.902498,2.120497,2.813689,0.189037,0.92397,0.363033,0.586726,1.234716,1.56085,4.595619,5.06906,0.064456,0.061731,2.634849,2.946168,1.150789,2.904868,1.369173,0.026415,1.786076,4.88788,1.783156,1.135908,0.547514,1.080494,0.551797
BEL,0.779955,4.137319,0.0,3.161207,1.840422,1.875707,1.592243,2.574477,5.053333,2.595916,4.539497,1.973774,3.429575,1.544673,2.169899,3.275477,0.92382,3.386552,4.651783,4.096561,3.492584,5.28613,3.413745,3.69214,0.150594,4.16836,3.018215,3.49924,3.166235,1.78754,0.855454,0.583972,0.341902,0.98854,0.611877,1.003467,0.398435,2.890686,0.128632,2.397295,3.403825,3.208302,0.092441,3.925016,0.754736,2.293808,4.100931,4.862857,3.184836,4.327621,4.348922,0.765179,2.016821,1.323629,4.326356,3.213349,3.774286,3.550592,2.902603,2.576469,0.4583,0.931742,4.072863,4.199049,1.50247,1.19115,2.986529,1.23245,2.768145,4.110904,2.351243,0.750562,2.354163,5.273227,3.589804,3.056824,4.689116
CHN,2.381252,0.976112,3.161207,0.0,1.320785,1.2855,1.568963,0.58673,1.892126,0.56529,1.37829,1.187432,0.268368,1.616534,0.991308,0.114271,4.085027,0.225346,1.490576,0.935354,0.331377,2.124924,0.252538,0.530934,3.010612,1.007153,0.142992,0.338034,0.005028,1.373666,2.305752,3.745179,3.503109,4.149747,2.549329,4.164673,2.762772,0.270521,3.032575,0.763912,0.242619,0.047095,3.068765,0.76381,2.406471,0.867399,0.939725,1.70165,0.023629,1.166414,1.187716,3.926386,1.144385,1.837577,1.165149,0.052142,0.613079,0.389386,0.258604,0.584738,3.619507,4.092948,0.911656,1.037842,1.658737,1.970057,0.174677,1.928756,0.393061,0.949697,0.809964,3.911769,0.807044,2.11202,0.428597,0.104382,1.527909
CZE,1.060467,2.296897,1.840422,1.320785,0.0,0.035285,0.248178,0.734055,3.212911,0.755495,2.699075,0.133353,1.589153,0.295749,0.329477,1.435056,2.764242,1.546131,2.811361,2.256139,1.652162,3.445709,1.573323,1.851718,1.689827,2.327938,1.177793,1.658819,1.325813,0.052881,0.984968,2.424394,2.182324,2.828962,1.228544,2.843888,1.441987,1.050264,1.71179,0.556873,1.563404,1.36788,1.74798,2.084594,1.085686,0.453386,2.26051,3.022435,1.344414,2.487199,2.508501,2.605601,0.1764,0.516793,2.485934,1.372927,1.933864,1.71017,1.062181,0.736047,2.298722,2.772164,2.232441,2.358627,0.337952,0.649272,1.146108,0.607971,0.927724,2.270482,0.510821,2.590984,0.513741,3.432805,1.749382,1.216403,2.848694


In [102]:
# rescale (0-1)
scaler=preprocessing.MinMaxScaler() # define scaler
# fit data on reshaped one dimensional array (this preserves matching distances between pairs)
print(scaler.fit(matrix_di.reshape((5929,1)))) 
matrix_di_minmax = scaler.transform(matrix_di.reshape((5929,1))) # save as array

# back to data frame
df_di_minmax = pd.DataFrame(matrix_di_minmax.reshape((77,77)),index = matrix_index,columns =matrix_index)
df_di_minmax.head()

MinMaxScaler(copy=True, feature_range=(0, 1))


Unnamed: 0,ARE,AZE,BEL,CHN,CZE,ESP,GEO,HUN,IRQ,ITA,KGZ,KOR,KSV,LTU,OMN,SRB,SWE,THA,TJK,UKR,VNM,AFG,ALB,ARM,AUT,BGD,BGR,BIH,BLR,BRN,BTN,CHE,DEU,DNK,EST,FIN,FRA,GRC,HKG,HRV,IDN,IND,IRL,IRN,ISR,JOR,KAZ,KHM,KWT,LAO,LBN,LUX,LVA,MAC,MDA,MKD,MMR,MNG,MNE,MYS,NLD,NOR,NPL,PAK,POL,PRT,PSE,QAT,ROU,RUS,SAU,SGP,SVK,TKM,TLS,TUR,UZB
ARE,0.0,0.533796,0.124007,0.378602,0.168606,0.174217,0.129148,0.285316,0.679436,0.288725,0.59774,0.189809,0.42127,0.121585,0.220991,0.39677,0.270888,0.41443,0.615592,0.527316,0.431288,0.716449,0.418753,0.463016,0.100064,0.538732,0.355867,0.432347,0.379401,0.160199,0.012004,0.216854,0.178367,0.281178,0.026723,0.283551,0.060659,0.335591,0.103556,0.257145,0.417176,0.386089,0.10931,0.500042,0.00401,0.240692,0.528011,0.649152,0.382359,0.564053,0.56744,0.245665,0.196653,0.08644,0.563852,0.386892,0.476077,0.440511,0.337486,0.285633,0.196874,0.272147,0.523548,0.543611,0.114875,0.065377,0.350829,0.071943,0.316108,0.529597,0.249823,0.243341,0.250288,0.714397,0.446746,0.362006,0.621528
AZE,0.533796,0.0,0.657803,0.155195,0.36519,0.35958,0.404648,0.24848,0.14564,0.245072,0.063943,0.343988,0.112526,0.412212,0.312805,0.137026,0.804684,0.119366,0.081796,0.00648,0.102508,0.182653,0.115043,0.07078,0.63386,0.004935,0.177929,0.10145,0.154395,0.373598,0.521792,0.750651,0.712163,0.814974,0.560519,0.817347,0.594455,0.198206,0.637352,0.276651,0.11662,0.147707,0.643106,0.033755,0.537806,0.293105,0.005785,0.115355,0.151438,0.030257,0.033643,0.779461,0.337144,0.447356,0.030056,0.146904,0.05772,0.093285,0.196311,0.248164,0.73067,0.805944,0.010248,0.009815,0.418922,0.468419,0.182967,0.461853,0.217689,0.0042,0.283973,0.777137,0.283509,0.180601,0.087051,0.171791,0.087732
BEL,0.124007,0.657803,0.0,0.502609,0.292614,0.298224,0.253155,0.409323,0.803443,0.412732,0.721747,0.313816,0.545277,0.245592,0.344998,0.520777,0.146881,0.538437,0.7396,0.651323,0.555295,0.840456,0.542761,0.587023,0.023943,0.662739,0.479874,0.556354,0.503408,0.284206,0.136011,0.092847,0.05436,0.157171,0.097284,0.159544,0.063348,0.459598,0.020452,0.381152,0.541183,0.510097,0.014697,0.624049,0.119997,0.364699,0.652018,0.773159,0.506366,0.68806,0.691447,0.121658,0.32066,0.210447,0.687859,0.510899,0.600084,0.564518,0.461493,0.40964,0.072866,0.14814,0.647555,0.667618,0.238882,0.189384,0.474836,0.195951,0.440115,0.653604,0.37383,0.119334,0.374295,0.838405,0.570753,0.486013,0.745535
CHN,0.378602,0.155195,0.502609,0.0,0.209995,0.204385,0.249454,0.093286,0.300834,0.089877,0.219138,0.188793,0.042669,0.257017,0.157611,0.018168,0.649489,0.035828,0.236991,0.148714,0.052687,0.337847,0.040152,0.084415,0.478665,0.16013,0.022735,0.053745,0.000799,0.218403,0.366598,0.595456,0.556969,0.659779,0.405325,0.662153,0.439261,0.043011,0.482157,0.121456,0.038575,0.007488,0.487911,0.12144,0.382611,0.13791,0.149409,0.27055,0.003757,0.185451,0.188838,0.624267,0.181949,0.292161,0.18525,0.00829,0.097475,0.061909,0.041116,0.092969,0.575475,0.650749,0.144947,0.165009,0.263727,0.313225,0.027772,0.306658,0.062494,0.150995,0.128778,0.621943,0.128314,0.335796,0.068144,0.016596,0.242926
CZE,0.168606,0.36519,0.292614,0.209995,0.0,0.00561,0.039459,0.116709,0.510829,0.120118,0.429133,0.021202,0.252664,0.047022,0.052384,0.228163,0.439494,0.245823,0.446986,0.35871,0.262682,0.547843,0.250147,0.29441,0.26867,0.370125,0.18726,0.26374,0.210795,0.008408,0.156603,0.385461,0.346974,0.449784,0.19533,0.452157,0.229265,0.166984,0.272162,0.088539,0.24857,0.217483,0.277916,0.331435,0.172616,0.072085,0.359405,0.480545,0.213752,0.395447,0.398833,0.414272,0.028046,0.082166,0.395245,0.218285,0.30747,0.271905,0.168879,0.117026,0.36548,0.440754,0.354942,0.375005,0.053732,0.103229,0.182223,0.096663,0.147501,0.36099,0.081217,0.411947,0.081681,0.545791,0.278139,0.193399,0.452922


**C | Rescaled distances | calculation**

<a id="ID_part4"></a>
### Part 4 | Export
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

In [110]:
# Economic distance matrix
file_name = f"de_matrix"
s_file_export.f_df_export(df_de.reset_index(),file_name,p_file_id=f"{notebook_id}_")
s_file_export.f_df_export(df_de_minmax.reset_index(),"de_min_max",p_file_id=f"{notebook_id}_")

Export | ../Data/1_raw_processed_backup/121_store_de_matrix_20200809_2238.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/121_input_de_matrix.csv.gzip | COMPLETE
Export | ../Data/1_raw_processed_backup/121_store_de_min_max_20200809_2238.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/121_input_de_min_max.csv.gzip | COMPLETE


In [111]:
# Economic distance matrix
file_name = f"di_matrix"
s_file_export.f_df_export(df_di.reset_index(),file_name,p_file_id=f"{notebook_id}_")
s_file_export.f_df_export(df_di_minmax.reset_index(),"di_min_max",p_file_id=f"{notebook_id}_")

Export | ../Data/1_raw_processed_backup/121_store_di_matrix_20200809_2238.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/121_input_di_matrix.csv.gzip | COMPLETE
Export | ../Data/1_raw_processed_backup/121_store_di_min_max_20200809_2238.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/121_input_di_min_max.csv.gzip | COMPLETE


<a id="ID_part5"></a>
### Part 5
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||