In [1]:
# Workflow identifier; the export cell at the end of the notebook prefixes it to the exported file name
notebook_id = "125"

<a id="ID_top"></a>
## Merge scope data with distance calculations and filter away any empty datapoints

This workflow takes inputs 120 (scope data), 121 (all distance files) and 122 (UN Comtrade trade flows) and merges them together. After that, entries / countries for which the distance was not calculated are filtered out.

This workflow returns dataset 125, which is ready for country-level network analysis.

#### Notebook sections:
    
|| [0| Default imports](#ID_top) || [1|Part1 Load and merge](#ID_part1) || [2|Part2 Final scope filter](#ID_part2) || [3|Part3 Export](#ID_part3) || 

#### Import all packages that could be required

In [2]:
# %load s_package_import.py
# package library, use to ensure consistency across notebooks, refresh periodically
# general packages
import os # use with os.listdir(_path_)
import requests
import csv
import time
from datetime import datetime
from shutil import copyfile

#temp check
#from shutil import make_archive
import zipfile #notebook

# data analysis packages
import pandas as pd
pd.options.display.max_columns = None # don't truncate columns
#pd.options.display.max_rows = None

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes
import pycountry

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== network analysis
import networkx as nx
#=== gavity modelling
import gme as gme

#=== distance datasets
import wbdata


  import pandas.util.testing as tm


#### Import module and declare path variables
`import s_filepaths.py`

In [3]:
# Path configuration lives in the project-local s_filepaths module
import s_filepaths

# Bind the shared directory locations to local names for use in this notebook
path_raw, path_raw_dl, path_store, path_live = (
    s_filepaths.path_raw,
    s_filepaths.path_raw_dl,
    s_filepaths.path_store,
    s_filepaths.path_live,
)

<a id="ID_part1"></a>
### Part 1 | Load and Merge
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [4]:
# List the contents of the live input directory in alphabetical order
sorted(os.listdir(path_live))

['.DS_Store',
 '112_input_gme_data.csv.gzip',
 '113_input_bri_members.csv.gzip',
 '120_input_scope_data.csv.gzip',
 '121_input_de_matrix.csv.gzip',
 '121_input_de_min_max.csv.gzip',
 '121_input_di_matrix.csv.gzip',
 '121_input_di_min_max.csv.gzip',
 '122_input_un_com_2016_mini.csv.gzip',
 '125_input_scope_country.csv.gzip',
 '2_raw_explainer_doc.md']

**Import 120 | Scope data**

In [14]:
# Column names of the scope dataset (120).
# Only the header row is parsed (nrows=0) so that this cell runs on a fresh kernel:
# df_scope_raw is not created until the load cell further down the notebook.
pd.read_csv(f"{path_live}120_input_scope_data.csv.gzip", compression="gzip", nrows=0).columns

Index(['year', 'country_d', 'iso3_d', 'dynamic_code_d', 'landlocked_d',
       'island_d', 'region_d', 'gdp_pwt_const_d', 'pop_d', 'lat_d', 'lng_d',
       'country_o', 'iso3_o', 'dynamic_code_o', 'landlocked_o', 'island_o',
       'region_o', 'pop_o', 'lat_o', 'lng_o', 'contiguity', 'distance',
       'bri_year_d', 'bri_flag_d'],
      dtype='object')

In [5]:
# import scope data (120) from the gzip-compressed CSV in the live directory
scope_data_file = f"{path_live}120_input_scope_data.csv.gzip"
df_scope_raw = pd.read_csv(scope_data_file, compression="gzip")

df_scope_raw.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0


**Import 121 | Distance data**

In [6]:
# import distance measures (121)
# "de" tables feed the economic distance columns and "di" tables the institutional ones;
# every file is read with its "index" column as the row index.
distance_read_options = {"compression": "gzip", "index_col": "index"}

df_de = pd.read_csv(f"{path_live}121_input_de_matrix.csv.gzip", **distance_read_options)
df_de_minmax = pd.read_csv(f"{path_live}121_input_de_min_max.csv.gzip", **distance_read_options)

df_di = pd.read_csv(f"{path_live}121_input_di_matrix.csv.gzip", **distance_read_options)
df_di_minmax = pd.read_csv(f"{path_live}121_input_di_min_max.csv.gzip", **distance_read_options)

In [7]:
# Distance tables and the name of the column each one is written to in the scope frame.
# The two lists are paired by position, so their order must stay in sync.
distance_array_list = [
    df_de,
    df_de_minmax,
    df_di,
    df_di_minmax,
]
distante_array_columns = [
    "economic_distance",
    "de_min_max",
    "institutional_distance",
    "di_min_max",
]

In [8]:
def f_array_to_column(p_df,p_df_array,p_col_name):
    """Add a column of pairwise values looked up from a matrix.

    For every row of ``p_df`` the pair (``iso3_o``, ``iso3_d``) is looked up in
    ``p_df_array`` as (row label, column label) and the value found is stored
    in a new column.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Frame holding the ``iso3_o`` and ``iso3_d`` columns. It is not modified.
    p_df_array : pandas.DataFrame
        Lookup matrix; its row labels are matched against ``iso3_o`` and its
        column labels against ``iso3_d``.
    p_col_name : str
        Name of the column that receives the looked-up values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``p_df`` with ``p_col_name`` added. Pairs that cannot be found
        in ``p_df_array`` get ``NaN``.
    """
    f_df = p_df.copy()

    paired_values = []
    # Walk the two code columns directly instead of indexing rows with
    # ``iloc[label]``: index labels are not positions, so the lookup stays
    # correct when the frame's index is not 0..n-1 (e.g. after filtering).
    for iso3_o, iso3_d in zip(f_df["iso3_o"], f_df["iso3_d"]):
        try:
            paired_values.append(p_df_array.loc[iso3_o, iso3_d])
        except (KeyError, TypeError):
            # pair not present in the matrix (unknown or missing code)
            paired_values.append(np.nan)

    f_df[p_col_name] = paired_values

    return f_df

In [9]:
# Start from the raw scope frame and append one distance column per table
df_scope_distance = df_scope_raw.copy()

for column_name, distance_table in zip(distante_array_columns, distance_array_list):
    df_scope_distance = f_array_to_column(df_scope_distance, distance_table, column_name)

df_scope_distance.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,de_min_max,institutional_distance,di_min_max
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0,32364.30756,0.284975,0.411195,0.065377
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0,30314.327882,0.266924,0.952483,0.151438
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,,44933.919861,0.395653,4.862857,0.773159
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0,4153.131454,0.036569,0.005028,0.000799
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0,,,,


**Import 122 | Flow data**

In [10]:
# Load mini comtrade dataset (trimmed down in workflow 122)
df_un_com = pd.read_csv(f"{path_live}122_input_un_com_2016_mini.csv.gzip", compression="gzip")

# Split by trade direction and name the value column after the flow it holds
is_export = df_un_com["rgDesc"] == "Export"
is_import = df_un_com["rgDesc"] == "Import"

df_un_com_export = df_un_com.loc[is_export].rename(columns={"TradeValue": "Export_value"})
df_un_com_import = df_un_com.loc[is_import].rename(columns={"TradeValue": "Import_value"})

df_un_com_export.head()

Unnamed: 0,rtCode,rt3ISO,ptCode,pt3ISO,rgDesc,yr,Export_value
1,784,ARE,0,WLD,Export,2016,295046691148
4,784,ARE,4,AFG,Export,2016,1596610735
7,784,ARE,8,ALB,Export,2016,11867263
10,784,ARE,12,DZA,Export,2016,374836964
13,784,ARE,20,AND,Export,2016,89855


In [11]:
# Keys identifying a country pair in the scope frame: (year, destination, origin)
scope_pair_keys = ["year", "iso3_d", "iso3_o"]

# merge imports
# The order of the reporter (rt) and partner (pt) columns in the right-hand key sets the
# direction of the flow: for imports, the reporter is the destination and the partner is the origin.
df_scope_raw_im = (
    df_scope_distance
    .merge(df_un_com_import, how="left", left_on=scope_pair_keys, right_on=["yr", "rt3ISO", "pt3ISO"])
    # drop the comtrade key / descriptor columns brought in by the merge
    .drop(columns=["yr", "rgDesc", "pt3ISO", "rt3ISO", "ptCode", "rtCode"])
)

# merge exports
# Same scope keys, but matched against (partner, reporter): for exports, the reporter is the
# origin and the partner is the destination.
df_scope_raw_im_ex = (
    df_scope_raw_im
    .merge(df_un_com_export, how="left", left_on=scope_pair_keys, right_on=["yr", "pt3ISO", "rt3ISO"])
    # only yr and rgDesc are dropped here; rtCode, rt3ISO, ptCode and pt3ISO stay in the result
    .drop(columns=["yr", "rgDesc"])
)

In [12]:
# Preview the merged frame: scope data plus the distance columns and the import / export values
df_scope_raw_im_ex.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,de_min_max,institutional_distance,di_min_max,Import_value,rtCode,rt3ISO,ptCode,pt3ISO,Export_value
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0,32364.30756,0.284975,0.411195,0.065377,218350994.0,620.0,PRT,784.0,ARE,179403385.0
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0,30314.327882,0.266924,0.952483,0.151438,3530921.0,414.0,KWT,31.0,AZE,1235241.0
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,,44933.919861,0.395653,4.862857,0.773159,367037492.0,116.0,KHM,56.0,BEL,396804738.0
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0,4153.131454,0.036569,0.005028,0.000799,435188908.0,112.0,BLR,156.0,CHN,400511800.0
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0,,,,,6045.0,,,,,


<a id="ID_part2"></a>
### Part 2 | Final scope filter
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [16]:
# Keep only the entries that have an economic distance value.
# NOTE(review): only economic_distance is tested; a row missing just the institutional
# distance would be kept — confirm this is intended.
has_economic_distance = df_scope_raw_im_ex["economic_distance"].notna()
df_scope = df_scope_raw_im_ex.loc[has_economic_distance].copy()

In [32]:
# Distinct destination countries currently in scope
destination_codes = df_scope["iso3_d"].unique()
print(f"Country count | {len(destination_codes)}")
destination_codes

Country count | 75


array(['ARE', 'AZE', 'BEL', 'CHN', 'ESP', 'GEO', 'HUN', 'IRQ', 'KGZ',
       'LTU', 'OMN', 'SRB', 'SWE', 'TJK', 'UKR', 'VNM', 'AFG', 'ALB',
       'ARM', 'AUT', 'BGD', 'BGR', 'BIH', 'BLR', 'BRN', 'BTN', 'CHE',
       'CZE', 'DEU', 'DNK', 'EST', 'FIN', 'FRA', 'GRC', 'HKG', 'HRV',
       'IDN', 'IND', 'IRN', 'ISR', 'ITA', 'JOR', 'KAZ', 'KHM', 'KSV',
       'KWT', 'LAO', 'LBN', 'LUX', 'LVA', 'MAC', 'MDA', 'MKD', 'MMR',
       'MNG', 'MNE', 'MYS', 'NLD', 'NOR', 'NPL', 'PAK', 'POL', 'PRT',
       'PSE', 'QAT', 'ROU', 'RUS', 'SAU', 'SGP', 'SVK', 'THA', 'TKM',
       'TLS', 'TUR', 'UZB'], dtype=object)

In [18]:
# Remove islands / areas we know will not connect: Ireland.
# Every pair in which Ireland is the destination or the origin is dropped.
involves_ireland = (df_scope["country_d"] == "Ireland") | (df_scope["country_o"] == "Ireland")
ireland_index = df_scope[involves_ireland].index
df_scope = df_scope.drop(index=ireland_index)

In [19]:
# Remove islands / areas we know will not connect: Korea (KOR).
# Every pair in which KOR is the destination or the origin is dropped.
involves_korea = (df_scope["iso3_d"] == "KOR") | (df_scope["iso3_o"] == "KOR")
korea_index = df_scope[involves_korea].index
df_scope = df_scope.drop(index=korea_index)

In [29]:
# amend russia-poland contiguity: set the flag to 0 for the pair in both directions
is_rus_pol_pair = (
    ((df_scope["iso3_d"] == "RUS") & (df_scope["iso3_o"] == "POL"))
    | ((df_scope["iso3_o"] == "RUS") & (df_scope["iso3_d"] == "POL"))
)
rus_pol_index = df_scope[is_rus_pol_pair].index

df_scope.loc[rus_pol_index, "contiguity"] = 0

In [35]:
# amend russia-lithuania contiguity: set the flag to 0 for the pair in both directions
# (the index variable is named after this pair rather than reusing the russia-poland name)
is_rus_ltu_pair = (
    ((df_scope["iso3_d"] == "RUS") & (df_scope["iso3_o"] == "LTU"))
    | ((df_scope["iso3_o"] == "RUS") & (df_scope["iso3_d"] == "LTU"))
)
rus_ltu_index = df_scope[is_rus_ltu_pair].index

df_scope.loc[rus_ltu_index, "contiguity"] = 0

# show the amended rows; the pair mask is reused instead of being rebuilt
df_scope[is_rus_ltu_pair]

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,de_min_max,institutional_distance,di_min_max,Import_value,rtCode,rt3ISO,ptCode,pt3ISO,Export_value
4240,2016,Lithuania,LTU,LTU,0,0,europe,,,55.406567,23.602331,Russia,RUS,RUS,0,0,eurasia,,56.643204,73.978935,0,2021.7047,2017.0,1.0,6799.53208,0.059871,2.566231,0.408012,3793243000.0,643.0,RUS,440.0,LTU,2554986000.0
6096,2016,Russia,RUS,RUS,0,0,eurasia,,,56.643204,73.978935,Lithuania,LTU,LTU,0,0,europe,,55.406567,23.602331,0,2021.7047,,1.0,6799.53208,0.059871,2.566231,0.408012,467472500.0,440.0,LTU,643.0,RUS,3371119000.0


In [36]:
# amend russia-china contiguity: set the flag to 0 for the pair in both directions
# (the index variable is named after this pair rather than reusing the russia-poland name)
is_rus_chn_pair = (
    ((df_scope["iso3_d"] == "RUS") & (df_scope["iso3_o"] == "CHN"))
    | ((df_scope["iso3_o"] == "RUS") & (df_scope["iso3_d"] == "CHN"))
)
rus_chn_index = df_scope[is_rus_chn_pair].index

df_scope.loc[rus_chn_index, "contiguity"] = 0

# show the amended rows; the pair mask is reused instead of being rebuilt
df_scope[is_rus_chn_pair]

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,de_min_max,institutional_distance,di_min_max,Import_value,rtCode,rt3ISO,ptCode,pt3ISO,Export_value
1397,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Russia,RUS,RUS,0,0,eurasia,,56.643204,73.978935,0,5356.7427,2013.0,1.0,10552.778138,0.092919,0.949697,0.150995,32260150000.0,643.0,RUS,156.0,CHN,29953380000.0
6142,2016,Russia,RUS,RUS,0,0,eurasia,,,56.643204,73.978935,China,CHN,CHN,0,0,east_asia,,35.389668,114.00247,0,5356.7427,,1.0,10552.778138,0.092919,0.949697,0.150995,46332240000.0,156.0,CHN,643.0,RUS,37339600000.0


In [37]:
# Re-check the destination country count after the scope amendments, then preview the frame
destination_count = len(df_scope["iso3_d"].unique())
print(f"Country count | {destination_count}")
df_scope.head()

Country count | 75


Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,de_min_max,institutional_distance,di_min_max,Import_value,rtCode,rt3ISO,ptCode,pt3ISO,Export_value
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0,32364.30756,0.284975,0.411195,0.065377,218351000.0,620.0,PRT,784.0,ARE,179403400.0
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0,30314.327882,0.266924,0.952483,0.151438,3530921.0,414.0,KWT,31.0,AZE,1235241.0
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,,44933.919861,0.395653,4.862857,0.773159,367037500.0,116.0,KHM,56.0,BEL,396804700.0
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0,4153.131454,0.036569,0.005028,0.000799,435188900.0,112.0,BLR,156.0,CHN,400511800.0
5,2016,Spain,ESP,ESP,0,0,europe,,,39.231297,-4.26384,Belgium,BEL,BEL,0,0,europe,,50.691814,4.581812,0,1374.9351,,,11326.515268,0.099732,1.875707,0.298224,7757618000.0,56.0,BEL,724.0,ESP,10493980000.0


***

<a id="ID_part3"></a>
### Part 3 | Export
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [38]:
# Export the filtered scope frame via the project export helper;
# the notebook id plus "_" is passed as the file id
file_name = "scope_country"
s_file_export.f_df_export(df_scope, file_name, p_file_id=notebook_id + "_")

Export | ../Data/1_raw_processed_backup/125_store_scope_country_20200812_0343.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/125_input_scope_country.csv.gzip | COMPLETE
