In [1]:
# Notebook identifier; reused in the export cell as the prefix of the exported file names
notebook_id = "125"

<a id="ID_top"></a>
## Merge scope data with distance calculations and filter away any empty datapoints

This workflow takes inputs 120 and 121 (all) and merges them together. After that, any entries / countries for which a distance could not be calculated are filtered out.

This workflow returns dataset 125, which is ready for country-level network analysis.

#### Notebook sections:
    
|| [0| Default imports](#ID_top) || [1|Part1 Load and merge](#ID_part1) || [2|Part2 Final scope filter](#ID_part2) || [3|Part3 Export](#ID_part3) || 

#### Import all packages that could be required

In [2]:
# %load s_package_import.py
# package library, use to ensure consistency across notebooks, refresh periodically
# general packages
import os # use with os.listdir(_path_)
import requests
import csv
import time
from datetime import datetime
from shutil import copyfile

#temp check
#from shutil import make_archive
import zipfile #notebook

# data analysis packages
import pandas as pd
pd.options.display.max_columns = None # don't truncate columns
#pd.options.display.max_rows = None

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes
import pycountry

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== network analysis
import networkx as nx
#=== gravity modelling
import gme as gme

#=== distance datasets
import wbdata


  import pandas.util.testing as tm


#### Import module and declare path variables
`import s_filepaths`

In [3]:
# Pull in the shared file-path reference module
import s_filepaths

# Bind the directory paths it defines to local names
path_raw, path_raw_dl, path_store, path_live = (
    s_filepaths.path_raw,
    s_filepaths.path_raw_dl,
    s_filepaths.path_store,
    s_filepaths.path_live,
)

<a id="ID_part1"></a>
### Part 1 | Load and Merge
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [4]:
# Show which files are currently available in the live directory
live_files = os.listdir(f"{path_live}")
print(live_files)

['121_input_di_matrix.csv.gzip', '.DS_Store', '112_input_gme_data.csv.gzip', '120_input_scope_data.csv.gzip', '2_raw_explainer_doc.md', '121_input_de_matrix.csv.gzip', '113_input_bri_members.csv.gzip']


In [5]:
# import scope data (120)
scope_file = f"{path_live}120_input_scope_data.csv.gzip"
df_scope_raw = pd.read_csv(scope_file, compression="gzip")

# preview the first rows
df_scope_raw.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0


In [6]:
# import distance matrices (121): df_de feeds economic_distance, df_di feeds institutional_distance
# both files are gzip-compressed and use their "index" column as the row index
matrix_read_options = {"compression": "gzip", "index_col": "index"}

df_de = pd.read_csv(f"{path_live}121_input_de_matrix.csv.gzip", **matrix_read_options)
df_di = pd.read_csv(f"{path_live}121_input_di_matrix.csv.gzip", **matrix_read_options)

In [7]:
# Look up the economic distance for every (iso3_o, iso3_d) pair in the scope data.
# The matrix is addressed as df_de.loc[origin, destination]; a pair whose origin or
# destination is missing from the matrix gets NaN so it can be filtered out later.
paired_values = []

# Iterate over the two code columns directly instead of indexing rows with .iloc,
# so the lookup does not depend on df_scope_raw having a default 0..n-1 index.
for iso3_o, iso3_d in zip(df_scope_raw["iso3_o"], df_scope_raw["iso3_d"]):
    try:
        paired_values.append(df_de.loc[iso3_o, iso3_d])
    except KeyError:
        # only a missing row/column label is expected here; any other error should surface
        paired_values.append(np.nan)

df_scope_raw["economic_distance"] = paired_values

In [8]:
# Look up the institutional distance for every (iso3_o, iso3_d) pair in the scope data.
# The matrix is addressed as df_di.loc[origin, destination]; a pair whose origin or
# destination is missing from the matrix gets NaN.
paired_values = []

# Iterate over the two code columns directly instead of indexing rows with .iloc,
# so the lookup does not depend on df_scope_raw having a default 0..n-1 index.
for iso3_o, iso3_d in zip(df_scope_raw["iso3_o"], df_scope_raw["iso3_d"]):
    try:
        paired_values.append(df_di.loc[iso3_o, iso3_d])
    except KeyError:
        # only a missing row/column label is expected here; any other error should surface
        paired_values.append(np.nan)

df_scope_raw["institutional_distance"] = paired_values

In [9]:
# Inspect the scope table now that economic_distance and institutional_distance have been added
df_scope_raw

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,institutional_distance
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0,1.047448e+09,0.028180
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0,9.189585e+08,0.151204
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.874400,0,9693.0625,,,2.019057e+09,3.941229
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.002470,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0,1.724850e+07,0.000004
4,2016,Czech Republic,CZE,CZE,1,0,europe,,,49.817062,15.696862,Holy See,VAT,VAT,1,0,europe,,41.900013,12.447808,0,925.0072,2015.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7651,2016,Yemen,YEM,YEM.X,0,0,middle_east,,,14.599034,45.970097,Tajikistan,TJK,TJK,1,0,central_asia,,38.893276,69.511780,0,3639.5378,2017.0,1.0,,
7652,2016,Yemen,YEM,YEM.X,0,0,middle_east,,,14.599034,45.970097,Singapore,SGP,SGP,0,0,south_east_asia,,1.293033,103.855820,0,6711.3677,2017.0,1.0,,
7653,2016,Yemen,YEM,YEM.X,0,0,middle_east,,,14.599034,45.970097,Oman,OMN,OMN,0,0,middle_east,,21.833725,57.275669,1,1684.7964,2017.0,1.0,,
7654,2016,Yemen,YEM,YEM.X,0,0,middle_east,,,14.599034,45.970097,East Timor,TLS,TLS,0,0,south_east_asia,,-8.559388,125.579450,0,9330.5391,2017.0,1.0,,


<a id="ID_part2"></a>
### Part 2 | Final scope filter
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [24]:
# Keep only the rows for which an economic distance was found.
# NOTE(review): institutional_distance is not checked here, so rows missing only that
# value are retained - confirm this is intended.
has_economic_distance = df_scope_raw["economic_distance"].notna()
df_scope = df_scope_raw[has_economic_distance].copy()

In [25]:
# Inspect the filtered table (rows without an economic_distance value have been removed)
df_scope

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d,economic_distance,institutional_distance
0,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.42140,2017.0,1.0,1.047448e+09,0.028180
1,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.60860,2015.0,1.0,9.189585e+08,0.151204
2,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.874400,0,9693.06250,,,2.019057e+09,3.941229
3,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.002470,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.33640,2013.0,1.0,1.724850e+07,0.000004
5,2016,Spain,ESP,ESP,0,0,europe,,,39.231297,-4.263840,Belgium,BEL,BEL,0,0,europe,,50.691814,4.581812,0,1374.93510,,,1.282899e+08,0.586379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7562,2016,Vietnam,VNM,VNM.X,0,0,south_east_asia,,,15.463247,106.562180,Turkey,TUR,TUR,0,0,eurasia,,39.124500,34.841019,0,7749.51270,2017.0,1.0,3.903100e+08,0.031648
7563,2016,Vietnam,VNM,VNM.X,0,0,south_east_asia,,,15.463247,106.562180,Laos,LAO,LAO,1,0,south_east_asia,,18.160072,103.741520,1,777.79956,2017.0,1.0,7.315145e+04,0.116214
7564,2016,Vietnam,VNM,VNM.X,0,0,south_east_asia,,,15.463247,106.562180,Finland,FIN,FIN,0,0,europe,,62.738297,25.585251,0,8296.92380,2017.0,1.0,1.471245e+09,3.369079
7565,2016,Vietnam,VNM,VNM.X,0,0,south_east_asia,,,15.463247,106.562180,Serbia,SRB,SRB.X,1,0,europe,,44.679684,20.506792,0,8689.12110,2017.0,1.0,8.392826e+07,0.007856


***

<a id="ID_part3"></a>
### Part 3 | Export
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) ||

In [29]:
# Export the filtered scope table for visualisation.
# The notebook id plus an underscore is passed as the file id; per the cell output this
# ends up as the "125_" prefix of both the backup copy and the gzip input copy.
file_name = "scope_country"  # plain string: there are no placeholders to format
file_id = f"{notebook_id}_"
s_file_export.f_df_export(df_scope, file_name, p_file_id=file_id)

Export | ../Data/1_raw_processed_backup/125_store_scope_country_20200808_2121.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/125_input_scope_country.csv.gzip | COMPLETE
