In [87]:
notebook_id = "120"

<a id="ID_top"></a>
## Merge data baseline

This notebook merges data from Dynamic Gravity Dataset (gme 112), with the BRI membership dataset (113).

Work pending: add flow data from UNCOMTRADE

#### Notebook sections:
    
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

#### Import all packages that could be required

In [5]:
# %load s_package_import.py
# package library, use to ensure consistency across notebooks, refresh periodically
# general packages
import os # use with os.listdir(_path_)
import requests
import csv
import time
from datetime import datetime
from shutil import copyfile

#temp check
#from shutil import make_archive
#import zipfile

# data analysis packages
import pandas as pd
pd.options.display.max_columns = None # don't truncate columns
#pd.options.display.max_rows = None

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== network analysis
import networkx as nx
#=== gavity modelling
import gme as gme


#### Import module and declare path variables
`import s_filepaths.py`

In [6]:
# import ref file
import s_filepaths

# declare local variables to work with
path_raw = s_filepaths.path_raw
path_raw_dl = s_filepaths.path_raw_dl
path_store = s_filepaths.path_store
path_live = s_filepaths.path_live

<a id="ID_part1"></a>
### Part 1 | Load and merge datasets
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

In [4]:
# View all files in live directory
print(os.listdir(f"{path_live}"))

['.DS_Store', '112_input_gme_data.csv.gzip', '2_raw_explainer_doc.md', '113_input_bri_members.csv.gzip']


**GME dataset**

In [22]:
df_gme = pd.read_csv(f"{path_live}112_input_gme_data.csv.gzip",compression="gzip")
# preview gme data
df_gme.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,gdp_pwt_cur_d,capital_cur_d,capital_const_d,gdp_wdi_cur_d,gdp_wdi_const_d,gdp_wdi_cap_cur_d,gdp_wdi_cap_const_d,lat_d,lng_d,polity_d,polity_abs_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,gdp_pwt_const_o,pop_o,gdp_pwt_cur_o,capital_cur_o,capital_const_o,gdp_wdi_cur_o,gdp_wdi_const_o,gdp_wdi_cap_cur_o,gdp_wdi_cap_const_o,lat_o,lng_o,polity_o,polity_abs_o,contiguity,agree_pta_goods,agree_pta_services,agree_cu,agree_eia,agree_fta,agree_psa,agree_pta,sanction_threat,sanction_threat_trade,sanction_imposition,sanction_imposition_trade,member_eu_o,member_wto_o,member_gatt_o,member_eu_d,member_wto_d,member_gatt_d,member_eu_joint,member_wto_joint,member_gatt_joint,hostility_level_o,hostility_level_d,distance,common_language,colony_of_destination_after45,colony_of_destination_current,colony_of_destination_ever,colony_of_origin_after45,colony_of_origin_current,colony_of_origin_ever
0,2005,Aruba,ABW,ABW,0,1,caribbean,3906.5203,0.100031,4093.2434,23531.377,24173.982,2331006000.0,,23302.831988,,12.530384,-70.028992,,,Netherlands Antilles,ANT,ANT.X,0,0,caribbean,,,,,,,,,,12.250778,-69.301224,,,0,1,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,120.05867,1,0,0,0,0,0,0
1,2006,Aruba,ABW,ABW,0,1,caribbean,4118.1396,0.10083,4217.0669,25757.818,25396.307,2421475000.0,,24015.420612,,12.530384,-70.028992,,,Anguilla,AIA,AIA,0,1,caribbean,348.7688,0.012903,365.93643,2471.682,2342.796,,,,,18.217348,-63.057232,,,0,1,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,978.77728,1,0,0,0,0,0,0
2,2007,Aruba,ABW,ABW,0,1,caribbean,4196.4634,0.101218,4248.4707,27375.447,26631.465,2623726000.0,,25921.538234,,12.530384,-70.028992,,,Sao Tome and Principe,STP,STP,0,1,africa,391.01483,0.160064,392.44177,1101.736,3205.526,145827400.0,167044600.0,911.057012,1043.611485,0.989202,7.072665,,,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,8563.6963,0,0,0,0,0,0,0
3,2008,Aruba,ABW,ABW,0,1,caribbean,4433.6772,0.101342,4441.8828,28639.586,27871.596,2791961000.0,,27549.889422,,12.530384,-70.028992,,,Andorra,AND,AND,1,0,europe,,,,,,4001201000.0,3675947000.0,46734.268282,42935.277871,42.5,1.516486,,,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,7562.6733,0,0,0,0,0,0,0
4,2009,Aruba,ABW,ABW,0,1,caribbean,4183.0449,0.101416,4304.9224,29400.539,29122.635,2498933000.0,,24640.421244,,12.530384,-70.028992,,,Philippines,PHL,PHL,0,1,south_east_asia,458079.81,91.641881,460142.72,1420047.0,1624159.0,168334600000.0,185437700000.0,1836.87412,2023.503659,11.817977,122.77502,8.0,8.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,1,1,0,0,0,0,0,0,0,0,16904.596,1,0,0,0,0,0,0


In [23]:
# drop columns that we do not need
df_gme.columns

Index(['year', 'country_d', 'iso3_d', 'dynamic_code_d', 'landlocked_d',
       'island_d', 'region_d', 'gdp_pwt_const_d', 'pop_d', 'gdp_pwt_cur_d',
       'capital_cur_d', 'capital_const_d', 'gdp_wdi_cur_d', 'gdp_wdi_const_d',
       'gdp_wdi_cap_cur_d', 'gdp_wdi_cap_const_d', 'lat_d', 'lng_d',
       'polity_d', 'polity_abs_d', 'country_o', 'iso3_o', 'dynamic_code_o',
       'landlocked_o', 'island_o', 'region_o', 'gdp_pwt_const_o', 'pop_o',
       'gdp_pwt_cur_o', 'capital_cur_o', 'capital_const_o', 'gdp_wdi_cur_o',
       'gdp_wdi_const_o', 'gdp_wdi_cap_cur_o', 'gdp_wdi_cap_const_o', 'lat_o',
       'lng_o', 'polity_o', 'polity_abs_o', 'contiguity', 'agree_pta_goods',
       'agree_pta_services', 'agree_cu', 'agree_eia', 'agree_fta', 'agree_psa',
       'agree_pta', 'sanction_threat', 'sanction_threat_trade',
       'sanction_imposition', 'sanction_imposition_trade', 'member_eu_o',
       'member_wto_o', 'member_gatt_o', 'member_eu_d', 'member_wto_d',
       'member_gatt_d', 'member

Selecting columns that tell us basic descriptive information about the pair of countries. Ommitting membership of internationla organisations and mutual free-trade agreements

In [26]:
list_gme_columns_keep = ["year","country_d","iso3_d","dynamic_code_d",'landlocked_d',
                         'island_d', 'region_d', 'gdp_pwt_const_d', 'pop_d','lat_d', 'lng_d','country_o',
                         'iso3_o', 'dynamic_code_o','landlocked_o', 'island_o', 'region_o','pop_o','lat_o',
                         'lng_o', 'contiguity', 'distance']

In [30]:
df_gme_keep = df_gme.loc[:,list_gme_columns_keep].copy()
df_gme_keep.head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance
0,2005,Aruba,ABW,ABW,0,1,caribbean,3906.5203,0.100031,12.530384,-70.028992,Netherlands Antilles,ANT,ANT.X,0,0,caribbean,,12.250778,-69.301224,0,120.05867
1,2006,Aruba,ABW,ABW,0,1,caribbean,4118.1396,0.10083,12.530384,-70.028992,Anguilla,AIA,AIA,0,1,caribbean,0.012903,18.217348,-63.057232,0,978.77728
2,2007,Aruba,ABW,ABW,0,1,caribbean,4196.4634,0.101218,12.530384,-70.028992,Sao Tome and Principe,STP,STP,0,1,africa,0.160064,0.989202,7.072665,0,8563.6963
3,2008,Aruba,ABW,ABW,0,1,caribbean,4433.6772,0.101342,12.530384,-70.028992,Andorra,AND,AND,1,0,europe,,42.5,1.516486,0,7562.6733
4,2009,Aruba,ABW,ABW,0,1,caribbean,4183.0449,0.101416,12.530384,-70.028992,Philippines,PHL,PHL,0,1,south_east_asia,91.641881,11.817977,122.77502,0,16904.596


**BRI dataset**

In [20]:
df_bri = pd.read_csv(f"{path_live}113_input_bri_members.csv.gzip",compression="gzip")
# preview bri data
#df_bri.head()

In [71]:
# select columns to keep for merging
df_bri_keep = df_bri.loc[:,["iso_3","Year MOU sign","bri_flag"]]
# rename columns
df_bri_keep.columns = ["iso_3","bri_year_d","bri_flag_d"]
df_bri_keep.set_index("iso_3",inplace = True)
# preview
df_bri_keep.head()

Unnamed: 0_level_0,bri_year_d,bri_flag_d
iso_3,Unnamed: 1_level_1,Unnamed: 2_level_1
AFG,2016.0,1
ALB,2017.0,1
DZA,2018.0,1
AGO,2018.0,1
ATG,2018.0,1


<a id="ID_part2"></a>
### Part 2 | Merge GME to BRI
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

In [72]:
df_gme_bri = df_gme_keep.merge(df_bri_keep,how = "left",left_on="iso3_d",right_on=df_bri_keep.index)

<a id="ID_part3"></a>
### Part 3 | Scope countries
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

In [73]:
df_scope = df_gme_bri.copy()
print(f"Basic stats | DF size: {len(df_scope)}")
df_scope.head()

Basic stats | DF size: 746048


Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d
0,2005,Aruba,ABW,ABW,0,1,caribbean,3906.5203,0.100031,12.530384,-70.028992,Netherlands Antilles,ANT,ANT.X,0,0,caribbean,,12.250778,-69.301224,0,120.05867,,
1,2006,Aruba,ABW,ABW,0,1,caribbean,4118.1396,0.10083,12.530384,-70.028992,Anguilla,AIA,AIA,0,1,caribbean,0.012903,18.217348,-63.057232,0,978.77728,,
2,2007,Aruba,ABW,ABW,0,1,caribbean,4196.4634,0.101218,12.530384,-70.028992,Sao Tome and Principe,STP,STP,0,1,africa,0.160064,0.989202,7.072665,0,8563.6963,,
3,2008,Aruba,ABW,ABW,0,1,caribbean,4433.6772,0.101342,12.530384,-70.028992,Andorra,AND,AND,1,0,europe,,42.5,1.516486,0,7562.6733,,
4,2009,Aruba,ABW,ABW,0,1,caribbean,4183.0449,0.101416,12.530384,-70.028992,Philippines,PHL,PHL,0,1,south_east_asia,91.641881,11.817977,122.77502,0,16904.596,,


In [74]:
# see all regions
df_gme_bri.region_d.unique()

array(['caribbean', 'middle_east', 'africa', 'europe', 'south_america',
       'pacific', 'southern_pole', 'south_asia', 'central_america',
       'south_east_asia', 'north_america', 'east_asia', 'eurasia',
       'central_asia'], dtype=object)

In [82]:
# remove islands
df_scope = df_scope[(df_scope.island_d == 0) & (df_scope.island_o == 0)].copy()

# focus on europe and asia
df_scope = df_scope[
    # filter all relevant destination countries
    ((df_scope.region_d.str.contains("europe")) | (df_scope.region_d.str.contains("asia")) | (df_scope.region_d.str.contains("middle"))) & 
    
    # filter all origin countries
    ((df_scope.region_o.str.contains("europe")) | (df_scope.region_o.str.contains("asia")) | (df_scope.region_o.str.contains("middle")))
                   ].copy()

# only include latest year (2016)
df_scope = df_scope[df_scope.year == 2016].copy()

# avoid same country pairs


In [83]:
print(f"Basic stats | DF size: {len(df_scope)}")
df_scope.head()

Basic stats | DF size: 7744


Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,lat_d,lng_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,pop_o,lat_o,lng_o,contiguity,distance,bri_year_d,bri_flag_d
101,2016,United Arab Emirates,ARE,ARE,0,0,middle_east,,,25.094378,55.454674,Portugal,PRT,PRT,0,0,europe,,39.310741,-10.883486,0,6154.4214,2017.0,1.0
125,2016,Armenia,ARM,ARM,1,0,europe,,,40.379307,44.671028,Armenia,ARM,ARM,1,0,europe,,40.379307,44.671028,0,27.876888,2015.0,1.0
209,2016,Azerbaijan,AZE,AZE,0,0,europe,,,40.362438,47.255833,Kuwait,KWT,KWT,0,0,middle_east,,29.279369,47.930035,0,1236.6086,2015.0,1.0
233,2016,Belgium,BEL,BEL,0,0,europe,,,50.691814,4.581812,Cambodia,KHM,KHM,0,0,south_east_asia,,12.320902,104.8744,0,9693.0625,,
527,2016,China,CHN,CHN,0,0,east_asia,,,35.389668,114.00247,Belarus,BLR,BLR,1,0,europe,,53.604687,27.802185,0,6904.3364,2013.0,1.0


In [84]:
# compare destination and origin countries
print(f"Compare merged df country lengths: {len(df_scope.country_d.unique()) - len(df_scope.country_o.unique())}")
print(f"Compare merged df country members with set: {set(df_scope.country_d.unique()) - set(df_scope.country_o.unique())}")

Compare merged df country lengths: 0
Compare merged df country members with set: set()


<a id="ID_part4"></a>
### Part 4 | Export data in scope
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) || [3|Part3](#ID_part3) || [4|Part4](#ID_part4) || [5|Part5](#ID_part5) ||

In [86]:
# Save to live
filename = "scope_data"
s_file_export.f_df_export(df_scope,filename,p_file_id=f"{notebook_id}_")

Export | ../Data/1_raw_processed_backup/120_store_scope_data_20200808_2029.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/120_input_scope_data.csv.gzip | COMPLETE
