In [1]:
# Identifier for this notebook; later cells prepend it (as "<id>_") to the
# names of the files they export.
notebook_id: str = "112"
# True  -> download a fresh copy of the dataset from the USITC site and export it.
# False -> skip the download and work from the copy already saved locally.
work_online: bool = False

<a id="ID_top"></a>
## Notebook 112 | GME data extractor

This workflow either downloads fresh data from the [USITC](https://catalog.data.gov/dataset/dynamic-gravity-dataset-1948-2016/resource/d42cc18f-871c-4a02-a918-ad0f6e2c879b) website or loads a locally saved copy, then i) describes it and ii) saves it for use.

Source site: https://catalog.data.gov/dataset/dynamic-gravity-dataset-1948-2016/resource/d42cc18f-871c-4a02-a918-ad0f6e2c879b

#### Notebook sections:  
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) ||

#### Import all packages that could be required

In [2]:
# %load s_package_import.py
import os # use with os.listdir(_path_)

# data analysis packages
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)  # uncomment to also lift the row cap
import numpy as np

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== gravity modelling
import gme as gme

  import pandas.util.testing as tm


#### Import module and declare path variables
`import s_filepaths`

In [3]:
# import ref file
import s_filepaths

# Bind the directory locations defined in s_filepaths to short local names.
# path_raw is where this notebook reads and writes the raw gravity extract;
# the other three are presumably the download, store and live data folders
# (TODO confirm against s_filepaths).
path_raw, path_raw_dl, path_store, path_live = (
    s_filepaths.path_raw,
    s_filepaths.path_raw_dl,
    s_filepaths.path_store,
    s_filepaths.path_live,
)

<a id="ID_part1"></a>
### Part 1 | (Down)load Dynamic Gravity Dataset data
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) ||

**Load in the Dynamic Gravity Dataset**

In [4]:
# View all files in live directory
#print(os.listdir(f"{path_live}"))

In [5]:
# Refresh the raw Dynamic Gravity Dataset from the online source only when
# online mode is switched on.
if not work_online:
    # Offline mode: nothing is fetched in this cell.
    print("Notebook will attempt to use local files")
else:
    # Read the published CSV straight from the USITC site.
    df_gme_raw = pd.read_csv(
        "https://www.usitc.gov/documents/gravity/release_1.0_2005_2016.csv"
    )
    # Hand the untouched download to the project exporter, tagged with this
    # notebook's id; presumably it is written under path_raw with a "raw_"
    # prefix (TODO confirm against s_file_export.f_df_export).
    file_name = "gme_raw_site_dl"
    s_file_export.f_df_export(
        df_gme_raw,
        file_name,
        p_file_id=f"{notebook_id}_",
        p_loc2=path_raw,
        p_loc2_pre="raw_",
    )

Notebook will attempt to use local files


In [6]:
# Load the raw extract saved under path_raw (either an earlier local copy or the
# file the download cell has just exported). The file name is built from
# notebook_id, matching the p_file_id prefix the download cell passes to the
# exporter, so the two cells cannot drift apart if the notebook id changes.
# NOTE(review): the saved file carries its old row index as an "Unnamed: 0"
# column (visible in the head() preview) - consider index_col=0 once downstream
# notebooks are confirmed not to rely on that column.
df_gme = pd.read_csv(
    f"{path_raw}{notebook_id}_raw_gme_raw_site_dl.csv.gzip",
    compression="gzip",
)

In [7]:
# Preview the first five rows to sanity-check the load.
df_gme.head(n=5)

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,gdp_pwt_cur_d,capital_cur_d,capital_const_d,gdp_wdi_cur_d,gdp_wdi_const_d,gdp_wdi_cap_cur_d,gdp_wdi_cap_const_d,lat_d,lng_d,polity_d,polity_abs_d,country_o,iso3_o,dynamic_code_o,landlocked_o,island_o,region_o,gdp_pwt_const_o,pop_o,gdp_pwt_cur_o,capital_cur_o,capital_const_o,gdp_wdi_cur_o,gdp_wdi_const_o,gdp_wdi_cap_cur_o,gdp_wdi_cap_const_o,lat_o,lng_o,polity_o,polity_abs_o,contiguity,agree_pta_goods,agree_pta_services,agree_cu,agree_eia,agree_fta,agree_psa,agree_pta,sanction_threat,sanction_threat_trade,sanction_imposition,sanction_imposition_trade,member_eu_o,member_wto_o,member_gatt_o,member_eu_d,member_wto_d,member_gatt_d,member_eu_joint,member_wto_joint,member_gatt_joint,hostility_level_o,hostility_level_d,distance,common_language,colony_of_destination_after45,colony_of_destination_current,colony_of_destination_ever,colony_of_origin_after45,colony_of_origin_current,colony_of_origin_ever
0,2005,Aruba,ABW,ABW,0,1,caribbean,3906.5203,0.100031,4093.2434,23531.377,24173.982,2331006000.0,,23302.831988,,12.530384,-70.028992,,,Netherlands Antilles,ANT,ANT.X,0,0,caribbean,,,,,,,,,,12.250778,-69.301224,,,0,1,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,120.05867,1,0,0,0,0,0,0
1,2006,Aruba,ABW,ABW,0,1,caribbean,4118.1396,0.10083,4217.0669,25757.818,25396.307,2421475000.0,,24015.420612,,12.530384,-70.028992,,,Anguilla,AIA,AIA,0,1,caribbean,348.7688,0.012903,365.93643,2471.682,2342.796,,,,,18.217348,-63.057232,,,0,1,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,978.77728,1,0,0,0,0,0,0
2,2007,Aruba,ABW,ABW,0,1,caribbean,4196.4634,0.101218,4248.4707,27375.447,26631.465,2623726000.0,,25921.538234,,12.530384,-70.028992,,,Sao Tome and Principe,STP,STP,0,1,africa,391.01483,0.160064,392.44177,1101.736,3205.526,145827400.0,167044600.0,911.057012,1043.611485,0.989202,7.072665,,,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,8563.6963,0,0,0,0,0,0,0
3,2008,Aruba,ABW,ABW,0,1,caribbean,4433.6772,0.101342,4441.8828,28639.586,27871.596,2791961000.0,,27549.889422,,12.530384,-70.028992,,,Andorra,AND,AND,1,0,europe,,,,,,4001201000.0,3675947000.0,46734.268282,42935.277871,42.5,1.516486,,,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,7562.6733,0,0,0,0,0,0,0
4,2009,Aruba,ABW,ABW,0,1,caribbean,4183.0449,0.101416,4304.9224,29400.539,29122.635,2498933000.0,,24640.421244,,12.530384,-70.028992,,,Philippines,PHL,PHL,0,1,south_east_asia,458079.81,91.641881,460142.72,1420047.0,1624159.0,168334600000.0,185437700000.0,1836.87412,2023.503659,11.817977,122.77502,8.0,8.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,1,1,0,0,0,0,0,0,0,0,16904.596,1,0,0,0,0,0,0


<a id="ID_part2"></a>
### Part 2 | Amend/correct data as needed
|| [0| Default imports](#ID_top) || [1|Part1](#ID_part1) || [2|Part2](#ID_part2) ||

**Correction 1**
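- Greece contiguity with Bulgaria: the two countries share a land border, but the dataset records `contiguity = 0` for the pair in both directions, so it is set to 1 below.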

In [16]:
# Demonstrate the issue: count the contiguity values recorded for the
# Greece/Bulgaria pair, once for each direction (destination, origin).
print(df_gme.loc[(df_gme["iso3_d"] == "GRC") & (df_gme["iso3_o"] == "BGR"), "contiguity"].value_counts())
print(df_gme.loc[(df_gme["iso3_d"] == "BGR") & (df_gme["iso3_o"] == "GRC"), "contiguity"].value_counts())

0    12
Name: contiguity, dtype: int64
0    12
Name: contiguity, dtype: int64


In [28]:
# Collect the row labels of every Greece/Bulgaria record, in either direction
# (GRC as destination with BGR as origin, or the reverse).
replace_index = list(
    df_gme.index[
        ((df_gme["iso3_d"] == "GRC") & (df_gme["iso3_o"] == "BGR"))
        | ((df_gme["iso3_d"] == "BGR") & (df_gme["iso3_o"] == "GRC"))
    ]
)

In [30]:
# Apply the correction: flag every row listed in replace_index as contiguous.
df_gme.loc[replace_index, "contiguity"] = 1
# Check the fix by recounting the contiguity values for each direction.
print(df_gme.loc[(df_gme["iso3_d"] == "GRC") & (df_gme["iso3_o"] == "BGR"), "contiguity"].value_counts())
print(df_gme.loc[(df_gme["iso3_d"] == "BGR") & (df_gme["iso3_o"] == "GRC"), "contiguity"].value_counts())

1    12
Name: contiguity, dtype: int64
1    12
Name: contiguity, dtype: int64


In [31]:
# Take an independent (deep) copy of the corrected frame for export, so the
# exported object is decoupled from any further edits to df_gme.
df_gme_export = df_gme.copy(deep=True)

In [32]:
# Export the corrected dataset through the project exporter, tagged with this
# notebook's id.
file_name = "gme_data"
s_file_export.f_df_export(
    df_gme_export,
    file_name,
    p_file_id=f"{notebook_id}_",
)

Export | ../Data/1_raw_processed_backup/112_store_gme_data_20200809_1142.csv | COMPLETE
COPY   | ../Data/2_raw_processed_input/112_input_gme_data.csv.gzip | COMPLETE
