In [20]:
# identifier string for this notebook; no other visible cell reads it
notebook_id = "000"

<a id="ID_top"></a>
## Raw data script

Takes files from the `./0_raw/` folder, applies small cleaning steps or corrections (documenting the errors found), and then saves the result into:
- `./1_raw_processed_backup/` with version control
- `./2_raw_processed_input/` to store the version to be used for live analysis in other scripts

**Last change:** 16.06.2020

#### Code sections:
    
|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [1]:
#=== working with external files
# this cell magic saves the content of a cell to the file; it silently overwrites any content in an existing file
#%%writefile script_0_to_1.py

# this line magic loads the content of a file into a cell
#%load script_0_to_1.py

In [None]:
# %load s_package_import.py
# package library, use to ensure consistency across notebooks, refresh periodically
# general packages
import os # use with os.listdir(_path_)
import requests
import csv
import time
from datetime import datetime
from shutil import copyfile

#temp check
#from shutil import make_archive
import zipfile

# data analysis packages
import pandas as pd
pd.options.display.max_columns = None # don't truncate columns
#pd.options.display.max_rows = None

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import descartes
import pycountry

# custom scripts
import s_file_export
import s_filepaths
import s_un_comtrade_extract as s_un
import s_adj_matrix_plot

#=== network analysis
import networkx as nx
#=== gravity modelling
import gme as gme


In [1]:
# country reference package


<a id="ID_paths"></a>
### Filepaths
|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [2]:
# This script allows one to load and correct raw files before saving them again.
# Each path keeps its trailing "/" because later cells append the filename directly.

file_path_0_raw = "../Data/0_raw/"  # source folder that is listed and read by the load cells
file_path_1_backup = "../Data/1_raw_processed_backup/"  # timestamped backups written by f_export
file_path_2_input = "../Data/2_raw_processed_input/"  # live copies written by f_export

<a id="ID_names"></a>
### Filenames

|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [3]:
# list of all files currently in the raw folder (printed for reference only)
filenames = os.listdir(file_path_0_raw)
print(filenames)

# Files that can be read with the same rule, listed by name.
# They are deliberately not picked by position in `filenames`: os.listdir
# returns entries in arbitrary order, so positional picks can silently select
# different files on another machine or after a file is added.
# The order here matters: later cells address the loaded tables by position
# in dataset_list.
file_to_batch_read = [
    "release_1.0_2005_2016.csv.zip",
    "comtrade_5_country_mini.csv",
    "Dumo_Yao_BRI_countries.csv",
    "BRI_countries_online_MANUAL.csv",
]

# fail early with a readable message if an expected raw file is absent
missing_raw_files = [name for name in file_to_batch_read if name not in filenames]
if missing_raw_files:
    raise FileNotFoundError(f"Missing from {file_path_0_raw}: {missing_raw_files}")

['BRI_countries_online_MANUAL.csv', '.DS_Store', 'comtrade_5_country_mini.csv', '0_raw_explainer_doc.md', '0_0_test.csv', '0_UN_comtrade_dl', 'WIOT2014_Nov16_ROW.xlsb', '1_auto_download', 'Dumo_Yao_BRI_countries.csv', 'release_1.0_2005_2016.csv.zip', 'China-multi-regional-input-output-MRIO-table-2012.xlsx']


In [4]:
# show the batch list; its order fixes the positions used when indexing dataset_list later
file_to_batch_read

['release_1.0_2005_2016.csv.zip',
 'comtrade_5_country_mini.csv',
 'Dumo_Yao_BRI_countries.csv',
 'BRI_countries_online_MANUAL.csv']

<a id="ID_load"></a>
### Load

|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [5]:
# list of datasets, filled in the same order as file_to_batch_read
dataset_list = []

# load files in batch
for file in file_to_batch_read:
    temp_path = f"{file_path_0_raw}{file}"
    print(f"Processing | {temp_path} ...")

    if zipfile.is_zipfile(temp_path):
        # zip archive: make sure only the first member is passed on; the
        # context managers close the archive and the member handle once the
        # table has been parsed
        with zipfile.ZipFile(temp_path) as temp_zf:
            temp_members = temp_zf.namelist()
            if not temp_members:
                raise ValueError(f"{temp_path} is an empty zip archive")
            with temp_zf.open(temp_members[0]) as temp_member:
                temp_read = pd.read_csv(temp_member)
    else:
        # plain csv; genuine read errors now surface instead of being hidden
        # by a bare except that doubled as the zip/csv switch
        temp_read = pd.read_csv(temp_path)

    dataset_list.append(temp_read)
    print("...done")

Processing | ../Data/0_raw/release_1.0_2005_2016.csv.zip ...
...done
Processing | ../Data/0_raw/comtrade_5_country_mini.csv ...
...done
Processing | ../Data/0_raw/Dumo_Yao_BRI_countries.csv ...
...done
Processing | ../Data/0_raw/BRI_countries_online_MANUAL.csv ...
...done


<a id="ID_correct"></a>
### Correct

|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [6]:
# dataset_list is a plain list: index it by position (0 = first entry of file_to_batch_read)
dataset_list[0].head()

Unnamed: 0,year,country_d,iso3_d,dynamic_code_d,landlocked_d,island_d,region_d,gdp_pwt_const_d,pop_d,gdp_pwt_cur_d,...,hostility_level_o,hostility_level_d,distance,common_language,colony_of_destination_after45,colony_of_destination_current,colony_of_destination_ever,colony_of_origin_after45,colony_of_origin_current,colony_of_origin_ever
0,2005,Aruba,ABW,ABW,0,1,caribbean,3906.5203,0.100031,4093.2434,...,0,0,120.05867,1,0,0,0,0,0,0
1,2006,Aruba,ABW,ABW,0,1,caribbean,4118.1396,0.10083,4217.0669,...,0,0,978.77728,1,0,0,0,0,0,0
2,2007,Aruba,ABW,ABW,0,1,caribbean,4196.4634,0.101218,4248.4707,...,0,0,8563.6963,0,0,0,0,0,0,0
3,2008,Aruba,ABW,ABW,0,1,caribbean,4433.6772,0.101342,4441.8828,...,0,0,7562.6733,0,0,0,0,0,0,0
4,2009,Aruba,ABW,ABW,0,1,caribbean,4183.0449,0.101416,4304.9224,...,0,0,16904.596,1,0,0,0,0,0,0


#### Corrections | release_1.0_2005_2016.csv 

In [6]:
# release_1.0_2005_2016.csv
# NOTE: plain assignment, not a copy — both names refer to the same object,
# so the contiguity correction applied further down also changes dataset_list[0]
raw_df_dyn_grav_05_16 = dataset_list[0]

In [7]:
# columns worth showing for the problematic country pair
temp_column_view = ["country_d","country_o","contiguity"]

# rows pairing Greece with Bulgaria, in either direction (GRC/BGR or BGR/GRC)
pair_grc_bgr = (raw_df_dyn_grav_05_16.iso3_d == "GRC") & (raw_df_dyn_grav_05_16.iso3_o == "BGR")
pair_bgr_grc = (raw_df_dyn_grav_05_16.iso3_d == "BGR") & (raw_df_dyn_grav_05_16.iso3_o == "GRC")

# first few matching rows, restricted to the preview columns
raw_df_dyn_grav_05_16.loc[pair_grc_bgr | pair_bgr_grc, temp_column_view].head()

Unnamed: 0,country_d,country_o,contiguity
71945,Bulgaria,Greece,0
72099,Bulgaria,Greece,0
72483,Bulgaria,Greece,0
72638,Bulgaria,Greece,0
72988,Bulgaria,Greece,0


In [8]:
# collect the index labels of every Greece/Bulgaria row (both directions);
# these are the rows whose missing contiguity flag gets corrected
temp_is_grc_bgr = (raw_df_dyn_grav_05_16.iso3_d == "GRC") & (raw_df_dyn_grav_05_16.iso3_o == "BGR")
temp_is_bgr_grc = (raw_df_dyn_grav_05_16.iso3_d == "BGR") & (raw_df_dyn_grav_05_16.iso3_o == "GRC")
temp_index = raw_df_dyn_grav_05_16.loc[temp_is_grc_bgr | temp_is_bgr_grc].index

In [9]:
# mark the rows collected in temp_index as contiguous
raw_df_dyn_grav_05_16.loc[temp_index, "contiguity"] = 1

#### ./0_raw/comtrade_5_country_mini.csv | 

In [18]:
# preview the table at position 1 (comtrade_5_country_mini.csv in file_to_batch_read)
dataset_list[1].head()

Unnamed: 0,Classification,Year,Period,Period Desc.,Aggregate Level,Is Leaf Code,Trade Flow Code,Trade Flow,Reporter Code,Reporter,...,Qty,Alt Qty Unit Code,Alt Qty Unit,Alt Qty,Netweight (kg),Gross weight (kg),Trade Value (US$),CIF Trade Value (US$),FOB Trade Value (US$),Flag
0,H5,2018,2018,2018,0,0,1,Import,156,China,...,0.0,,,,0.0,,146381811975,,,4
1,H5,2018,2018,2018,0,0,4,Re-Import,156,China,...,0.0,,,,0.0,,146381811975,,,4
2,H5,2018,2018,2018,0,0,1,Import,156,China,...,0.0,,,,0.0,,106257241330,,,4
3,H5,2018,2018,2018,0,0,2,Export,156,China,...,0.0,,,,0.0,,77908711119,,,4
4,H5,2018,2018,2018,0,0,1,Import,156,China,...,0.0,,,,0.0,,180401786146,,,4


In [21]:
# name for the comtrade sample; plain assignment, so it refers to the same object as dataset_list[1]
un_com_sample = dataset_list[1]

#### BRI_countries_online_MANUAL.csv (`dataset_list[3]`) |

In [7]:
# loaded countries: position 3 of file_to_batch_read, i.e. BRI_countries_online_MANUAL.csv
# Remove the stray "Unnamed: 1" column when it is present. errors="ignore"
# skips only the missing-column case, whereas the former try / bare except / pass
# hid every possible error. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run.
dataset_list[3] = dataset_list[3].drop(columns="Unnamed: 1", errors="ignore")

# independent copy, so columns added to df_bri_countries do not touch the loaded table
df_bri_countries = dataset_list[3].copy()

dataset_list[3].head()

Unnamed: 0,Country,Region,Income category,Year MOU sign,MOU link,Comment
0,Afghanistan,South Asia,Low income,2016.0,,https://eng.yidaiyilu.gov.cn/zchj/sbwj/1425.htm
1,Albania,Europe & Central Asia,Upper middle income,2017.0,,
2,Algeria,Middle East & North Africa,Upper middle income,2018.0,,
3,Angola,Sub-Saharan Africa,Lower middle income,2018.0,,
4,Antigua and Barbuda,Latin America & Caribbean,High income,2018.0,https://eng.yidaiyilu.gov.cn/qwyw/rdxw/57191.htm,


In [8]:
# Match country to get iso_3
matched_value_iso3 = []
# All other matches for future reference
matched_value = []

for value in df_bri_countries.Country:
    try:
        # only the first candidate returned by the fuzzy search is used; it can
        # be the wrong country for similar names, so check the result afterwards
        temp_value = pycountry.countries.search_fuzzy(value)
        temp_iso = temp_value[0].alpha_3
    except Exception:
        # No usable match: store a sentinel and report the name.
        # `except Exception` replaces the bare except so that a kernel
        # interrupt (KeyboardInterrupt) or SystemExit is no longer swallowed
        # and recorded as a failed match.
        temp_value = "FAIL"
        temp_iso = "FAIL"
        print(value,"failed")

    matched_value_iso3.append(temp_iso)
    matched_value.append(temp_value)

# one code (or "FAIL") per row, in row order
df_bri_countries["iso_3"] = matched_value_iso3

Republic of Congo failed


In [16]:
# Manual fixes for rows the fuzzy match got wrong. Rows are selected by country
# name and the column by label, instead of hard-coded iloc positions that break
# silently when the source file gains, loses or reorders rows or columns.
manual_iso3_fixes = {
    "Republic of Congo": "COG",  # fuzzy search found no match ("FAIL")
    "Niger": "NER",              # fuzzy search returned Nigeria's code (NGA)
}

for temp_country, temp_iso3 in manual_iso3_fixes.items():
    temp_mask = df_bri_countries.Country == temp_country
    # guard against a silent no-op if the spelling in the source file changes
    assert temp_mask.any(), f"{temp_country} not found in df_bri_countries.Country"
    df_bri_countries.loc[temp_mask, "iso_3"] = temp_iso3

In [15]:
# list the rows whose code resolved to NGA. The saved output predates the Niger
# correction (this cell was executed before the correction cell), so a
# top-to-bottom run is expected to list Nigeria only.
df_bri_countries[df_bri_countries.iso_3 == "NGA"]

Unnamed: 0,Country,Region,Income category,Year MOU sign,MOU link,Comment,iso_3
86,Niger,Sub-Saharan Africa,Low income,,,,NGA
87,Nigeria,Sub-Saharan Africa,Lower middle income,2018.0,,,NGA


In [17]:
# duplicate check: value_counts sorts by count, highest first, so a top count of 1
# means no iso_3 code is assigned to more than one row
df_bri_countries.iso_3.value_counts().head()

LUX    1
IDN    1
AGO    1
OMN    1
ARM    1
Name: iso_3, dtype: int64

<a id="ID_export"></a>
### Export

|| [0|Top](#ID_top) || [1|Filepaths](#ID_paths) || [2|Filenames](#ID_names) || [3|Load](#ID_load) || [4|Correct](#ID_correct) || [5|Export](#ID_export) ||

In [53]:
# Files to export
file_export = [raw_df_dyn_grav_05_16,un_com_sample,df_bri_countries]
file_names = ["dynamic_gravity","un_sample","bri_countries_Dumor_Yao"]

NameError: name 'raw_df_dyn_grav_05_16' is not defined

In [18]:
def f_export(p_df,p_name):
    """Write a timestamped gzip backup of ``p_df`` and copy it to the live input folder.

    Parameters
    ----------
    p_df : object exposing a pandas-style ``to_csv`` (a DataFrame in this notebook)
    p_name : str
        Label embedded in both output filenames.

    Notes
    -----
    Reads the notebook-level ``file_path_1_backup`` and ``file_path_2_input``.
    The backup is named ``store_<p_name>_<YYYYMMDD_HHMM>.csv.gzip``; the live
    copy is named ``input_<p_name>.csv.gzip`` and is replaced on every call.
    Returns nothing.
    """
    gzip_kind = "gzip"
    # minute-resolution stamp that versions the backup file
    stamp = datetime.now().strftime("%Y%m%d_%H%M")

    # backup folder: compressed csv carrying the stamp in its name
    backup_target = f"{file_path_1_backup}store_{p_name}_{stamp}.csv" + "." + gzip_kind
    p_df.to_csv(backup_target, compression=gzip_kind)

    # live folder: same bytes under a fixed, unversioned name
    live_target = f"{file_path_2_input}input_{p_name}.csv.{gzip_kind}"
    copyfile(backup_target, live_target)

In [19]:
# BRI countries: write a timestamped backup and refresh the live input copy
f_export(df_bri_countries,"bri_countries_manual_2020")

In [None]:
# UN comtrade sample
f_export(un_com_sample,"un_sample")

# dynamic gravity table, including the contiguity correction made in this notebook
f_export(raw_df_dyn_grav_05_16,"dynamic_gravity")

### Old reference code |

A for loop that runs through all generated dataframes and exports them in one go. It was replaced by the `f_export` function because the loop did not work well when only one file had changed.

In [20]:
# legacy export loop, kept for reference: every frame in file_export is written
# to the backup folder and then copied to the live folder
for position in range(len(file_export)):
    gzip_kind = "gzip"
    # minute-resolution stamp that versions the backup file
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    export_label = file_names[position]

    # backup folder: compressed csv carrying the stamp in its name
    backup_target = f"{file_path_1_backup}store_{export_label}_{stamp}.csv" + "." + gzip_kind
    file_export[position].to_csv(backup_target, compression=gzip_kind)

    # live folder: same bytes under a fixed, unversioned name
    live_target = f"{file_path_2_input}input_{export_label}.csv.{gzip_kind}"
    copyfile(backup_target, live_target)