# This notebook merges and validates the datasets used for trade modelling

In [93]:
import pandas as pd
import numpy
import requests
import json
import codecs
import pickle
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time

In [94]:
###############
# Import datasets
###############

# Country code lookup tables: ISO 3166-1 alpha-3 / alpha-2 code -> ISO 3166-1 numeric code
# NOTE(review): pandas parses the literal string 'NA' as a missing value by default,
# which may affect an alpha-2 code of 'NA' -- confirm against country_codes.csv.
country_mapping_table = pd.read_csv('../Input_datasets/Yearly_trade_data/country_codes.csv')
iso_numeric_codes = country_mapping_table['ISO3166-1-numeric']
iso_alpha3_numeric_mapper = {
    alpha3: numeric
    for alpha3, numeric in zip(country_mapping_table['ISO3166-1-Alpha-3'], iso_numeric_codes)
}
iso_alpha2_numeric_mapper = {
    alpha2: numeric
    for alpha2, numeric in zip(country_mapping_table['ISO3166-1-Alpha-2'], iso_numeric_codes)
}

# Trade dataset, with the 'Period' column stored as an integer
trade_dataset = pd.read_csv('../Input_datasets/Yearly_trade_data/trade_data_new_annual_import_zero_padded.csv')
trade_dataset = trade_dataset.astype({'Period': int})

# Country name table that accompanies the annual trade data
country_names_in_trade_dataset = pd.read_csv('../Input_datasets/country_names_with_annual_trade_data.csv')

# Additional edge characteristics
cepii_edge_dataset = pd.read_csv('../Input_datasets/Yearly_trade_data/cepii_edge.csv')

# Drop every edge whose origin or destination is a country that cannot be mapped
unmappable_iso3_codes = ['ANT', 'PAL', 'TMP', 'YUG', 'ZAR']
origin_is_mappable = ~cepii_edge_dataset['iso_o'].isin(unmappable_iso3_codes)
destination_is_mappable = ~cepii_edge_dataset['iso_d'].isin(unmappable_iso3_codes)
cepii_edge_dataset = cepii_edge_dataset[origin_is_mappable & destination_is_mappable]

# Recode 'ROM' to 'ROU' on both endpoints so the alpha-3 lookup below can resolve it
for endpoint_column in ('iso_o', 'iso_d'):
    cepii_edge_dataset[endpoint_column] = cepii_edge_dataset[endpoint_column].replace('ROM', 'ROU')

# Translate the alpha-3 codes of both endpoints into numeric codes
for endpoint_column in ('iso_o', 'iso_d'):
    alpha3_codes = cepii_edge_dataset[endpoint_column].astype('string')
    cepii_edge_dataset[endpoint_column] = alpha3_codes.apply(lambda code: iso_alpha3_numeric_mapper[code])

# Node characteristics
wbg_dataset = pd.read_csv('../Input_datasets/Yearly_trade_data/WBG_data_all_countries.csv')

# Translate the alpha-3 economy codes into numeric codes (done before the columns are renamed)
economy_alpha3_codes = wbg_dataset['economy'].astype('string')
wbg_dataset['economy'] = economy_alpha3_codes.apply(lambda code: iso_alpha3_numeric_mapper[code])

# Assign the working column names positionally
wbg_dataset.columns = [
    'economy',
    'year',
    'gdp',
    'total_population',
    'urban_population(%_of_total)',
]

# Strip the first two characters of each year label and store the rest as an integer
wbg_dataset['year'] = wbg_dataset['year'].apply(lambda label: label[2:]).astype(int)

# Country group membership, keyed by alpha-2 country code in the input file
country_groups_dataset = pd.read_csv('../Input_datasets/Yearly_trade_data/country_groups.csv')

# Drop countries that cannot be mapped
has_mappable_country_code = country_groups_dataset['country_code'] != 'JA'
country_groups_dataset = country_groups_dataset[has_mappable_country_code]

# Translate the remaining alpha-2 codes into numeric codes
country_groups_dataset['country_code'] = country_groups_dataset['country_code'].apply(
    lambda alpha2: iso_alpha2_numeric_mapper[alpha2]
)

# CEPII node characteristics, keyed by alpha-3 country code in the input file
cepii_nodes_dataset = pd.read_csv('../Input_datasets/Yearly_trade_data/cepii_node.csv')

# Drop countries that cannot be mapped
unmappable_node_iso3_codes = ['ANT', 'PAL', 'TMP', 'YUG', 'ZAR']
node_is_mappable = ~cepii_nodes_dataset['iso3'].isin(unmappable_node_iso3_codes)
cepii_nodes_dataset = cepii_nodes_dataset[node_is_mappable]

# Recode 'ROM' to 'ROU', then translate the alpha-3 codes into numeric codes
cepii_nodes_dataset['iso3'] = cepii_nodes_dataset['iso3'].replace('ROM', 'ROU')
cepii_nodes_dataset['iso3'] = cepii_nodes_dataset['iso3'].apply(lambda code: iso_alpha3_numeric_mapper[code])


In [95]:
###############
# Validate trade data
###############

# Obtain the list of countries from the trade dataset and validate its coverage

reporter_set = set(trade_dataset.Reporter.unique())
partner_set = set(trade_dataset.Partner.unique())

# Reporters and partners must be the same set of countries. The sets are compared
# directly: requiring BOTH set differences to be non-empty would miss the case where
# one set is a strict subset of the other.
if reporter_set != partner_set:
    print('Number of partners and reporters do no match!')

# Every (reporter, partner, period) combination is expected to be present in the data.
periods = set(trade_dataset.Period.unique())
all_pairs = {(i, j, k) for i in reporter_set for j in partner_set for k in periods}
real_pairs = set(trade_dataset[['Reporter', 'Partner', 'Period']].itertuples(index=False, name=None))

# real_pairs is always a subset of all_pairs (both are derived from the same rows), so
# the check has to fire as soon as any expected combination is missing.
if all_pairs != real_pairs:
    print('Number of expected and real observations do no match!')

# Clean erroneous codes

# The reporter codes must coincide with the codes listed in country_names_in_trade_dataset.
# Direct set inequality also reports the case where one set is a strict subset of the other.
iso_codes_in_country_names_in_trade_dataset = set(country_names_in_trade_dataset['ISO_3166-1_numeric_code'])
if reporter_set != iso_codes_in_country_names_in_trade_dataset:
    print('ISO codes in trade dataset and country_names_in_trade_dataset do not match!')

# Identity mapping for every code that occurs as a reporter OR as a partner, so that the
# lookup on the 'Partner' column cannot fail for a code that never appears as a reporter.
errorous_country_code_mapper = {code: code for code in reporter_set | partner_set}

# Codes that are remapped onto another numeric code
errorous_country_code_mapper.update({
    251: 250,
    579: 578,
    699: 356,
    757: 756,
    842: 840,
})

trade_dataset['Reporter'] = trade_dataset['Reporter'].map(errorous_country_code_mapper)
trade_dataset['Partner'] = trade_dataset['Partner'].map(errorous_country_code_mapper)

In [96]:
###############
# Merge cepii to trade data
###############

# Cast the merge keys on the trade side to float64 before joining on the CEPII codes
trade_dataset['Reporter'] = trade_dataset['Reporter'].astype('float64')
trade_dataset['Partner'] = trade_dataset['Partner'].astype('float64')

trade_edgelist = pd.merge(
    trade_dataset,
    cepii_edge_dataset,
    left_on=['Reporter', 'Partner'],
    right_on=['iso_o', 'iso_d'],
    how='left',
)

# A left merge leaves iso_o / iso_d as NaN for country pairs without a CEPII row, and
# casting NaN to int raises. Report such pairs and take the codes from the left-hand
# keys instead; for matched rows these are equal to iso_o / iso_d by construction.
edges_without_cepii_data = trade_edgelist['iso_o'].isna() | trade_edgelist['iso_d'].isna()
if edges_without_cepii_data.any():
    print('Number of trade observations without cepii edge data:', int(edges_without_cepii_data.sum()))

trade_edgelist['iso_o'] = trade_edgelist['Reporter'].astype('int')
trade_edgelist['iso_d'] = trade_edgelist['Partner'].astype('int')
trade_edgelist.drop(columns=['Reporter', 'Partner'], inplace=True)

###############
# Filter wbg to trade data countries
# Merge with cepii_nodes and country groups
###############

# Keep only the economies that appear as a reporter in the trade dataset
trade_dataset_countries = trade_dataset['Reporter'].unique()
wbg_dataset = wbg_dataset[wbg_dataset['economy'].isin(trade_dataset_countries)]

trade_nodelist = pd.merge(wbg_dataset, country_groups_dataset, left_on=['economy'], right_on=['country_code'], how='left')
trade_nodelist = pd.merge(trade_nodelist, cepii_nodes_dataset, left_on=['economy'], right_on=['iso3'], how='left')

# A left merge leaves iso3 as NaN for economies without a CEPII node row, and casting
# NaN to int raises. Report such economies and take the numeric code from the left-hand
# key instead; for matched rows 'economy' equals 'iso3' by construction.
nodes_without_cepii_data = trade_nodelist['iso3'].isna()
if nodes_without_cepii_data.any():
    print('Number of node observations without cepii node data:', int(nodes_without_cepii_data.sum()))

trade_nodelist['iso_numeric'] = trade_nodelist['economy'].astype('int')
trade_nodelist.drop(columns=['economy', 'country_code', 'iso3'], inplace=True)


