### Combine Primary and Secondary Datasets

This notebook merges the primary data (USA exports and imports) with the secondary data (GDP and MFN tariff data).
It involves the following steps:

1. Rename columns so that they are consistent across the primary and secondary datasets.
2. Convert columns to appropriate data types.
3. Fill missing values.
4. Merge the primary and secondary data.
5. Write the final data to a CSV file.

In [10]:
# import required packages
import pandas as pd
import numpy as np

In [11]:
# Read the cleaned and aggregated primary (trade) data from csv, lower-case
# the column names and rename the country column to 'country' so it matches
# the key used by the secondary data.
primary = pd.read_csv('./../../data/processed/cleaned_primary_trade_data.csv')
primary.columns = primary.columns.str.lower()
primary = primary.rename(columns={'standardized_country': 'country'})

# Remove thousands separators and convert to integer.
# The `.str` accessor assumes these columns are read as strings.
# An explicit 64-bit dtype is used because plain `int` is platform dependent
# (32-bit on some platforms) and the trade values exceed 2**31 - 1
# (e.g. 19,795,290,562). `regex=False` makes the comma a literal match on
# every pandas version.
for value_col in ('import_value', 'export_value'):
    primary[value_col] = (
        primary[value_col]
        .str.replace(',', '', regex=False)
        .astype('int64')
    )

# print first 5 rows
primary.head(5)

Unnamed: 0,country,category,year,import_value,export_value
0,Mexico,Passenger Vehicles,2008,19795290562,4053831183
1,Japan,Passenger Vehicles,2008,40975127629,388126905
2,South Korea,Passenger Vehicles,2008,7413089740,228776236
3,Canada,Passenger Vehicles,2008,33575452102,17859000199
4,Germany,Passenger Vehicles,2008,18282476127,7271930736


In [12]:
# count missing values per column of the primary data
primary.isna().sum()

country         0
category        0
year            0
import_value    0
export_value    0
dtype: int64

In [13]:
# Load the cleaned and aggregated secondary (GDP and MFN tariff) data,
# lower-case the headers and rename the country column to 'country'
secondary = (
    pd.read_csv('./../../data/processed/final_gdp_tariff.csv')
    .rename(columns=str.lower)
    .rename(columns={'standardized_country': 'country'})
)

# MFN tariff columns are kept to 2 decimal places
mfn_cols = [
    'mfn_by_us_simple_avg',
    'mfn_by_us_weighted_avg',
    'mfn_on_us_simple_avg',
    'mfn_on_us_weighted_avg',
]
for mfn_col in mfn_cols:
    secondary[mfn_col] = secondary[mfn_col].round(2)

# GDP columns: coerce unparseable entries to missing, floor to a whole
# number and store as nullable Int64 so missing values are preserved
for gdp_col in ('gdp', 'gdp_2015_adj'):
    gdp_numeric = pd.to_numeric(secondary[gdp_col], errors='coerce')
    secondary[gdp_col] = np.floor(gdp_numeric).astype('Int64')

# print first 5 rows
secondary.head(5)

Unnamed: 0,country,year,mfn_by_us_simple_avg,mfn_by_us_weighted_avg,mfn_on_us_simple_avg,mfn_on_us_weighted_avg,gdp,gdp_2015_adj
0,Afghanistan,2008,3.31,1.64,7.68,5.48,10109297047,11060395115
1,Afghanistan,2009,2.83,1.42,,,12416152732,13426272073
2,Afghanistan,2010,2.44,1.15,,,15856668555,15354612541
3,Afghanistan,2011,3.82,1.36,,,17805098206,15420077665
4,Afghanistan,2012,3.69,1.52,,,19907329777,17386490239


In [14]:
# Sort by country and year so that, within each country, rows are in
# chronological order before interpolating.
secondary = secondary.sort_values(by=['country', 'year'])

# Numeric columns whose gaps are filled per country.
interpolate_cols = ['mfn_by_us_simple_avg', 'mfn_by_us_weighted_avg',
       'mfn_on_us_simple_avg', 'mfn_on_us_weighted_avg', 'gdp',
       'gdp_2015_adj']

# Linearly interpolate each column within each country, filling in both
# directions (limit_direction='both').
#
# transform() returns a frame that carries secondary's own index labels, so
# the assignment below puts every value back on the row it came from. The
# previous groupby().apply().reset_index(drop=True) renumbered the result
# 0..n-1, which only lines up with `secondary` when the input file is
# already sorted by country and year.
#
# The columns are cast to float64 first so the nullable Int64 GDP columns
# are interpolated like the tariff columns. Interpolated values are not
# re-rounded here.
secondary[interpolate_cols] = (
    secondary[interpolate_cols]
    .astype('float64')
    .groupby(secondary['country'])
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

secondary.head(5)


Unnamed: 0,country,year,mfn_by_us_simple_avg,mfn_by_us_weighted_avg,mfn_on_us_simple_avg,mfn_on_us_weighted_avg,gdp,gdp_2015_adj
0,Afghanistan,2008,3.31,1.64,7.68,5.48,10109297047.0,11060395115.0
1,Afghanistan,2009,2.83,1.42,7.911,5.715,12416152732.0,13426272073.0
2,Afghanistan,2010,2.44,1.15,8.142,5.95,15856668555.0,15354612541.0
3,Afghanistan,2011,3.82,1.36,8.373,6.185,17805098206.0,15420077665.0
4,Afghanistan,2012,3.69,1.52,8.604,6.42,19907329777.0,17386490239.0


In [15]:
# count the missing values left in each column after interpolation
secondary.isna().sum()

country                     0
year                        0
mfn_by_us_simple_avg      195
mfn_by_us_weighted_avg    195
mfn_on_us_simple_avg      930
mfn_on_us_weighted_avg    930
gdp                       315
gdp_2015_adj              345
dtype: int64

For the major trading partners of the USA, there are no null values.
For other countries, some nulls remain in the GDP and MFN tariff columns.

Interpolation can only estimate missing values for a country that has at least one observed value in the column,
so countries with no data at all in a column remain null; we will ignore those countries for now.
This approach is acceptable because those countries are not major trading partners of the USA.

In [16]:
# Left-join the secondary data onto the primary data by year and country so
# that every trade record carries its GDP and MFN tariff values, then order
# the rows by country, year and category with a fresh 0..n-1 index.
combined = (
    primary
    .merge(secondary, on=['year', 'country'], how='left')
    .sort_values(by=['country', 'year', 'category'])
    .reset_index(drop=True)
)

In [17]:
# preview the first 20 rows of the merged data
combined.head(n=20)

Unnamed: 0,country,category,year,import_value,export_value,mfn_by_us_simple_avg,mfn_by_us_weighted_avg,mfn_on_us_simple_avg,mfn_on_us_weighted_avg,gdp,gdp_2015_adj
0,Afghanistan,Parts,2008,10294,26954442,3.31,1.64,7.68,5.48,10109297047.0,11060395115.0
1,Afghanistan,Parts,2009,161658,102667946,2.83,1.42,7.911,5.715,12416152732.0,13426272073.0
2,Afghanistan,Parts,2010,26552,87621159,2.44,1.15,8.142,5.95,15856668555.0,15354612541.0
3,Afghanistan,Parts,2011,5928,51232064,3.82,1.36,8.373,6.185,17805098206.0,15420077665.0
4,Afghanistan,Parts,2012,2500,84963182,3.69,1.52,8.604,6.42,19907329777.0,17386490239.0
5,Afghanistan,Parts,2013,190303,34468998,4.11,0.99,8.835,6.655,20146416757.0,18360263162.0
6,Afghanistan,Parts,2014,116510,23578362,3.58,1.48,9.066,6.89,20497128555.0,18860496494.0
7,Afghanistan,Parts,2015,86487,47856294,3.46,2.9,9.297,7.125,19134221644.0,19134221644.0
8,Afghanistan,Parts,2016,56399,25713675,3.95,2.02,9.528,7.36,18116572395.0,19566715174.0
9,Afghanistan,Parts,2017,86085,20570029,2.75,1.13,9.759,7.595,18753456497.0,20084646751.0


In [18]:
# write the merged dataset to csv, leaving out the row index
combined_csv_path = './../../data/processed/combined_primary_secondary.csv'
combined.to_csv(combined_csv_path, index=False)