This notebook cleans and merges the Eora dataset.

The Eora dataset is well structured. However, we need to reconstruct the data because we define our own country list (to be consistent with the country set in CEPII and the tariff data). In addition, we need to extend the RoW in the original Eora data to multiple sectors.

Need to check:

- Country list

- Sector list

- Estimation Method

# Setup

In [1]:
# Load required libraries
import os
import pandas as pd 
import numpy as np

%load_ext autoreload
%autoreload 2  

from Functions import read_files, check_consistency, IO_country_merge_function, IO_sector_merge_function, IO_sector_remove_function, FD_country_merge_function, FD_sector_merge_function, FD_sector_remove_function, VA_country_merge_function, VA_sector_merge_function, VA_sector_remove_function

In [2]:
# Work from the project root so that every relative path below resolves against it
wd = os.path.expanduser("~/Dropbox/Tariff_Project")
os.chdir(wd)
print(f"Current working directory: {os.getcwd()}")

# Data locations (relative to the project root)
data_path = "2_Data"
bp_path = os.path.join(data_path, "Eora26_bp")  # folder holding the raw Eora BP files

# Sample years.
# For the full 1995-2017 sample use the line below (range() excludes its upper bound):
# years = list(range(1995, 2018))
#
# Only one year is processed for now. Nothing below is specific to a single year:
# all per-year objects are stored in dicts, so a longer list works as well.
years = [2017]

# Show the data folder so a wrong path is spotted early
print(f"Eora BP data folder: {bp_path}")

Current working directory: /Users/lishuangcen/Dropbox/Tariff_Project
Eora BP data folder: 2_Data/Eora26_bp


# Check Consistency and Define My Country List

In [3]:
# The country list and the sector list must be identical across all sample years,
# otherwise the per-year matrices cannot be processed with one set of indices.
# labels_T.txt is the country * sector label file of the IO table (one per year).
label_T = read_files(bp_path, years, "labels_T.txt")

# check_consistency returns a per-year consistency table plus the reference
# country codes and sector names
consistency_df, country_list, sector_list = check_consistency(label_T)

# Per-year consistency flags
print(consistency_df)

# Reference codes and their counts
for reference_codes, kind in ((country_list, "countries"), (sector_list, "sectors")):
    print(reference_codes)
    print(f"Number of {kind}: {len(reference_codes)}")



   Year  Country_Consistent  Sector_Consistent
0  2017                True               True
['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COG', 'CRI', 'HRV', 'CUB', 'CYP', 'CZE', 'CIV', 'PRK', 'COD', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRL', 'GTM', 'GIN', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'ANT', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'PSE', 'OMN', 'P

In [4]:
# Decide My Country List
# Countries in `my_country_list` are kept as individual economies; the codes of all
# other Eora countries are collected in `code_row`.
#
# `use_tariff_intersection` switches between two ways of choosing the countries:
#   True  -> Method 1: countries that appear as importers in the tariff data
#   False -> Method 2: the manual list below (currently used)
use_tariff_intersection = False

if use_tariff_intersection:
    # Method 1: derive from tariff data intersection
    tariff_df = pd.read_csv(os.path.join(data_path, "sectoral_tariff_2017.csv"))
    im_countries_tariff = tariff_df['Importer'].unique().tolist()
    ex_countries_tariff = tariff_df['Exporter'].unique().tolist()
    # Importer and exporter lists are expected to match; only the importer list is used
    if set(im_countries_tariff) != set(ex_countries_tariff):
        print("Warning: Importer and exporter country lists differ.")
    requested_countries = im_countries_tariff
else:
    # Method 2: manual definition
    requested_countries = [
        "USA", "JPN", "DEU", "FRA", "GBR", "ITA", "BRA", "CHN", "ESP", "CAN",
        "KOR", "NLD", "AUS", "IND", "RUS", "MEX", "BEL", "SWE", "TUR",
        "AUT", "DNK", "POL", "GRC", "FIN", "PRT", "IRL", "CZE", "HUN", "ROU",
        "SVK", "SVN", "TWN", "BGR", "LTU", "EST", "VNM"
    ]

# "ROW" is never an individually selected country
requested_set = set(requested_countries) - {"ROW"}

# A requested country that Eora does not contain cannot be extracted. Report it
# instead of dropping it silently: `N_new` is later computed from
# len(my_country_list), so the list must contain exactly the extracted countries.
missing_countries = sorted(requested_set - set(country_list), key=str)
if missing_countries:
    print(f"Warning: requested countries not found in Eora (dropped): {missing_countries}")

# Keep the selection in Eora's own order. The labels saved at the end of the
# notebook list the selected countries in Eora order, so the codes handed to the
# merge functions must follow the same order for labels and matrices to agree.
my_country_list = [c for c in country_list if c in requested_set]

# 1-based positions in country_list: selected countries vs. everything else
code_my_country = [country_list.index(c) + 1 for c in my_country_list]
selected_codes = set(code_my_country)
code_row = [i for i in range(1, len(country_list) + 1) if i not in selected_codes]

# Summary
print(f"Using tariff intersection? {use_tariff_intersection}")
print(f"Number of selected countries: {len(my_country_list)}")



Using tariff intersection? False
Number of selected countries: 36


In [5]:
# Dimensions of the original Eora data:
# N_old counts the countries without ROW, J_old the sectors without the "TOTAL"
# entry that belongs to ROW.
N_old = len(country_list) - 1
J_old = len(sector_list) - 1

# Eora's own ROW entry has (1-based) code 190; take it out of the list of
# countries that are aggregated into the new ROW.
code_row = list(filter(lambda code: code != 190, code_row))

print(N_old, J_old, sep="\n")

189
26


# Manipulate IO Tables (T)

In [6]:
# Read the raw IO table (T) of every sample year
# (presumably read_files substitutes the year for "XXXX" - confirm in Functions)
file_data_T_bp = "Eora26_XXXX_bp_T.txt"
bp_data_T = read_files(bp_path, years, file_data_T_bp)

# One float NumPy matrix per year; missing entries are treated as zero
bp_matrix_T = dict(
    (year, raw_table.fillna(0).to_numpy().astype(float))
    for year, raw_table in bp_data_T.items()
)

print(type(bp_matrix_T))

<class 'dict'>


In [7]:
# Country aggregation of the IO table, year by year: the selected countries
# (code_my_country) and the remaining ones (code_row) are passed to the merge helper.
T_country_merged_list = {
    year: IO_country_merge_function(
        io_matrix=raw_T,
        N=N_old,
        J=J_old,
        code_my_country=code_my_country,
        code_row=code_row,
    )
    for year, raw_T in bp_matrix_T.items()
}

In [8]:
# Dimensions after the country merge: the selected countries plus one ROW
# aggregate; the number of sectors is unchanged.
N_new = 1 + len(my_country_list)
J_new = J_old

In [9]:
# The "Re-export and Re-import" sector is sector 26, i.e. position 25 when
# counting from zero; it is removed from every country of the merged IO table.
sector_to_remove = 25

T_final = {
    year: IO_sector_remove_function(
        io_matrix=merged_T,
        N=N_new,
        J=J_new,
        sector_to_remove=sector_to_remove,
    )
    for year, merged_T in T_country_merged_list.items()
}



In [10]:
# Final dimensions: one sector fewer per country, same number of countries
J_final = J_new - 1
N_final = N_new

In [11]:
# Sanity check: every final IO table must be square with N_final * J_final rows
print(f"The dimension should be: {N_final * J_final} * {N_final * J_final} ")
{year: final_T.shape for year, final_T in T_final.items()}


The dimension should be: 925 * 925 


{'2017': (925, 925)}

# Manipulate Final Demand Tables (FD)

In [12]:
# Read the raw final demand (FD) table of every sample year
file_data_FD_bp = "Eora26_XXXX_bp_FD.txt"
bp_data_FD = read_files(bp_path, years, file_data_FD_bp)

# One float NumPy matrix per year; missing entries are treated as zero
bp_matrix_FD = dict(
    (year, raw_table.fillna(0).to_numpy().astype(float))
    for year, raw_table in bp_data_FD.items()
)

# Print the shapes for a visual check: the number of columns should be the
# number of countries times the number of final demand accounts
FD_shapes = {year: fd.shape for year, fd in bp_matrix_FD.items()}
print("Final Demand matrix shapes:")
for year in FD_shapes:
    shape = FD_shapes[year]
    print(f"Year {year}: shape = {shape}")

# Number of final demand accounts per country (from Eora documentation)
FD_num = 6

Final Demand matrix shapes:
Year 2017: shape = (4915, 1140)


In [13]:
# Country aggregation of the final demand table, year by year.
# index_ROW is the (1-based) code of Eora's own ROW entry, the same 190 that was
# removed from code_row above.
FD_country_merged_list = {
    year: FD_country_merge_function(
        fd_matrix=raw_FD,
        N=N_old,
        J=J_old,
        FD_num=FD_num,
        code_my_country=code_my_country,
        code_row=code_row,
        index_ROW=190,
    )
    for year, raw_FD in bp_matrix_FD.items()
}



In [14]:
# Remove the same sector from the final demand table as from the IO table.
# `sector_to_remove` (defined with the IO table above) is reused instead of
# repeating the literal, so T, FD and VA can never drop different sectors.
FD_final = {
    year: FD_sector_remove_function(
        fd_matrix=matrix,
        N=N_new,
        J=J_new,
        sector_to_remove=sector_to_remove
    )
    for year, matrix in FD_country_merged_list.items()
}




In [15]:
# Sanity check: rows = country-sector pairs, columns = FD_num accounts per country
print(f"The dimension should be: {N_final * J_final} * {N_final * FD_num}")
{year: final_FD.shape for year, final_FD in FD_final.items()}

The dimension should be: 925 * 222


{'2017': (925, 222)}

# Manipulate Value Added Tables (VA)

In [16]:
# Read the raw value added (VA) table of every sample year
file_data_VA_bp = "Eora26_XXXX_bp_VA.txt"
bp_data_VA = read_files(bp_path, years, file_data_VA_bp)

# One float NumPy matrix per year; missing entries are treated as zero
bp_matrix_VA = dict(
    (year, raw_table.fillna(0).to_numpy().astype(float))
    for year, raw_table in bp_data_VA.items()
)

# Number of value added accounts (rows of the VA table)
VA_num = 6



In [17]:
# Country aggregation of the value added table, year by year, using the same
# country codes as for the IO and final demand tables
VA_country_merged_list = {
    year: VA_country_merge_function(
        va_matrix=raw_VA,
        N=N_old,
        J=J_old,
        VA_num=VA_num,
        code_my_country=code_my_country,
        code_row=code_row,
    )
    for year, raw_VA in bp_matrix_VA.items()
}



In [18]:
# Remove the same sector from the value added table as from the IO table.
# `sector_to_remove` (defined with the IO table above) is reused instead of
# repeating the literal, so T, FD and VA can never drop different sectors.
VA_final = {
    year: VA_sector_remove_function(
        va_matrix=matrix,
        N=N_new,
        J=J_new,
        sector_to_remove=sector_to_remove
    )
    for year, matrix in VA_country_merged_list.items()
}



In [19]:
# Sanity check: rows = value added accounts, columns = country-sector pairs
print(f"The dimension should be: {VA_num} * {N_final * J_final}")
{year: final_VA.shape for year, final_VA in VA_final.items()}

The dimension should be: 6 * 925


{'2017': (6, 925)}

# Save Results

In [20]:
# Split Eora's country list into the countries that went into ROW and the ones
# kept individually (both in Eora order); the final list ends with "ROW".
row_country_list = [c for c in country_list if c not in my_country_list]
remaining_country_list = [c for c in country_list if c in my_country_list]
all_country = [*remaining_country_list, "ROW"]

# Final sector list: the reference list without its last two entries
all_sector = sector_list[:-2]

# Country-sector labels: all sectors of the first country, then all sectors of
# the second country, and so on
labels = pd.DataFrame({
    "Country": [c for c in all_country for _ in all_sector],
    "Sector": [s for _ in all_country for s in all_sector],
})

# Output folder (created if it does not exist yet)
save_path = "3_Result/eora_clean/"
os.makedirs(save_path, exist_ok=True)

# Write the labels (including the row index) and the two plain lists
labels.to_csv(os.path.join(save_path, "labels.csv"))
for values, file_name in ((all_country, "country_list.csv"), (all_sector, "sector_list.csv")):
    pd.Series(values).to_csv(os.path.join(save_path, file_name), index=False)



In [21]:
# Write the final T, FD and VA matrices of every year as plain CSV files
# (no index column). The result dicts are keyed by the year as a string.
for year in years:
    y_str = str(year)
    outputs = {
        f"T_final_{y_str}.csv": T_final,
        f"FD_final_{y_str}.csv": FD_final,
        f"VA_final_{y_str}.csv": VA_final,
    }
    for file_name, tables in outputs.items():
        pd.DataFrame(tables[y_str]).to_csv(os.path.join(save_path, file_name), index=False)




# Check

This part creates the original IO table with labels, to check whether the merging work above is correct.

In [22]:
# IO table with labels
# Export the raw (unmerged) IO table of the first sample year with readable
# row/column labels, so the merged results can be checked by hand.
year0 = next(iter(bp_matrix_T))
matrix = bp_matrix_T[year0]
labels_df = label_T[year0]

# One label per row: columns 0 and 3 of the label file joined by "_"
# (presumably country and sector name - confirm against labels_T.txt)
labels = labels_df.iloc[:, [0, 3]].astype(str).agg('_'.join, axis=1).tolist()

assert matrix.shape[0] == len(labels), \
    f"number of labels ({len(labels)}) not equal matrix dimensions ({matrix.shape[0]})"


df = pd.DataFrame(matrix,
                index=labels,
                columns=labels)

# The check folder is not created anywhere else; create it so that the export
# does not fail on a fresh checkout
os.makedirs(os.path.join(wd, "check"), exist_ok=True)

df.to_csv(os.path.join(wd, "check/IO_with_labels.csv"), index=True, encoding="utf-8")



In [23]:
# FD table with labels
# Raw final demand table of the first sample year: rows carry the IO labels
# built for the IO check table, columns carry labels from labels_FD.txt.
label_FD = read_files(bp_path, years, "labels_FD.txt")
labels_fd_df = label_FD[year0]

# Column label = columns 0 and 3 of the label file joined by "_"
labels_fd = (
    labels_fd_df.iloc[:, 0].astype(str) + "_" + labels_fd_df.iloc[:, 3].astype(str)
).tolist()

matrix = bp_matrix_FD[year0]
df = pd.DataFrame(matrix, index=labels, columns=labels_fd)

fd_check_file = os.path.join(wd, "check/FD_with_labels.csv")
df.to_csv(fd_check_file, index=True, encoding="utf-8")

In [24]:
# Column labels of the final FD table: every country of the final list repeated
# once per final demand account. FD_num is used instead of a literal 6 so the
# labels always match the number of accounts used in the merge.
expanded = [c for c in all_country for _ in range(FD_num)]
df_labels = pd.DataFrame({"country": expanded})
df_labels.to_csv(os.path.join(wd, "check/fd_labels.csv"), index=False, encoding="utf-8")

In [25]:
# VA table with labels
# Raw value added table of the first sample year: rows are labelled with
# column 1 of labels_VA.txt, columns with the IO labels built for the IO check table.
label_VA = read_files(bp_path, years, "labels_VA.txt")
labels_va_df = label_VA[year0]

# Only one column is used, so the label is simply that column as text
labels_va = labels_va_df.iloc[:, 1].astype(str).tolist()

matrix = bp_matrix_VA[year0]
df = pd.DataFrame(matrix, index=labels_va, columns=labels)

va_check_file = os.path.join(wd, "check/VA_with_labels.csv")
df.to_csv(va_check_file, index=True, encoding="utf-8")
