# EGEDA cleaning script

### For cleaning the EGEDA data sent by Edito: EGEDA_2018.xlsx

In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re

In [2]:
# Load the raw EGEDA workbook. sheet_name=None makes pandas return a dict
# keyed by sheet name, with one DataFrame per sheet.
# 'x', 'X' and empty cells are read as missing values. There are probably no
# x's or X's in the EGEDA xlsx file, but the setting is harmless either way.
RawEGEDA = pd.read_excel(
    '../../data/00APEC Total_15Jan2021.xlsx',
    sheet_name=None,
    na_values=['x', 'X', ''],
)

In [3]:
# Show the sheet names that were loaded (the keys of the dict returned by read_excel)
RawEGEDA.keys()

dict_keys(['APEC', '01_AUS', '02_BD', '03_CDA', '04_CHL', '05_PRC', '06_HKC', '07_INA', '08_JPN', '09_ROK', '10_MAS', '11_MEX', '12_NZ', '13_PNG', '14_PE', '15_RP', '16_RUS', '17_SIN', '18_CT', '19_THA', '20_USA', '21_VN', '22_SEA', '23_NEA', '24_OAM', '25_OCE', '23b_ONEA', '24b_OOAM'])

In [4]:
# Year columns to keep from each sheet: 1980 through 2018 inclusive
# (the upper bound of range() is exclusive).
years = [*range(1980, 2019)]

In [5]:
def _melt_sheet(economy, sheet):
    """Reshape one economy's wide sheet (one column per year) into long format.

    Returns a frame indexed by (economy, year) with the columns
    'Product Code', 'Item Code' and 'value'.
    """
    long_sheet = sheet.melt(id_vars = ['Product Code', 'Item Code'],
                            value_vars = years,
                            var_name = 'year',
                            value_name = 'value')
    long_sheet = long_sheet.assign(economy = economy)
    return long_sheet.set_index(['economy', 'year'])


# One long-format frame per sheet, stacked into a single frame
economies = RawEGEDA.keys()
df_list = [_melt_sheet(economy, RawEGEDA[economy]) for economy in economies]

df = pd.concat(df_list)

In [6]:
# Normalise the column labels to snake_case, e.g. 'Product Code' -> 'product_code'
df.columns = [c.replace(' ', '_').lower() for c in df.columns]

# Collapse any run of whitespace inside the code labels to a single space.
# The pattern is a raw string so that '\s' reaches the regex engine as written
# instead of being parsed as an (invalid) Python string escape.
df['product_code'] = df['product_code'].replace(r'\s+', ' ', regex = True)
df['item_code'] = df['item_code'].replace(r'\s+', ' ', regex = True)

## Change product code and item code names

In [7]:
def _normalise_code(codes, drop_colons = False):
    """Turn free-text EGEDA labels into lower-case snake_case identifiers.

    codes       -- Series of label strings
    drop_colons -- when True, ':' characters are removed as well

    Every substitution is a literal one (regex=False), so characters such as
    '.', '(' and ')' are never interpreted as regular-expression syntax,
    whatever the pandas version's default for `regex` is.
    """
    literal_swaps = [(' ', '_'), ('.', '_'), ('/', '_'), ('(', ''), (')', ''),
                     ('-', ''), (',', ''), ('&', 'and'), ('__', '_')]

    cleaned = codes.str.lower()
    for old, new in literal_swaps:
        cleaned = cleaned.str.replace(old, new, regex = False)
    if drop_colons:
        cleaned = cleaned.str.replace(':', '', regex = False)

    # Drop any underscore left dangling at the end of the label
    return cleaned.str.rstrip('_')


# new fuel_code (built from product_code; colons are stripped here only)
df['fuel_code'] = _normalise_code(df['product_code'], drop_colons = True)

# item_code_new (built from item_code)
df['item_code_new'] = _normalise_code(df['item_code'])

In [8]:
# list(df.loc[:,'fuel_code'].unique())

In [9]:
# Keep only the cleaned code columns and the values; the original
# product_code / item_code columns are dropped by this selection.
kept_columns = ['fuel_code', 'item_code_new', 'value']
df = df.loc[:, kept_columns]

In [10]:
# Insert a thermal coal subtotal ('1_x_coal_thermal'): the sum of the fuels
# listed below for every (economy, year, item_code_new) combination.

thermal_df = df[df['fuel_code'].isin(['1_2_other_bituminous_coal', '1_3_subbituminous_coal', '1_4_anthracite', '3_peat', '4_peat_products'])]

# sum() skips NaN silently, so make sure there are none to skip
assert thermal_df.value.isna().sum() == 0

thermal_total = (thermal_df.groupby(['economy', 'year', 'item_code_new'])['value']
                           .sum()
                           .reset_index()
                           .assign(fuel_code = '1_x_coal_thermal')
                           .set_index(['economy', 'year']))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The new subtotal rows come first, followed by all the existing rows.
df1 = pd.concat([thermal_total, df])

In [11]:
# Insert an NGL aggregate ('6_x_ngls'): the sum of the fuels listed below
# for every (economy, year, item_code_new) combination.

NGL_df = df1[df1['fuel_code'].isin(['6_2_natural_gas_liquids', '6_3_refinery_feedstocks', '6_4_additives_oxygenates', '6_5_other_hydrocarbons'])]

# sum() skips NaN silently, so make sure there are none to skip
assert NGL_df.value.isna().sum() == 0

ngl_total = (NGL_df.groupby(['economy', 'year', 'item_code_new'])['value']
                   .sum()
                   .reset_index()
                   .assign(fuel_code = '6_x_ngls')
                   .set_index(['economy', 'year']))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The new aggregate rows come first, followed by all the existing rows.
df2 = pd.concat([ngl_total, df1])

In [12]:
# Insert an other petroleum products aggregate ('7_x_other_petroleum_products'):
# the sum of the fuels listed below for every (economy, year, item_code_new) combination.

otherpet_df = df2[df2['fuel_code'].isin(['7_12_white_spirit_sbp', '7_13_lubricants', '7_14_bitumen', '7_15_paraffin_waxes', '7_16_petroleum_coke', '7_17_other_products'])]

# sum() skips NaN silently, so make sure there are none to skip
assert otherpet_df.value.isna().sum() == 0

otherpet_total = (otherpet_df.groupby(['economy', 'year', 'item_code_new'])['value']
                             .sum()
                             .reset_index()
                             .assign(fuel_code = '7_x_other_petroleum_products')
                             .set_index(['economy', 'year']))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The new aggregate rows come first, followed by all the existing rows.
df3 = pd.concat([otherpet_total, df2])

In [13]:
# Insert a jet fuel aggregate ('7_x_jet_fuel'): gasoline-type plus kerosene-type
# jet fuel for every (economy, year, item_code_new) combination.

jetfuel_df = df3[df3['fuel_code'].isin(['7_4_gasoline_type_jet_fuel', '7_5_kerosene_type_jet_fuel'])]

# sum() skips NaN silently, so make sure there are none to skip
assert jetfuel_df.value.isna().sum() == 0

jetfuel_total = (jetfuel_df.groupby(['economy', 'year', 'item_code_new'])['value']
                           .sum()
                           .reset_index()
                           .assign(fuel_code = '7_x_jet_fuel')
                           .set_index(['economy', 'year']))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The new aggregate rows come first, followed by all the existing rows.
df4 = pd.concat([jetfuel_total, df3])

In [14]:
# # New solar difference variable

# new_solar_df = df4[df4['fuel_code'].isin(['12_solar', '12_1_of_which_photovoltaics'])]

# assert new_solar_df.value.isna().sum() == 0

# df5 = new_solar_df.groupby(['economy', 'year', 'item_code_new'])['value'].subtract().reset_index().assign(fuel_code = '12_x_non_pv_solar').set_index(['economy', 'year']).append(df4)

In [15]:
# Split off the rows reported in GWh (electricity output) so they can be
# converted to PJ separately from everything else.

gwh_to_pj_conversion = 0.0036

is_gwh = df4['item_code_new'] == '18_electricity_output_in_gwh'

# Electricity output: GWh -> PJ, and relabel the item code to match the new unit.
# Columns are addressed by name so the result does not depend on the column
# order that df4 happens to have.
gwh_to_pj_df = df4[is_gwh].assign(pj = lambda rows: rows['value'] * gwh_to_pj_conversion,
                                  item_code_new = '18_electricity_output_in_pj')
gwh_to_pj_df = gwh_to_pj_df[['fuel_code', 'item_code_new', 'pj']]

# Everything else: the 'value' column is carried over unchanged under the name
# 'pj', so it lines up with the converted electricity rows.
# NOTE(review): no ktoe -> PJ factor is applied here; this assumes the source
# values are already in PJ. Confirm against the workbook.
everything_else_df = df4[~is_gwh].rename(columns = {'value': 'pj'})
everything_else_df = everything_else_df[['fuel_code', 'item_code_new', 'pj']]

In [16]:
# Recombine the two parts: non-electricity rows first, then the electricity
# rows converted from GWh, so every row carries its data in the 'pj' column.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.

df = pd.concat([everything_else_df, gwh_to_pj_df])

In [17]:
# Move the (economy, year) index levels back into ordinary columns
df_tidy = df.reset_index(drop = False)

In [34]:
# Read the desired display order of fuel codes and item codes.
# Update this csv when new entries appear or a different order is wanted.

ordered = pd.read_csv(filepath_or_buffer = '../../data/order_2018_egeda.csv')
# ordered

## Reorder fuel code and item code

In [35]:
# Take the fuel codes and item codes in the order they appear in the ordering csv.
# Missing entries (blank cells in the csv) are dropped explicitly, instead of
# assuming the only NaN is the last unique value and slicing it off with [:-1].
# That slice would discard a real fuel code whenever the column has no NaN.

order1 = list(ordered['fuel_code'].dropna().unique())
order2 = list(ordered['item_code_new'].dropna())

# Make both variables ordered categoricals following that established order
# (for the benefit of sorting and viewing the data later). Values that are not
# in the category lists become NaN.

df_tidy['fuel_code'] = pd.Categorical(df_tidy['fuel_code'],
                                      categories = order1,
                                      ordered = True)

df_tidy['item_code_new'] = pd.Categorical(df_tidy['item_code_new'],
                                          categories = order2,
                                          ordered = True)

In [36]:
# Sort by fuel code, then item code, and renumber the rows from 0
sort_keys = ['fuel_code', 'item_code_new']
df_tidy_sorted = df_tidy.sort_values(by = sort_keys)
df_tidy_sorted = df_tidy_sorted.reset_index(drop = True)
# df_tidy_sorted[df_tidy_sorted['fuel_code'] == '1_3_subbituminous_coal'] #1_1_3_subbituminous_coal

### Now, pivot the tidy dataset to provide it in wide format similar to RawEGEDA (so years are across the top)

In [37]:
# Wide table: one row per (economy, fuel_code, item_code_new), one column per year
years_wide = df_tidy_sorted.pivot_table(values = 'pj',
                                        index = ['economy', 'fuel_code', 'item_code_new'],
                                        columns = 'year')
df_years = years_wide.reset_index(drop = False)

In [38]:
# Save the years-across-the-top table, without the row index
df_years.to_csv(path_or_buf = "../../results/EGEDA_2018_years.csv", index = False)

In [39]:
# df_years.to_excel("../../results/EGEDA_2018_years.xlsx", index = False)

## And now pivot so item codes are along the top

In [40]:
# Work on a copy: plain assignment would only alias df_tidy_sorted, and the
# astype(str) below would then overwrite its ordered categorical column too.
df_items = df_tidy_sorted.copy()

# Convert the categorical item codes to plain strings before pivoting
# (presumably so the pivoted column labels are ordinary strings that can be
# selected with the order2 list below)
df_items['item_code_new'] = df_items['item_code_new'].astype(str)

# Wide table: one row per (economy, fuel_code, year), one column per item code
df_items = df_items.pivot_table(index = ['economy', 'fuel_code', 'year'], columns = 'item_code_new', values = 'pj').reset_index()

# Reorder columns based on order2 defined above

NewOrder = ['economy', 'fuel_code', 'year']
NewOrder.extend(order2)

df_items = df_items[NewOrder]

In [41]:
# Save the item-codes-across-the-top table, without the row index
df_items.to_csv(path_or_buf = "../../results/EGEDA_2018_items.csv", index = False)

In [42]:
# df_items.to_excel("../../results/EGEDA_2018_items.xlsx", index = False)