# EGEDA cleaning script

### For cleaning the EGEDA data sent by Edito: EGEDA_2018.xlsx

In [89]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re

In [90]:
# Load every worksheet of the EGEDA workbook. With sheet_name=None pandas
# returns a dict of DataFrames keyed by sheet name.
# 'x', 'X' and empty strings are read as missing values; the workbook is not
# expected to contain any x/X markers, so this should be a harmless safeguard.
RawEGEDA = pd.read_excel(
    '../../data/EGEDA_2018.xlsx',
    sheet_name=None,
    na_values=['x', 'X', ''],
)

In [91]:
# Show the sheet names that were loaded; each key is used as the economy label below.
RawEGEDA.keys()

dict_keys(['APEC', '01_AUS', '02_BD', '03_CDA', '04_CHL', '05_PRC', '06_HKC', '07_INA', '08_JPN', '09_ROK', '10_MAS', '11_MEX', '12_NZ', '13_PNG', '14_PE', '15_RP', '16_RUS', '17_SIN', '18_CT', '19_THA', '20_USA', '21_VN', '22_SEA', '23_NEA', '24_OAM', '25_OCE', '23b_ONEA', '24b_OOAM'])

In [92]:
# Year columns to keep from each sheet: 1980 through 2018 inclusive.
years = [*range(1980, 2019)]

In [93]:
# Reshape every sheet from wide (one column per year) to long (one row per
# product/item/year), tag it with its sheet name as 'economy', and stack the
# results into a single frame indexed by (economy, year).
economies = RawEGEDA.keys()

df_list = [
    pd.melt(
        RawEGEDA[economy],
        id_vars=['Product Code', 'Item Code'],
        value_vars=years,
        var_name='year',
        value_name='value',
    )
    .assign(economy=economy)
    .set_index(['economy', 'year'])
    for economy in economies
]

df = pd.concat(df_list)

In [94]:
# Normalise the column labels to lower snake_case ('Product Code' -> 'product_code').
# A concrete list is assigned rather than a lazy map object.
df.columns = [c.replace(' ', '_').lower() for c in df.columns]

# Collapse runs of whitespace inside the code labels to a single space.
# The pattern is a raw string so that "\s" is passed to the regex engine
# untouched instead of being parsed as an (invalid) Python string escape.
df['product_code'] = df['product_code'].replace(r'\s+', ' ', regex=True)
df['item_code'] = df['item_code'].replace(r'\s+', ' ', regex=True)

## Change product code and item code names

In [95]:
# Literal substitutions applied, in this order, to turn a raw EGEDA label into
# a lower-case identifier. The order matters: '__' is collapsed once, only
# after the individual characters have been replaced.
_CODE_REPLACEMENTS = [
    (' ', '_'),
    ('.', '_'),
    ('/', '_'),
    ('(', ''),
    (')', ''),
    ('-', ''),
    (',', ''),
    ('&', 'and'),
    ('__', '_'),
]


def _to_code(labels, extra_replacements=()):
    """Return *labels* lower-cased with the literal replacements applied.

    labels: a string Series of raw product or item labels.
    extra_replacements: additional (old, new) pairs applied after the
        standard ones and before trailing underscores are stripped.

    Every replacement is done with regex=False so that '.', '(' and ')' are
    always treated as plain characters, regardless of the pandas version's
    default for ``Series.str.replace``.
    """
    cleaned = labels.str.lower()
    for old, new in (*_CODE_REPLACEMENTS, *extra_replacements):
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned.str.rstrip('_')


# new fuel_code (colons are additionally removed from product labels)
df['fuel_code'] = _to_code(df['product_code'], extra_replacements=[(':', '')])

# item_code_new
df['item_code_new'] = _to_code(df['item_code'])

In [96]:
# list(df.loc[:,'fuel_code'].unique())

In [97]:
# Keep only the cleaned code columns and the value; the raw 'product_code'
# and 'item_code' columns are dropped by not being selected.
kept_columns = ['fuel_code', 'item_code_new', 'value']
df = df[kept_columns]

In [99]:
# Input thermal coal variable/subtotal: sum the component fuels below per
# economy, year and item code, and label the result '1_x_coal_thermal'.
thermal_fuels = ['1_2_other_bituminous_coal', '1_3_subbituminous_coal', '1_4_anthracite', '3_peat', '4_peat_products']
thermal_df = df[df['fuel_code'].isin(thermal_fuels)]

# sum() skips missing values, so stop here if any component value is missing
# rather than silently producing a partial subtotal.
assert thermal_df.value.isna().sum() == 0

thermal_total = (
    thermal_df.groupby(['economy', 'year', 'item_code_new'])['value']
    .sum()
    .reset_index()
    .assign(fuel_code='1_x_coal_thermal')
    .set_index(['economy', 'year'])
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# sort=False keeps the subtotal's column order (item_code_new, value, fuel_code).
df1 = pd.concat([thermal_total, df], sort=False)

In [100]:
# And also insert NGL aggregate variable: sum the component fuels below per
# economy, year and item code, and label the result '6_x_ngls'.
ngl_fuels = ['6_2_natural_gas_liquids', '6_3_refinery_feedstocks', '6_4_additives_oxygenates', '6_5_other_hydrocarbons']
NGL_df = df1[df1['fuel_code'].isin(ngl_fuels)]

# sum() skips missing values, so stop here if any component value is missing
# rather than silently producing a partial subtotal.
assert NGL_df.value.isna().sum() == 0

ngl_total = (
    NGL_df.groupby(['economy', 'year', 'item_code_new'])['value']
    .sum()
    .reset_index()
    .assign(fuel_code='6_x_ngls')
    .set_index(['economy', 'year'])
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# sort=False keeps the subtotal's column order (item_code_new, value, fuel_code).
df2 = pd.concat([ngl_total, df1], sort=False)

In [None]:
# And also insert the other petroleum products aggregate variable: sum the
# component fuels below per economy, year and item code, and label the result
# '7_x_other_petroleum_products'.
otherpet_fuels = ['7_12_white_spirit_sbp', '7_13_lubricants', '7_14_bitumen', '7_15_paraffin_waxes', '7_16_petroleum_coke', '7_17_other_products']
otherpet_df = df2[df2['fuel_code'].isin(otherpet_fuels)]

# sum() skips missing values, so stop here if any component value is missing
# rather than silently producing a partial subtotal.
assert otherpet_df.value.isna().sum() == 0

otherpet_total = (
    otherpet_df.groupby(['economy', 'year', 'item_code_new'])['value']
    .sum()
    .reset_index()
    .assign(fuel_code='7_x_other_petroleum_products')
    .set_index(['economy', 'year'])
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# sort=False keeps the subtotal's column order (item_code_new, value, fuel_code).
df3 = pd.concat([otherpet_total, df2], sort=False)

In [101]:
# Separate the electricity-output rows, which are reported in GWh, from all
# other rows so that only they are converted to PJ.
is_gwh = df3['item_code_new'] == '18_electricity_output_in_gwh'
output_columns = ['fuel_code', 'item_code_new', 'pj']

gwh_to_pj_conversion = 0.0036  # 1 GWh = 3.6 TJ = 0.0036 PJ

# electricity gwh data changed to pj, and the item code relabelled to match.
# Columns are addressed by name (not by position) so the result stays correct
# whatever column order df3 happens to have.
gwh_to_pj_df = df3[is_gwh].assign(
    pj=lambda rows: rows['value'] * gwh_to_pj_conversion,
    item_code_new='18_electricity_output_in_pj',
)
gwh_to_pj_df = gwh_to_pj_df[output_columns]

# All other rows are passed through unconverted: 'value' is only renamed to
# 'pj' so both frames share the same columns.
# NOTE(review): this assumes the non-electricity values are already in PJ;
# no ktoe conversion is applied anywhere in this cell - confirm.
everything_else_df = df3[~is_gwh].rename(columns={'value': 'pj'})
everything_else_df = everything_else_df[output_columns]

In [102]:
# Now stack gwh_to_pj_df underneath everything_else_df (so all data is now in PJ).
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df = pd.concat([everything_else_df, gwh_to_pj_df])

In [103]:
# Move the (economy, year) index levels back into ordinary columns.
df_tidy = df.reset_index(drop=False)

In [104]:
# Read the reference ordering of fuel codes and item codes.
# Update this csv whenever new entries are added or a different order is wanted.

ordered = pd.read_csv(filepath_or_buffer='../../data/order_2018.csv')
# ordered

Unnamed: 0,fuel_code,item_code_new
0,1_coal,1_indigenous_production
1,1_x_coal_thermal,2_imports
2,1_1_coking_coal,3_exports
3,1_2_other_bituminous_coal,4_international_marine_bunkers
4,1_3_subbituminous_coal,5_international_aviation_bunkers
...,...,...
90,,17_2_industry_sector
91,,17_3_transport_sector
92,,17_4_other_sector
93,,18_electricity_output_in_pj


## Reorder fuel code and item code

In [105]:
# Take the fuel codes and item codes in the order they appear in the ordering
# csv. The fuel_code column appears to be shorter than the item_code_new
# column and is padded with blanks (read as NaN); those are dropped explicitly
# rather than by assuming NaN is always the last unique value, which would
# discard a real fuel code if the column had no blanks.

order1 = ordered['fuel_code'].dropna().unique().tolist()
order2 = ordered['item_code_new'].dropna().unique().tolist()

# Take order defined above and define each of the variables as categorical in that already established order (for the benefit of viewing data later).
# Values that are not listed in the ordering csv become NaN.

df_tidy['fuel_code'] = pd.Categorical(df_tidy['fuel_code'],
                                      categories=order1,
                                      ordered=True)

df_tidy['item_code_new'] = pd.Categorical(df_tidy['item_code_new'],
                                          categories=order2,
                                          ordered=True)

In [106]:
# Order rows by fuel code, then item code (both categorical, so the order is
# the one defined by their categories), and renumber the rows from zero.
sort_keys = ['fuel_code', 'item_code_new']
df_tidy_sorted = df_tidy.sort_values(by=sort_keys)
df_tidy_sorted = df_tidy_sorted.reset_index(drop=True)
# df_tidy_sorted[df_tidy_sorted['fuel_code'] == '1_3_subbituminous_coal'] #1_1_3_subbituminous_coal

Unnamed: 0,economy,year,fuel_code,item_code_new,pj
419328,APEC,1980,1_3_subbituminous_coal,1_indigenous_production,3161.120541
419329,APEC,1981,1_3_subbituminous_coal,1_indigenous_production,3266.929560
419330,APEC,1982,1_3_subbituminous_coal,1_indigenous_production,3318.013461
419331,APEC,1983,1_3_subbituminous_coal,1_indigenous_production,3214.534228
419332,APEC,1984,1_3_subbituminous_coal,1_indigenous_production,3736.259582
...,...,...,...,...,...
524155,24b_OOAM,2014,1_3_subbituminous_coal,,-226.149709
524156,24b_OOAM,2015,1_3_subbituminous_coal,,-234.157843
524157,24b_OOAM,2016,1_3_subbituminous_coal,,-226.787777
524158,24b_OOAM,2017,1_3_subbituminous_coal,,-157.520144


### Now, pivot the tidy dataset to provide it in wide format similar to RawEGEDA (so years are across the top)

In [107]:
# One row per economy / fuel code / item code, one column per year.
df_years = (
    df_tidy_sorted
    .pivot_table(
        values='pj',
        index=['economy', 'fuel_code', 'item_code_new'],
        columns='year',
    )
    .reset_index(drop=False)
)

In [108]:
# Write the year-wide table to disk without the row index.
df_years.to_csv(path_or_buf="../../results/EGEDA_2018_years.csv", index=False)

In [109]:
# df_years.to_excel("../../results/EGEDA_2018_years.xlsx", index = False)

## And now pivot so item codes are along the top

In [110]:
# Build a new frame with item_code_new as plain strings instead of aliasing
# df_tidy_sorted and overwriting its column in place, so df_tidy_sorted keeps
# its categorical column if earlier cells are re-run or inspected.
# NOTE(review): the string conversion is presumably needed so the pivoted
# column labels are ordinary strings; item codes that were NaN become the
# literal label 'nan' and are dropped by the column selection below.
df_items = df_tidy_sorted.assign(
    item_code_new=df_tidy_sorted['item_code_new'].astype(str)
)

# One row per economy / fuel code / year, one column per item code.
df_items = df_items.pivot_table(index=['economy', 'fuel_code', 'year'], columns='item_code_new', values='pj').reset_index()

# Reorder columns based on order2 defined above

NewOrder = ['economy', 'fuel_code', 'year']
NewOrder.extend(order2)

df_items = df_items[NewOrder]

In [111]:
# Write the item-wide table to disk without the row index.
df_items.to_csv(path_or_buf="../../results/EGEDA_2018_items.csv", index=False)

In [112]:
# df_items.to_excel("../../results/EGEDA_2018_items.xlsx", index = False)