# EGEDA cleaning script

### For cleaning the EGEDA data sent by Edito: EGEDA_2018.xlsx

In [3]:
import numpy as np
import pandas as pd
from collections import OrderedDict
import re

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Read the raw EGEDA workbook.
# sheet_name=None loads every sheet, returning a dict of DataFrames keyed by
# sheet name. 'x', 'X' and '' are read as missing values; the workbook
# probably contains no x's or X's, so this is only a harmless safeguard.

RawEGEDA = pd.read_excel(
    '../../data/EGEDA_2018.xlsx',
    sheet_name=None,
    na_values=['x', 'X', ''],
)

In [None]:
# Show the sheet names that were loaded; each one is treated as an economy below.
RawEGEDA.keys()

In [None]:
# Year columns to unpivot from every sheet: 1980 through 2018 inclusive.
years = list(range(1980, 2019))

In [None]:
# Preview the first two rows of the '20_USA' sheet.
RawEGEDA['20_USA'].head(2)

In [None]:
# Distinct 'Item Code' labels present in the '20_USA' sheet.
RawEGEDA['20_USA']['Item Code'].unique()

In [None]:
# Reshape every sheet from wide (one column per year) to long (one row per
# product / item / year), tag the rows with the sheet's name as 'economy',
# and stack all sheets into one frame indexed by (economy, year).

economies = RawEGEDA.keys()

df_list = [
    pd.melt(
        RawEGEDA[sheet],
        id_vars=['Product Code', 'Item Code'],
        value_vars=years,
        var_name='year',
        value_name='value',
    )
    .assign(economy=sheet)
    .set_index(['economy', 'year'])
    for sheet in economies
]

df = pd.concat(df_list)

In [None]:
# Inspect the combined long-format frame (all sheets stacked).
df

In [None]:
# Spot check: first two coking coal rows, selected by the raw product label.
df[df['Product Code'] == '1.1 Coking coal'].head(2)

In [None]:
# Normalise the column names: spaces become underscores and everything is
# lower-cased, e.g. 'Product Code' -> 'product_code'.
df.columns = [c.replace(' ', '_').lower() for c in df.columns]

# Collapse runs of whitespace inside the labels to a single space.
# The pattern is a raw string so that '\s' reaches the regex engine as
# written instead of being parsed as an (invalid) string escape sequence.
df['product_code'] = df['product_code'].replace(r'\s+', ' ', regex=True)
df['item_code'] = df['item_code'].replace(r'\s+', ' ', regex=True)

#### Change product code and item code names

In [None]:
# Literal character substitutions applied, in this order, to turn a label
# into an identifier-style code.
_CODE_REPLACEMENTS = (
    (' ', '_'),
    ('.', '_'),
    ('/', '_'),
    ('(', ''),
    (')', ''),
    ('-', ''),
    (',', ''),
    ('&', 'and'),
)


def _to_code(labels):
    """Return the string Series *labels* as lower-case, underscore-separated codes.

    The labels are lower-cased, each substitution in ``_CODE_REPLACEMENTS``
    is applied, and trailing underscores are stripped.
    """
    codes = labels.str.lower()
    for old, new in _CODE_REPLACEMENTS:
        # regex=False: every pattern is a literal character. '.', '(' and ')'
        # are regex metacharacters and the default of `regex` differs between
        # pandas versions, so being explicit keeps the result the same everywhere.
        codes = codes.str.replace(old, new, regex=False)
    return codes.str.rstrip('_')


# New fuel_code column, derived from the product labels
df['fuel_code'] = _to_code(df['product_code'])

# New item_code_new column, derived from the item labels
df['item_code_new'] = _to_code(df['item_code'])

In [None]:
# List the distinct cleaned fuel codes.
list(df.loc[:,'fuel_code'].unique())

#### Create dictionary of EGEDA Product Codes and APERC Fuel codes
(No longer used)

In [None]:
# Same coking coal rows as before, now selected by the cleaned fuel code.
df[df.fuel_code == '1_1_coking_coal'].head() # spot check aligns with EGEDA raw

In [None]:
# Keep only the cleaned code columns and the values; the raw
# 'product_code' / 'item_code' label columns are dropped here.

df = df.loc[:, ['fuel_code', 'item_code_new', 'value']]

In [None]:
# Spot check the coking coal rows again after the column selection.
df[df.fuel_code == '1_1_coking_coal'].head(5)

In [None]:
# Insert a thermal coal subtotal: for every economy / year / item code, sum
# the rows of the component fuels listed below and store the result under the
# new fuel code '1_x_coal_thermal'.

thermal_df = df[df['fuel_code'].isin(['1_2_other_bituminous_coal', '1_3_subbituminous_coal', '1_4_anthracite', '3_peat','4_peat_products'])]
# Stop here if any component value is missing.
assert thermal_df.value.isna().sum() == 0

thermal_total = (
    thermal_df.groupby(['economy', 'year', 'item_code_new'])['value']
    .sum()
    .reset_index()
    .assign(fuel_code='1_x_coal_thermal')
    .set_index(['economy', 'year'])
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The subtotal rows come first, followed by the existing data, as before.
df1 = pd.concat([thermal_total, df])

df1

In [None]:
# Spot check: one of the component fuels is still present in df1.
df1[df1['fuel_code'] == '1_3_subbituminous_coal']

In [None]:
# Spot check: the newly added thermal coal subtotal rows.
df1[df1['fuel_code'] == '1_x_coal_thermal']

In [None]:
# Insert an NGL aggregate: for every economy / year / item code, sum the rows
# of the component fuels listed below and store the result under the new
# fuel code '6_x_ngls'.

NGL_df = df1[df1['fuel_code'].isin(['6_2_natural_gas_liquids', '6_3_refinery_feedstocks', '6_4_additives_oxygenates', '6_5_other_hydrocarbons'])]

# Stop here if any component value is missing.
assert NGL_df.value.isna().sum() == 0

ngl_total = (
    NGL_df.groupby(['economy', 'year', 'item_code_new'])['value']
    .sum()
    .reset_index()
    .assign(fuel_code='6_x_ngls')
    .set_index(['economy', 'year'])
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The aggregate rows come first, followed by the existing data, as before.
df2 = pd.concat([ngl_total, df1])

df2

In [None]:
# Distinct item codes in df2.
df2.loc[:,'item_code_new'].unique()

In [None]:
# Inspect df2 (original rows plus both aggregate fuel codes).
df2

In [None]:
# Electricity output is reported in GWh, so these rows are split out and
# converted to PJ with their own factor.

gwh_to_pj_df = df2[df2['item_code_new'] == '18__electricity_output_in_gwh']

gwh_to_pj_conversion = 0.0036  # 1 GWh = 0.0036 PJ

# Add the converted 'pj' column and rename 'value' to 'ktoe' BY NAME, so the
# labels cannot be silently mismatched if the column order ever changes.
# For these rows the 'ktoe' column holds the original GWh figures.
gwh_to_pj_df = (
    gwh_to_pj_df
    .assign(pj=gwh_to_pj_df['value'] * gwh_to_pj_conversion)
    .rename(columns={'value': 'ktoe'})
)
gwh_to_pj_df = gwh_to_pj_df[['fuel_code', 'item_code_new', 'ktoe', 'pj']]

In [None]:
# Inspect the electricity output rows after conversion.
gwh_to_pj_df

#### Add in PJ columns

In [None]:
# Conversion to PJ
#
# NOTE(review): the factor is 1, so the 'pj' column is a copy of the input
# values. That is only right if the workbook values are already in PJ. If
# they are in ktoe, the factor implied by the comment below would be
# 0.041868 (41.868 PJ per Mtoe) - confirm the units of the source data.

conversion_to_PJ = 1 # 41.868 PJ = 1 million tonnes of oil equivalent
# http://w.astro.berkeley.edu/~wright/fuel_energy.html

# Everything except the electricity output rows reported in GWh
df_pj = df2[df2['item_code_new'] != '18__electricity_output_in_gwh']

# Add the 'pj' column and rename 'value' to 'ktoe' BY NAME, so the labels
# cannot be silently mismatched if the column order ever changes.
df_pj = (
    df_pj
    .assign(pj=df_pj['value'] * conversion_to_PJ)
    .rename(columns={'value': 'ktoe'})
)
df_pj = df_pj[['fuel_code', 'item_code_new', 'ktoe', 'pj']]

In [None]:
# Preview the non-electricity rows with their new 'pj' column.
df_pj.head()

In [None]:
# Recombine the two parts so every row has a 'pj' value: df_pj rows first,
# then the converted electricity output rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.

df = pd.concat([df_pj, gwh_to_pj_df])

In [None]:
# Inspect the recombined frame.
df

In [None]:
# Spot check the sub-bituminous coal rows in the recombined frame.
df[df['fuel_code'] == '1_3_subbituminous_coal']

In [None]:
# Preview the first rows of the recombined frame.
df.head()

In [None]:
# Turn the (economy, year) index levels back into ordinary columns.
df_tidy = df.reset_index()

In [None]:
# Spot check the sub-bituminous coal rows in the flat (tidy) frame.
df_tidy[df_tidy['fuel_code'] == '1_3_subbituminous_coal']

In [None]:
# Load correct order of fuel code and item code. Update this csv based on new entries or desired order
# The 'fuel_code' and 'item_code_new' columns of this file are read further down.

ordered = pd.read_csv('../../data/order_2018.csv')
ordered

In [None]:
# Preview the fuel code ordering; [:-1] drops the last unique value.
list(ordered['fuel_code'].unique())[:-1]

#### Reorder fuel code and item code

In [None]:
# Take the unique fuel codes and the item codes in the order they appear in
# the ordering file. Missing entries in the fuel_code column are removed with
# dropna() rather than by slicing off the last element, so a real fuel code
# is never dropped when the placeholder is absent or not in last position.

order1 = list(ordered['fuel_code'].dropna().unique())
order2 = list(ordered['item_code_new'])

# Make both variables ordered categoricals using that order, so that sorting
# and viewing the data later follows it. Values not listed in the ordering
# file become missing (NaN) in the categorical column.

df_tidy['fuel_code'] = pd.Categorical(df_tidy['fuel_code'],
                                      categories = order1,
                                      ordered = True)

df_tidy['item_code_new'] = pd.Categorical(df_tidy['item_code_new'],
                                          categories = order2,
                                          ordered = True)

In [None]:
# Sort by the categorical order of fuel code, then item code.
df_tidy_sorted = df_tidy.sort_values(by=['fuel_code', 'item_code_new'])
# reset_index() renumbers the rows and keeps the old row labels in an
# 'index' column.
df_tidy_sorted = df_tidy_sorted.reset_index()
df_tidy_sorted[df_tidy_sorted.fuel_code == '1_3_subbituminous_coal'] #1_1_3_subbituminous_coal

#### Drop ktoe column and save as tidy data set

In [None]:
#df_tidy_sorted = df_tidy_sorted.drop(['index', 'ktoe'], axis = 1)
#df_tidy_sorted.to_csv("../../results/EGEDA_2018_tidy.csv", index = False)

In [None]:
#df_tidy_sorted[df_tidy_sorted['fuel_code'] == '1_x_coal_thermal'].head()

In [None]:
# View the first two rows of the sorted tidy frame

df_tidy_sorted.head(2)

#### Now, pivot the tidy dataset to provide it in wide format similar to RawEGEDA (so years are across the top)

In [None]:
# Wide layout: one row per economy / fuel code / item code, one column per
# year, holding the 'pj' values.
df_years = (
    df_tidy_sorted
    .pivot_table(
        values='pj',
        index=['economy', 'fuel_code', 'item_code_new'],
        columns='year',
    )
    .reset_index()
)

In [None]:
# Preview the year-wide table.
df_years.head()

In [None]:
#df_years.to_csv("../../results/EGEDA_2018_years.csv", index = False)

In [None]:
# Write the year-wide table to Excel, without the row index.
df_years.to_excel("../../results/EGEDA_2018_years.xlsx", index = False)

#### And now pivot so item codes are along the top

In [None]:
# Build the item-wide table from a NEW frame. Plain assignment
# (df_items = df_tidy_sorted) would only alias the frame, so converting
# item_code_new to str would also overwrite the categorical column in
# df_tidy_sorted itself.
# NOTE(review): the cast to str is presumably there so the pivoted column
# labels are plain strings rather than categories - confirm.
df_items = df_tidy_sorted.assign(
    item_code_new=df_tidy_sorted['item_code_new'].astype(str)
)

# One row per economy / fuel code / year, one column per item code
df_items = df_items.pivot_table(index = ['economy', 'fuel_code', 'year'], columns = 'item_code_new', values = 'pj').reset_index()

# Reorder columns based on order2 defined above: identifier columns first,
# then the item codes. Raises KeyError if an item code in order2 has no
# column in the pivoted table.

NewOrder = ['economy', 'fuel_code', 'year'] + list(order2)

df_items = df_items[NewOrder]

In [None]:
# Preview the item-wide table.
df_items.head()

In [None]:
#df_items.to_csv("../../results/EGEDA_2018_items.csv", index = True)

In [None]:
# Write the item-wide table to Excel, without the row index.
df_items.to_excel("../../results/EGEDA_2018_items.xlsx", index = False)