# Data processing for FlexSUS

institution: DTU

author: [tilseb](mailto:tilseb@dtu.dk)

date created: 2020-01-30

licensed under: [GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007](https://www.gnu.org/licenses/gpl-3.0.html)

## Installation

1. Installation via miniconda. Get miniconda with Python 3.7 [here](https://docs.conda.io/en/latest/miniconda.html).
2. Open the Anaconda prompt: press the `start button`, type `anaconda prompt` and hit enter.
3. Navigate to the folder containing this script: `cd <path_to_folder>`
4. Use the requirements.yml file in the root directory to set up the environment: `conda env create -f requirements.yml`
5. Activate the flexsus environment: `conda activate flexsus`
6. Open the notebook: `jupyter notebook`

## Description

## Content

## Script set-up

In [17]:
# import packages
import os
import pandas as pd
import time

In [18]:
# timestamp of the current local time (format YYYYmmddHHMM), used to tag the output file
timestr = time.strftime("%Y%m%d%H%M", time.localtime())

In [19]:
# make directories (nothing happens for directories that already exist)
dirs = {'output'}
for i in dirs:
    # exist_ok avoids the check-then-create race of isdir()/mkdir() and
    # makedirs also creates missing parent directories
    os.makedirs(i, exist_ok=True)

## Define output resolution

In [20]:
# countries to consider (an empty set selects all countries)
ccc = {
    'DENMARK',
    'NORWAY',
}

# regions to consider (an empty set selects all regions)
rrr = {
    'DK1', 'DK2',
    'NO1', 'NO2', 'NO3', 'NO4', 'NO5',
}

In [21]:
# set of years (if empty, select all)
# NOTE: '{}' would create an empty dict, so the empty set is spelled set()
yyy = set()  # e.g. {2025, 2035, 2045}

## Load and process functions

In [22]:
def readSmallData(f):
    """Read 'data/<f>.csv' in one go and return it as a data frame.

    f -- file name without the '.csv' extension
    """
    path = 'data/' + f + '.csv'
    return pd.read_csv(path, engine='c', encoding='utf8', low_memory=False)

In [23]:
def readData(f, chunksize=500000):
    """Read 'data/<f>.csv' chunk by chunk and return all rows as one data frame.

    f         -- file name without the '.csv' extension
    chunksize -- number of rows parsed per chunk (default: 500000)
    """
    reader = pd.read_csv('data/' + f + '.csv', encoding='utf8', engine='c', low_memory=False,
                         chunksize=chunksize)
    # exhaust the chunk iterator, then stack the chunks row-wise
    return pd.concat(list(reader), axis=0)

In [24]:
def epsToZeros(df):
    """Return a new data frame in which every cell equal to the string 'Eps' is replaced by 0."""
    return df.replace(to_replace='Eps', value=0)

In [25]:
def filterYearAndCountry(df,y,c):
    """Keep only the rows whose 'Dim3' value is in y and whose 'Dim4' value is in c.

    df -- data frame with the columns 'Dim3' and 'Dim4'
    y  -- collection of accepted 'Dim3' values (the years); if empty, no filtering on 'Dim3'
    c  -- collection of accepted 'Dim4' values (the countries); if empty, no filtering on 'Dim4'

    Returns the filtered data frame (df itself if both collections are empty).
    """
    # select the wanted rows directly instead of excluding the complement of
    # the wanted values, which required a Python set of the whole column
    if y:
        df = df.loc[df.Dim3.isin(list(y)), :]
    if c:
        df = df.loc[df.Dim4.isin(list(c)), :]
    return df

In [26]:
def makeValFloat(df):
    """Cast the 'Val' column of df to float (df is modified in place) and return df."""
    as_float = df.Val.astype(float)
    df.Val = as_float
    return df

In [27]:
def groupFrame(df,i,f,aver):
    """Aggregate 'Val' per (i, 'Dim2') group and return it as a table.

    df   -- data frame with the columns i, 'Dim2' and 'Val'
    i    -- name of the column to group against (together with 'Dim2')
    f    -- label used as first level of the resulting column index
    aver -- if true use the mean of each group, else the sum

    The result has the 'Dim2' values as rows and two-level columns
    (f + ' (mean)' or f + ' (sum)', value of column i); missing
    combinations are filled with 0.
    """
    grouped = df.groupby([i, 'Dim2'])['Val']
    if aver:
        values, label = grouped.mean(), f + ' (mean)'
    else:
        values, label = grouped.sum(), f + ' (sum)'
    # pivot the grouping column into the columns, 'Dim2' into the rows
    table = values.unstack().T
    table.columns = pd.MultiIndex.from_product([[label], table.columns])
    return table.fillna(0)

In [28]:
def groupFrame2(df,f,aver):
    """Aggregate 'Val' per ('ctr', 'Dim2') group and return it as a table.

    df   -- data frame with the columns 'ctr', 'Dim2' and 'Val'
    f    -- label used as first level of the resulting column index
    aver -- if true use the mean of each group, else the sum

    Same as groupFrame() with the grouping column fixed to 'ctr'.
    """
    # delegate instead of duplicating the grouping logic
    return groupFrame(df, 'ctr', f, aver)

## Auxiliary files

In [29]:
# load the grouping criteria; the first csv column (the data file names) becomes the index
ctr = pd.read_csv(
    'auxils/criteria.csv',
    index_col=0,
    encoding='utf8',
)
# preview the first five rows
ctr.head(n=5)

Unnamed: 0_level_0,group_criteria,group_ccc,aver
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ECONOMY_ELEC_TRANSMISSION,Dim7,yes,no
ECONOMY_GENERATION,Dim11,yes,no
ECONOMY_HEAT_TRANSMISSION,Dim8,yes,no
ELEC_DEMAND,Dim6,yes,no
ELEC_PRICE,Dim7,yes,yes


In [30]:
# get sorted list of data files (w/o file extension, i.e. '.csv');
# only '.csv' files are listed, because readData() appends '.csv' to every name,
# and splitext() keeps file names that contain additional dots intact
lf = sorted(os.path.splitext(name)[0] for name in os.listdir('data') if name.endswith('.csv'))

# list files
print(lf)

['ECONOMY_ELEC_TRANSMISSION', 'ECONOMY_GENERATION', 'ECONOMY_HEAT_TRANSMISSION', 'ELEC_DEMAND', 'ELEC_PRICE', 'ELEC_PRICE_HOURLY', 'ELEC_TRANSMISSION_CAPACITY', 'ELEC_TRANSMISSION_FLOW', 'EMISSIONS_CO2', 'ENERGY_PRODUCTION', 'FUEL_CONSUMPTION', 'GENERATION_CAPACITY', 'HEAT_DEMAND', 'HEAT_PRICE', 'HEAT_PRICE_HOURLY', 'HEAT_TRANSMISSION_CAPACITY', 'HEAT_TRANSMISSION_FLOW', 'STORAGE_CAPACITY', 'SYSTEM_COSTS']


In [31]:
# compare ctr and lf for consistency
# data files without an entry in criteria.csv cannot be looked up during processing
for diff in sorted(set(lf).difference(ctr.index), key=str):
    print(str(diff) + ' is not defined in the criteria.csv file.')
# entries in criteria.csv without a matching data file are never processed
for diff in sorted(set(ctr.index).difference(lf), key=str):
    print(str(diff) + ' is defined in the criteria.csv file, but has no data file.')

# Process data

## Implicit processing version

In [32]:
# collect the grouped data frame of every file; they are joined once after the loop,
# which avoids copying the growing result frame in every iteration
frames = []

for f in lf:
    # read the data for each file into a temporary data frame
    df = readData(f)
    # convert any Eps (GAMS specific notation) to zeros
    df = epsToZeros(df)
    # convert all data in the 'Val' column to floats
    df = makeValFloat(df)

    # select the columns (dimensions) to group the data against as defined in the criteria.csv file;
    # if more than one dimension shall be used, they are listed in one cell and separated by '&'
    dim = ctr.loc[f, 'group_criteria']
    dims = dim.split('&')
    # save the first (or only) dimension in a new column ('ctr')
    df['ctr'] = df[dims[0]]
    # add the other dimensions subsequently to the 'ctr' column
    for i in dims[1:]:
        df['ctr'] += '-' + df[i]

    # keep only the specified countries, if indicated in the criteria.csv file, else keep all
    if ctr.loc[f, 'group_ccc'] == 'yes':
        df = filterYearAndCountry(df,yyy,ccc)
    else:
        df = filterYearAndCountry(df,yyy,{})

    # calculate the averages for each group (mean), if indicated in the criteria.csv file, else do the sum (default)
    aver = bool(ctr.loc[f, 'aver'] == 'yes')

    # call the group function
    df = groupFrame2(df,f,aver)
    # remember the grouped data of this file
    frames.append(df)

# make a data frame for all processed data (one column block per file)
df_all = pd.concat(frames, axis=1, sort=False) if frames else pd.DataFrame()

# delete naming of index and column
df_all.index.set_names([None], inplace=True)
df_all.columns.set_names([None, None], inplace=True)

# save data to csv
df_all.to_csv('output/{}-data_all.csv'.format(timestr), encoding='utf8')

# show data frame
df_all.head(2)

Unnamed: 0_level_0,ECONOMY_ELEC_TRANSMISSION (sum),ECONOMY_ELEC_TRANSMISSION (sum),ECONOMY_ELEC_TRANSMISSION (sum),ECONOMY_ELEC_TRANSMISSION (sum),ECONOMY_GENERATION (sum),ECONOMY_GENERATION (sum),ECONOMY_GENERATION (sum),ECONOMY_GENERATION (sum),ECONOMY_GENERATION (sum),ECONOMY_GENERATION (sum),...,SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum),SYSTEM_COSTS (sum)
Unnamed: 0_level_1,TRANSMISSION_CAPITAL_COSTS,TRANSMISSION_OPERATIONAL_COSTS,TRANSMISSION_TRADE_COSTS,TRANSMISSION_TRADE_INCOME,ELECTRICITY_SALE,ENERGY_SPECIFIC_REVENUE,GENERATION_CAPITAL_COSTS,GENERATION_CO2_TAX,GENERATION_FIXED_COSTS,GENERATION_FUEL_COSTS,...,GENERATION_OPERATIONAL_COSTS,GENERATION_OTHER_EMI_TAX,GENERATION_UC_COSTS,GRID_TARIFFS,HEAT_TRANSMISSION_CAPITAL_COSTS,HEAT_TRANSMISSION_OPERATIONAL_COSTS,HYDRO_PROFILE,TAXES,TRANSMISSION_CAPITAL_COSTS,TRANSMISSION_OPERATIONAL_COSTS
Scenario1,1657.657335,0.065144,8525.846009,33740.028628,44091.194561,44814.327069,12342.189366,231.548661,6357.047931,3984.628893,...,2662.357902,0.020172,144.105256,9975.205058,0.0,0.000308,-126.85874,12776.638785,1657.657335,0.065144
Scenario10,1531.774992,0.05452,8968.41567,29660.490472,37248.493363,42697.123825,9017.136737,355.756872,4834.37668,2311.256359,...,2478.124542,0.0123,159.391437,10162.589376,0.0,0.000601,-140.707859,14659.872827,1531.774992,0.05452


## Explicit processing version (not used)

This version can be activated by selecting all cells below and pressing "r" on the keyboard (in command mode, indicated by the *blue* cell border).