# EPA CEMS: ramp rates, distance from downtime, and uptime events

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to the imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.load_dataset import load_epacems, load_epa_crosswalk
from src.features.build_features import uptime_events, calc_distance_from_downtime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## CEMS Processing

In [12]:
# Load hourly CEMS records for a single year (2019). states=None is passed to request
# every state (per the original "all states" note). Only the columns listed below are read.
cols = [
    'plant_id_eia',
    'unitid',
    'operating_datetime_utc',
    'operating_time_hours',
    'gross_load_mw',
    'steam_load_1000_lbs',
    'heat_content_mmbtu',
    'unit_id_epa',
]
cems = load_epacems(
    states=None,
    years=[2019],
    columns=cols,
    engine='pandas',
)

In [13]:
%%time
idx = pd.IndexSlice
cems.sort_values(by=['unit_id_epa', 'operating_datetime_utc'], inplace=True)
cems.set_index(['unit_id_epa', 'operating_datetime_utc'], drop=False, inplace=True)#, verify_integrity=True)

### Calculate ramps and distance from downtime

In [14]:
# Timed call to the project helper, which modifies cems in place. The output below
# shows two columns that were not loaded above: hours_from_startup and hours_to_shutdown.
# NOTE(review): print('') looks intended to stop the cell ending on the %time expression
# (and echoing the frame) -- confirm; the saved output still shows the frame.
%time calc_distance_from_downtime(cems) # in place
print('')

CPU times: user 19.2 s, sys: 4.43 s, total: 23.6 s
Wall time: 23.5 s


Unnamed: 0_level_0,Unnamed: 1_level_0,plant_id_eia,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,heat_content_mmbtu,unit_id_epa,hours_from_startup,hours_to_shutdown
unit_id_epa,operating_datetime_utc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2019-01-01 06:00:00+00:00,3,1,2019-01-01 06:00:00+00:00,0.0,0.0,,0.0,1,24.0,1467.0
1,2019-01-01 07:00:00+00:00,3,1,2019-01-01 07:00:00+00:00,0.0,0.0,,0.0,1,25.0,1466.0
1,2019-01-01 08:00:00+00:00,3,1,2019-01-01 08:00:00+00:00,0.0,0.0,,0.0,1,26.0,1465.0
1,2019-01-01 09:00:00+00:00,3,1,2019-01-01 09:00:00+00:00,0.0,0.0,,0.0,1,27.0,1464.0
1,2019-01-01 10:00:00+00:00,3,1,2019-01-01 10:00:00+00:00,0.0,0.0,,0.0,1,28.0,1463.0
...,...,...,...,...,...,...,...,...,...,...,...
91323,2020-01-01 03:00:00+00:00,10294,GTG,2020-01-01 03:00:00+00:00,0.0,0.0,,0.0,91323,941.0,28.0
91323,2020-01-01 04:00:00+00:00,10294,GTG,2020-01-01 04:00:00+00:00,0.0,0.0,,0.0,91323,942.0,27.0
91323,2020-01-01 05:00:00+00:00,10294,GTG,2020-01-01 05:00:00+00:00,0.0,0.0,,0.0,91323,943.0,26.0
91323,2020-01-01 06:00:00+00:00,10294,GTG,2020-01-01 06:00:00+00:00,0.0,0.0,,0.0,91323,944.0,25.0


In [15]:
# Group rows by unit so that differences never run across the boundary between two units.
# `units` is kept as a notebook-level name because later cells reuse it.
units = cems.groupby(level="unit_id_epa")
# ramp rate: change in gross load from the previous row of the same unit (MW / hour,
# assuming consecutive rows are one hour apart). The first row of each unit has no
# predecessor and is NaN. The built-in grouped diff() yields the same values as
# transform(lambda x: x.diff()) without a Python-level call per unit.
cems['ramp_rate'] = units["gross_load_mw"].diff()

In [16]:
# Preview the frame with the new ramp_rate column; the first row is NaN because
# a difference needs a preceding row.
cems.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,plant_id_eia,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,heat_content_mmbtu,unit_id_epa,hours_from_startup,hours_to_shutdown,ramp_rate
unit_id_epa,operating_datetime_utc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2019-01-01 06:00:00+00:00,3,1,2019-01-01 06:00:00+00:00,0.0,0.0,,0.0,1,24.0,1467.0,
1,2019-01-01 07:00:00+00:00,3,1,2019-01-01 07:00:00+00:00,0.0,0.0,,0.0,1,25.0,1466.0,0.0
1,2019-01-01 08:00:00+00:00,3,1,2019-01-01 08:00:00+00:00,0.0,0.0,,0.0,1,26.0,1465.0,0.0
1,2019-01-01 09:00:00+00:00,3,1,2019-01-01 09:00:00+00:00,0.0,0.0,,0.0,1,27.0,1464.0,0.0
1,2019-01-01 10:00:00+00:00,3,1,2019-01-01 10:00:00+00:00,0.0,0.0,,0.0,1,28.0,1463.0,0.0


In [17]:
# For each row, keep the smaller of the two distance columns.
cems['hours_distance'] = cems[['hours_from_startup', 'hours_to_shutdown']].min(axis='columns')

In [22]:
# True where hours_from_startup is strictly smaller than hours_to_shutdown;
# rows where the two are equal come out False.
cems['nearest_to_startup'] = cems['hours_from_startup'].lt(cems['hours_to_shutdown'])

In [23]:
# Randomly allocate midpoints: rows with hours_from_startup == hours_to_shutdown are
# exact ties, which the strict `<` used for 'nearest_to_startup' leaves as False.
# A coin flip per row moves roughly half of those ties to the startup side.
rng = np.random.default_rng(seed=42)  # seeded in this cell, so re-running gives the same allocation
rand_midpoints = (cems['hours_from_startup'] == cems['hours_to_shutdown']) & rng.choice(np.array([True, False]), size=len(cems))
# Write into the existing boolean column. The previous target, 'is_startup', is defined
# nowhere else, so the assignment created a stray True/NaN column and left the ties unchanged.
cems.loc[rand_midpoints, 'nearest_to_startup'] = True
del rand_midpoints  # full-length boolean mask; not needed after the assignment

### Join Crosswalk

In [24]:
# Load the EPA crosswalk table through the project loader (called with its defaults).
# NOTE(review): this section is titled "Join Crosswalk", but no join with cems is visible here.
crosswalk = load_epa_crosswalk()

In [26]:
# Show five random rows, transposed so each sampled row is a column and the fields read
# top to bottom. The sample is unseeded, so the rows shown differ from run to run.
crosswalk.sample(5).T

Unnamed: 0,4273,2313,6517,5610,395
SEQUENCE_NUMBER,3948.0,2148.0,5920.0,5082.0,364.0
CAMD_STATE,TX,SC,SD,AZ,FL
CAMD_FACILITY_NAME,Paris Energy Center,Darlington County,Deer Creek Station,Gila River Power Station,Fort Myers
CAMD_PLANT_ID,50109,3250,56610,55306,612
CAMD_UNIT_ID,HRSG2,1,01,4CTGA,FMCT2A
CAMD_GENERATOR_ID,GEN3,1,02,CTG7,2A
CAMD_NAMEPLATE_CAPACITY,90.0,,154.0,174.0,188.2
CAMD_FUEL_TYPE,Pipeline Natural Gas,Pipeline Natural Gas,Pipeline Natural Gas,Pipeline Natural Gas,Pipeline Natural Gas
CAMD_LATITUDE,33.6968,34.4185,44.3961,32.9761,26.6967
CAMD_LONGITUDE,-95.5577,-80.1657,-96.5333,-112.694,-81.7831


### Uptime analysis

In [18]:
# Timed call to the project helper that derives uptime events from cems. Per the preview
# below, the result is indexed by (unit_id_epa, event) with startup, shutdown and
# duration_hours columns.
%time events = uptime_events(cems)

CPU times: user 24.5 s, sys: 2.1 s, total: 26.6 s
Wall time: 26.7 s


In [19]:
# Preview the first few uptime events.
events.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,startup,shutdown,duration_hours
unit_id_epa,event,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,2019-03-03 03:00:00+00:00,2019-03-03 09:00:00+00:00,6.0
1,1,2019-03-04 18:00:00+00:00,2019-03-08 18:00:00+00:00,96.0
1,2,2019-05-18 18:00:00+00:00,2019-05-30 05:00:00+00:00,275.0
1,3,2019-07-10 11:00:00+00:00,2019-07-18 04:00:00+00:00,185.0
1,4,2019-09-13 01:00:00+00:00,2019-10-01 23:00:00+00:00,454.0


## Aggregate

In [20]:
# Per-unit summary of uptime event durations: number of events, mean duration and
# standard deviation, with the output column names given directly as keywords.
unit_aggs = (
    events['duration_hours']
    .groupby(level='unit_id_epa')
    .agg(
        n_uptime_events='count',
        mean_uptime_duration='mean',
        std_uptime_duration='std',
    )
)

In [22]:
# FIXME: orphaned fragment. `.clip(upper=10)` had no expression in front of it, so this
# cell was a SyntaxError and stopped Restart & Run All. The intended target cannot be
# recovered from the notebook; restore it or delete this cell.
# <target>.clip(upper=10)

In [None]:
# ramp factor: ramp rate divided by the unit's observed maximum gross load, i.e. the
# share of observed max power gained or lost per hour (a fraction; not multiplied by 100).
# `units` is the per-unit groupby created when ramp_rate was computed; transform('max')
# broadcasts each unit's maximum back onto every one of its rows. The string name is used
# because passing the NumPy callable to transform is deprecated in recent pandas.
# NOTE(review): a unit whose observed maximum is 0 divides by zero (NaN or inf) -- confirm
# whether such units should be filtered out first.
cems['ramp_factor_obs'] = cems['ramp_rate'].div(units['gross_load_mw'].transform('max'))

## 