# Data Health Check

Check whether the plant and weather data behave as we expect.

In [1]:
import altair as alt
from preprocessors.paths import (PATH_TO_PLANT_A,
                                 PATH_TO_PLANT_B,
                                 PATH_TO_PLANT_C,
                                 PATH_TO_WEATHER)
from preprocessors.plant_preprocessor import PlantPreprocessor
from preprocessors.weather_preprocessor import WeatherPreprocessor

In [2]:
# Build one preprocessor per data source: the three plant paths share the
# same preprocessor class, the weather path has its own.
power_plant_a, power_plant_b, power_plant_c = map(
    PlantPreprocessor,
    (PATH_TO_PLANT_A, PATH_TO_PLANT_B, PATH_TO_PLANT_C),
)
weather = WeatherPreprocessor(PATH_TO_WEATHER)

In [3]:
# Print every dataset's label followed by the list of its column names.
# The (label, dataset) pairs are flattened into one argument list so that a
# single print call emits them, one item per line.
print(
    *(
        item
        for label, dataset in (
            ('power_plant_a columns:', power_plant_a),
            ('power_plant_b columns:', power_plant_b),
            ('power_plant_c columns:', power_plant_c),
            ('weather columns:', weather),
        )
        for item in (label, list(dataset.df_indexed_utc.columns))
    ),
    sep='\n',
)

power_plant_a columns:
['generation_kw', 'grid_feedin_kw', 'grid_supply_kw', 'overall_consumption_calc_kw']
power_plant_b columns:
['generation_kw', 'grid_feedin_kw', 'grid_supply_kw', 'overall_consumption_calc_kw']
power_plant_c columns:
['grid_feedin_kw', 'grid_supply_kw']
weather columns:
['temperature', 'precipitation', 'snowfall', 'snow_mass', 'air_density', 'radiation_surface', 'radiation_toa', 'cloud_cover']


In [6]:
# Print every dataset's label followed by the index of its UTC-indexed frame,
# so the time range, length and dtype of each source can be compared by eye.
print(
    *(
        item
        for label, dataset in (
            ('power_plant_a timestamp_utc:', power_plant_a),
            ('power_plant_b timestamp_utc:', power_plant_b),
            ('power_plant_c timestamp_utc:', power_plant_c),
            ('weather timestamp_utc:', weather),
        )
        for item in (label, dataset.df_indexed_utc.index)
    ),
    sep='\n',
)

power_plant_a timestamp_utc:
DatetimeIndex(['2018-12-31 23:00:00+00:00', '2019-01-01 00:00:00+00:00',
               '2019-01-01 01:00:00+00:00', '2019-01-01 02:00:00+00:00',
               '2019-01-01 03:00:00+00:00', '2019-01-01 04:00:00+00:00',
               '2019-01-01 05:00:00+00:00', '2019-01-01 06:00:00+00:00',
               '2019-01-01 07:00:00+00:00', '2019-01-01 08:00:00+00:00',
               ...
               '2019-12-31 13:00:00+00:00', '2019-12-31 14:00:00+00:00',
               '2019-12-31 15:00:00+00:00', '2019-12-31 16:00:00+00:00',
               '2019-12-31 17:00:00+00:00', '2019-12-31 18:00:00+00:00',
               '2019-12-31 19:00:00+00:00', '2019-12-31 20:00:00+00:00',
               '2019-12-31 21:00:00+00:00', '2019-12-31 22:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='timestamp_utc', length=8760, freq=None)
power_plant_b timestamp_utc:
DatetimeIndex(['2018-12-31 23:00:00+00:00', '2019-01-01 00:00:00+00:00',
               '2019-01-01 01:

In [10]:
# Print every dataset's label followed by its per-column count of missing
# values; a blank line separates consecutive items.
print(
    *(
        item
        for label, dataset in (
            ('power_plant_a NA count:', power_plant_a),
            ('power_plant_b NA count:', power_plant_b),
            ('power_plant_c NA count:', power_plant_c),
            ('weather NA count:', weather),
        )
        for item in (label, dataset.df_indexed_utc.isna().sum())
    ),
    sep='\n\n',
)

power_plant_a NA count:

generation_kw                  0
grid_feedin_kw                 0
grid_supply_kw                 0
overall_consumption_calc_kw    0
dtype: int64

power_plant_b NA count:

generation_kw                  0
grid_feedin_kw                 0
grid_supply_kw                 0
overall_consumption_calc_kw    0
dtype: int64

power_plant_c NA count:

grid_feedin_kw    0
grid_supply_kw    0
dtype: int64

weather NA count:

temperature          0
precipitation        0
snowfall             0
snow_mass            0
air_density          0
radiation_surface    0
radiation_toa        0
cloud_cover          0
dtype: int64
