In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

# Input locations, resolved relative to the notebook's working directory.
ELECTRIC_DIR = os.path.join(os.path.abspath("./historical_data"), "electric_data")
DATA_FILE = os.path.join(ELECTRIC_DIR, "psco.json")

# Load the record-oriented JSON and index the frame by its "date" column.
# The column passed to `convert_dates` must match the column used as the index
# ("date"); the previous value "dates" named a column that is never referenced
# and only worked because pandas auto-detects a column literally called "date".
df: pd.DataFrame = pd.read_json(
    DATA_FILE,
    typ="frame",
    orient="records",
    convert_dates=["date"],
).set_index("date")

In [None]:
## See what would get cut off using various inter-quartile range metrics
# Compute the statistics on the 'demand' column directly. Taking quantiles of
# the whole frame with numeric_only=False raises on any non-numeric column and
# needlessly processes the helper columns added below when the cell is re-run.
demand = df['demand']

# calculate inter-quartile range
iqr = demand.quantile(0.75) - demand.quantile(0.25)

median = demand.median()

# Bounds lying `iqr_mult` inter-quartile ranges either side of the median
iqr_mult = 3
min_demand = median - (iqr * iqr_mult)
max_demand = median + (iqr * iqr_mult)
print(f"inter-quartile range: {iqr}, median: {median}, valid data range: {min_demand} - {max_demand}")

# Per-row distance from the median, in absolute terms and in units of IQR
df['abs_diff'] = (demand - median).abs()
df['iqr_mult'] = df['abs_diff'] / iqr

In [None]:
# Rank rows by their distance from the median and show the 20 most extreme
ranked_by_deviation = df.sort_values(by="abs_diff", ascending=False)
ranked_by_deviation.iloc[:20]

In [None]:
## Kind of a weird metric here, but it seems to work
# Fixed plausibility bounds for 'demand'; both ends are exclusive.
MIN_VALID_DEMAND = 1000
MAX_VALID_DEMAND = 11000

# Vectorized comparison instead of a per-element Python lambda: it always
# yields a boolean Series (even for an empty frame, where `map` would not, and
# `~` would then fail) and NaN demand values still evaluate to False.
good_criterion = (df['demand'] > MIN_VALID_DEMAND) & (df['demand'] < MAX_VALID_DEMAND)
bad_criterion = ~good_criterion

# Print invalid rows
df[bad_criterion]

In [None]:
# Show the rows that pass the validity criterion
df.loc[good_criterion]


In [None]:
# Blank out every row that fails the validity criterion: all of its columns
# become NaN so a later step can fill them in.
df = df.mask(~good_criterion)

# spot check a single timestamp after masking
spot_check_ts = "2015-07-02 06:00:00"
df.loc[spot_check_ts]

In [None]:
# Fill the NaN gaps using pandas' default interpolation (linear, which treats
# consecutive rows as equally spaced), rebinding `df` to the filled frame.
df = df.interpolate()

In [None]:
# Look at the same timestamp again now that gaps have been interpolated
recheck_ts = "2015-07-02 06:00:00"
df.loc[recheck_ts]


In [None]:
# Second spot check: select by a day-level label
spot_check_day = '2022-07-19'
df.loc[spot_check_day]

In [None]:
# Collapse the readings into one row per calendar day.
#
# Group on the index's calendar dates explicitly. The previous
# `lambda x: x.date` never called `.date()`, so it only produced dates when
# pandas happened to hand the whole DatetimeIndex to the callable; applied per
# Timestamp it yields a distinct bound method for every row.
tmp_grouped = df.groupby(df.index.date, sort=False)

# String aggregation names avoid the deprecated practice of passing numpy
# callables to `agg`. "count" tallies non-null readings; np.count_nonzero
# treated NaN as a reported hour, letting days with missing data through.
grouped = tmp_grouped.agg(
    daily_demand=("demand", "sum"),
    num_hours_reported=("demand", "count"),
)

# Turn the date-object index into a DatetimeIndex named "index".
# `astype(np.datetime64)` (unit-less) is rejected by pandas 2.x; `to_datetime`
# works across versions.
grouped.index = pd.to_datetime(grouped.index).rename("index")

In [None]:
# Keep only days reporting exactly 24 hours (partial days are usually the first
# & last day of the range), then discard the helper count column.
# NOTE(review): if timestamps are local time, 23/25-hour DST days would be
# dropped as well — confirm.
is_full_day = grouped["num_hours_reported"] == 24
grouped = grouped.loc[is_full_day].drop(columns="num_hours_reported")

In [None]:
# Write the daily frame as JSON alongside the source data
grouped_file_path = os.path.join(ELECTRIC_DIR, "psco-daily-dataframe_test.json")
grouped.to_json(path_or_buf=grouped_file_path)