Download historical data from EIA and NOAA/GHCN-d to the local filesystem

In [None]:
import download_historical_data as dl
import os 
import matplotlib.pyplot as plt
import pandas as pd

# Local directory layout. Everything is resolved relative to the current
# working directory (normally the directory the notebook is launched from).
HISTORICAL_DATA_DIR = os.path.abspath("./historical_data")
ANALYSIS_DATA_DIR = os.path.abspath("./analysis_data/")
ELECTRIC_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "electric_data")
WEATHER_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "weather_station_data")

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
for data_dir in (HISTORICAL_DATA_DIR, ANALYSIS_DATA_DIR, ELECTRIC_DATA_DIR, WEATHER_DATA_DIR):
    os.makedirs(data_dir, exist_ok=True)


# Weather stations used in the analysis, keyed by station ID.
# The values are human-readable location names, kept for reference only.
STATION_NAMES_BY_ID = {
    "USW00023066": "Grand Junction Walker Field",
    "USC00053553": "Greeley UNC",
    "USC00053005": "Ft Collins",
    "USC00050848": "Boulder",
    "USC00055984": "Northglenn",
    "USC00058995": "Wheat Ridge",
    "USW00023061": "Alamosa",
}

# Station IDs in the order listed above (dicts preserve insertion order).
WEATHER_STATION_IDS = list(STATION_NAMES_BY_ID)

# Uncomment the two calls below to force a re-download of the source data.
# Alternatively, run the download script manually: python download_historical_data.py
# Data files are saved locally, so a re-download is only needed to fetch new or different data.
#dl.download_eia_historical_data(ELECTRIC_DATA_DIR, eia_respondent="PSCO")
#dl.download_ghcnd_historical_data(WEATHER_DATA_DIR, WEATHER_STATION_IDS)

In [None]:
import glob

plt.style.use("default")  # alternative: "ggplot"

# Combined frame of all stations; stays None until the first file is loaded.
temp_df: "pd.DataFrame | None" = None

## Load the data for each weather station into its own set of columns and
## outer-join the stations on their index.
# os.path.join keeps the pattern portable: a hard-coded "\" is only a path
# separator on Windows, so the pattern would match nothing elsewhere.
# sorted() makes the resulting column order independent of directory listing order.
for df_file in sorted(glob.glob(os.path.join(WEATHER_DATA_DIR, "*.json"))):
    # The first 11 characters of the file name are taken as the station ID
    # (the IDs in WEATHER_STATION_IDS are 11 characters long).
    station_id = os.path.basename(df_file)[0:11]

    # Only the read needs the file handle; release it before transforming.
    with open(df_file, "r", encoding="utf-8") as f:
        station_df = pd.read_json(f)
    station_df.index.rename("date", inplace=True)

    # TODO: Prefixing column names with the station ID is a stand-in for a proper
    # MultiIndex (or tuple-keyed columns). Left as-is until it is clear which layout
    # is easiest to work with when training an ML model.
    col_renames = {col: f"{station_id}_{col}" for col in station_df.columns}
    station_df.rename(col_renames, axis="columns", inplace=True)

    if temp_df is not None:
        temp_df = pd.merge(left=temp_df, right=station_df, how="outer", left_index=True, right_index=True)
    else:
        temp_df = station_df

# Fail here with a clear message instead of an obscure error in the plotting
# or merge code further down.
if temp_df is None:
    raise FileNotFoundError(
        f"No weather station JSON files found in {WEATHER_DATA_DIR}; "
        "run the download step first."
    )

# Left: every station column plotted against the index; right: histogram of the values.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_figwidth(15)
fig.set_figheight(4)
ax1.plot(temp_df)
ax1.set_title("Weather station data over time")
_ = ax2.hist(temp_df, bins=10)
_ = ax2.set_title("Distribution of weather station values")


Load electric demand data

In [None]:
# Demand frame for the PSCO respondent, read from the JSON file stored
# in the electric data directory.
psco_demand_data_file = os.path.join(ELECTRIC_DATA_DIR, "psco-daily-dataframe.json")
with open(psco_demand_data_file, mode="r", encoding="utf-8") as demand_file:
    demand_df = pd.read_json(demand_file)

Basic statistics on daily demand

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the demand column.
demand_df.loc[:, "daily_demand"].describe()

In [None]:
# Four-panel overview of demand: series over time, value distribution, and
# autocorrelation zoomed to lags 0-365 and 0-7.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)
fig.set_figwidth(15)
fig.set_figheight(4)

ax1.plot(demand_df)
ax1.set_title("Demand over time")

ax2.hist(demand_df, bins=50)
ax2.set_title("Demand distribution")

# autocorrelation_plot is documented to take a Series, so pass the demand
# column explicitly rather than the whole frame.
daily_demand = demand_df["daily_demand"]

pd.plotting.autocorrelation_plot(daily_demand, ax=ax3).set_xlim([0, 365])
ax3.set_title("Autocorrelation, lags 0-365")

pd.plotting.autocorrelation_plot(daily_demand, ax=ax4).set_xlim([0, 7])
ax4.set_title("Autocorrelation, lags 0-7")

# Returns None, so the cell shows only the figure (no stray tuple/Text repr),
# and keeps the four titled panels from overlapping.
fig.tight_layout()

In [None]:
# Align demand and weather data on their indexes, then keep only the rows
# in which every column has a value.
joined_df = demand_df.merge(
    temp_df,
    how="outer",
    left_index=True,
    right_index=True,
).dropna()

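## Plot demand against tmax/tmin. Disabled for now: with several weather stations the
## columns carry a per-station prefix, so plain "tmax"/"tmin" columns presumably no longer exist.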
# slice = joined_df[["daily_demand", "tmax", "tmin"]]  #["2016-12-01":"2017-03-01"]

# fig, ax1 = plt.subplots(1, 1)
# fig.set_figwidth(15)
# fig.set_figheight(4)
# ax1.set_ylabel("megawatt-hours")
# ax1.plot(slice["daily_demand"], color="tab:green")

# ax2 = ax1.twinx()
# ax2.set_ylabel("deg C")
# ax2.plot(slice["tmax"], color="tab:red")
# ax2.plot(slice["tmin"], color="tab:blue")


In [None]:
# Pairwise Pearson correlation between demand and every station column;
# non-numeric columns are skipped.
joined_df.corr(method="pearson", numeric_only=True)