In [1]:
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.io import output_notebook
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from nexml_iso.notebooks import utils

ModuleNotFoundError: No module named 'nexml_iso'

In [None]:
# Read the two NYISO master extracts: actual load (PAL) and the ISO load forecast (ISOLF).
pal_df, isolf_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/nyiso_pal_master.csv', '../data/nyiso_isolf_master.csv')
)
# Preview the first rows of the actual-load frame.
pal_df.head()

In [None]:
# Parse the 'Time Stamp' column into datetimes so the `.dt` accessor and
# datetime comparisons work on it, then list the columns we have to work with.
pal_df['Time Stamp'] = pal_df['Time Stamp'].pipe(pd.to_datetime)
list(pal_df.columns)

In [None]:
# Give the forecast's summary columns an `isolf_` prefix so their names are
# explicit once the forecast is combined with the actual-load frame.
isolf_renames = {'min': 'isolf_min', 'max': 'isolf_max', 'mean': 'isolf_mean'}
isolf_df = isolf_df.rename(columns=isolf_renames)

# Parse the timestamps into datetimes, then list the resulting columns.
isolf_df['Time Stamp'] = isolf_df['Time Stamp'].pipe(pd.to_datetime)
list(isolf_df.columns)

In [None]:
# We can merge our two datasets and add some nuance to our dates.
# No `on=` is passed, so pandas joins on every column the two frames share.
df = pal_df.merge(isolf_df, sort=True)
df.drop(['PTID', 'Name'], axis='columns', inplace=True)

# Derive calendar features from the timestamp.
timestamps = df['Time Stamp']
df['weekday'] = timestamps.dt.weekday  # Monday=0 ... Sunday=6
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported replacement. It returns a nullable
# UInt32 column, so cast to int64 to keep the plain integer dtype.
df['week'] = timestamps.dt.isocalendar().week.astype('int64')
df['month'] = timestamps.dt.month
df['year'] = timestamps.dt.year
df.info()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Numeric columns are selected explicitly: from pandas 2.0 on, `DataFrame.corr()`
# no longer drops non-numeric columns silently, so the result would otherwise
# depend on the installed pandas version.
corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(data=corr, ax=ax, annot=True)

In [None]:
# Three stacked panels on shared axes: actual mean load, forecast mean load,
# and both overlaid for a direct comparison.
time_axis = df['Time Stamp']
actual_load = df['pal_mean']
forecast_load = df['isolf_mean']

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(20,10), sharex=True, sharey=True)
ax_actual, ax_forecast, ax_overlay = axes

ax_actual.plot(time_axis, actual_load)
ax_actual.set_title('actual')

ax_forecast.plot(time_axis, forecast_load, c='red', alpha=.5)
ax_forecast.set_title('forecast')

ax_overlay.plot(time_axis, actual_load, label='actual', alpha=.75, ms=.5)
ax_overlay.plot(time_axis, forecast_load, c='red', alpha=.25, label='forecast', ms=.5)
ax_overlay.set_title('overlay')
ax_overlay.legend()

# One shared y-axis label, placed at the left edge of the figure.
fig.text(.075, .5, 'Load in Megawatts(MW)', ha='center', va='center', rotation='vertical')
plt.show()

# Lots of overlap! This means the ISO has done a good job forecasting so far.
# But outside of the seasonal cycles, we don't see a pronounced drop or increase over time.
# We can probably assume relative stability for now.

In [None]:
# Let's check out the regularity of our load data
# Half-open window [2018-01-01, 2020-01-01): `>=` on the lower bound so that a
# reading stamped exactly 2018-01-01 00:00 is part of the "2018 and 2019" range.
window_start = datetime.datetime(2018, 1, 1)
window_end = datetime.datetime(2020, 1, 1)
mask = (df['Time Stamp'] >= window_start) & (df['Time Stamp'] < window_end)

# Filter once and reuse the result instead of re-evaluating df.loc[mask] per series.
window = df.loc[mask]
plt.plot(window['Time Stamp'], window['pal_max'], ms=.5, label='max')
plt.plot(window['Time Stamp'], window['pal_min'], ms=.5, label='min')
plt.title('Actual Load min and max for 2018 and 2019')
plt.legend()
plt.show()

In [None]:
# Finally, a 50-bin histogram showing how often each mean-load level occurs.
fig, ax = plt.subplots()
ax.hist(df['pal_mean'], bins=50)
ax.set(xlabel='Load (MW)', ylabel='Number of Days')
plt.show()

In [None]:
# Distribution of the mean load per year (top) and per month (bottom), on a shared y-axis.
fig, ax = plt.subplots(2, 1, figsize=(12,8), sharey=True)
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.violinplot(x=df['year'], y=df['pal_mean'], ax=ax[0])
sns.violinplot(x=df['month'], y=df['pal_mean'], ax=ax[1])
plt.show()

# Here again we can see our distribution play out across multiple years, settling around our mean of about 5.5k
# We can also see that most of the variation (our tails) can be accounted for in the summer months

In [None]:
# Another interesting point is that months 4 and 10 (April and October) look like local minimums for our line

# Average every numeric column per calendar month. `numeric_only=True` is pinned
# because the default for groupby aggregations changed in pandas 2.0; being
# explicit keeps the result the same across versions.
months = df.groupby('month').mean(numeric_only=True)

# Plot the monthly average of the actual mean load so the dips can be inspected directly.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(months.index, months['pal_mean'], marker='o')
ax.set(title='Average actual load by month', xlabel='Month', ylabel='Load (MW)')
plt.show()

In [None]:
 # We also notice that weekday has some correlation to our load value. Let's check it out further

# Distribution of the mean load for each day of the week.
fig, ax = plt.subplots(figsize=(12, 6))
# x/y are passed by keyword (seaborn 0.12+ no longer accepts them positionally),
# and the plot is drawn on the axes created above instead of the implicit current axes.
sns.violinplot(x=df['weekday'], y=df['pal_mean'], ax=ax)
plt.show()

Wonderful! We see that the weekends (weekdays 5 and 6) are shaped similarly, but have a lower center.

We can also guess that the seasonal variation discovered above can be found by following the month vector. Let's confirm.