In [None]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [None]:
# Read the 2019 table, parsing the DATE column into datetimes on load.
df = pd.read_csv(
    "ALL_2019.csv",
    parse_dates=['DATE'],
)

In [None]:
# Peek at the first five rows to check the load.
df.head(n=5)

In [None]:
# TMP is stored in tenths of a degree: scale to whole degrees, then apply
# the x 9/5 + 32 conversion to obtain degrees Fahrenheit.
# NOTE(review): the formula implies TMP is in Celsius - confirm against the data source.
df['TMPF'] = df['TMP'].div(10).mul(9/5).add(32)

In [None]:
# Temperature columns used below: the observed series ('TMPF') first, then
# the forecast columns, whose names follow the 'TMP_<days>:<hours>' pattern.
# NOTE(review): there is no 'TMP_2:15' or 'TMP_2:21' entry - confirm those
# columns are absent from the data rather than omitted by mistake.
temps = [
    'TMPF',
    # day 0
    'TMP_0:06', 'TMP_0:09', 'TMP_0:12', 'TMP_0:15', 'TMP_0:18', 'TMP_0:21',
    # day 1
    'TMP_1:00', 'TMP_1:03', 'TMP_1:06', 'TMP_1:09',
    'TMP_1:12', 'TMP_1:15', 'TMP_1:18', 'TMP_1:21',
    # day 2
    'TMP_2:00', 'TMP_2:03', 'TMP_2:06', 'TMP_2:09', 'TMP_2:12', 'TMP_2:18',
    # day 3
    'TMP_3:00',
]

In [None]:
def get_hours(name):
    """Convert a forecast column name into its lead time in hours.

    Parameters
    ----------
    name : str
        Column name of the form ``'<prefix>_<days>:<hours>'``, e.g.
        ``'TMP_1:03'``. It must contain exactly one underscore and the part
        after it exactly one colon.

    Returns
    -------
    int
        ``hours + 24 * days``; for example ``'TMP_1:03'`` gives 27.

    Raises
    ------
    ValueError
        If the name does not split into the expected pieces or the pieces
        are not integers.
    """
    _prefix, lead = name.split("_")
    day_part, hour_part = lead.split(':')
    return 24 * int(day_part) + int(hour_part)

In [None]:
# Map each forecast column name to its lead time in hours; the observed
# column 'TMPF' is added last under the label "baseline".
new_cols = {forecast_col: get_hours(forecast_col) for forecast_col in temps[1:]}
new_cols['TMPF'] = "baseline"

In [None]:
# Renamed copy of the table: forecast columns become integer lead times
# (hours) and 'TMPF' becomes "baseline"; `df` itself is left untouched.
dfr = df.rename(new_cols, axis='columns')
dfr.head(n=5)

In [None]:
# Show the most recent DATE present in the data.
dfr.loc[:, 'DATE'].max()

## Plot JFK to check

In [None]:
# Sanity check: overlay every forecast lead time on the observed ("baseline")
# temperatures for the station(s) whose id matches 'KJFK'.
fig, ax = plt.subplots(figsize=(20,5))

# The station filter is the same for every column, so select the rows once
# instead of re-filtering the whole frame on each loop iteration.
jfk = dfr[dfr['station'].str.match('KJFK')].set_index('DATE')

for col in new_cols.values():
    series = jfk[[col]]
    if col == 'baseline':
        # Observations: solid black, drawn on top of the forecast lines.
        series.plot(ax=ax, color='k', zorder=10, x_compat=True)
    else:
        # Forecasts: thin dashed lines, one per lead time.
        series.plot(ax=ax, linewidth=.5, linestyle='--', x_compat=True)

# One major tick per month, labelled like "January 01".
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%B %d"))
ax.tick_params('x', labelrotation=0)
for label in ax.get_xticklabels():
    label.set_horizontalalignment('center')
ax.legend(title="Forecast Ahead (hrs)", ncol=8, loc='lower center')
# Pass the timestamps themselves as limits so the axis' date converter maps
# them. Raw proleptic ordinals (Timestamp.toordinal()) only coincide with
# matplotlib date numbers under the pre-3.3 epoch; with the current default
# epoch (1970-01-01) they would move the view far away from the data.
ax.set_xlim(pd.Timestamp('2018-12-31'), pd.Timestamp('2019-12-31'))
ax.grid()
# Explicitly empty label: older matplotlib renders set_xlabel(None) as the
# literal text "None".
ax.set_xlabel('')

# Plot Observation Grids

In [None]:
# Draw one station-by-date heat map per column: the observations ("baseline")
# plus every forecast lead time. Each figure is also written to "<col>.png".
for col in new_cols.values():
    # Rows are stations, columns are dates, cells hold the temperature.
    temp_grid = dfr[['station', 'DATE', col]].pivot(index='station', columns='DATE', values=col)
    n_stations = len(temp_grid.index)

    fig, ax = plt.subplots(figsize=(10,10))
    title = 'GHCN observations' if col == 'baseline' else f'{col} Hrs'
    ax.set_title(title)
    # The same colour limits are used for every figure.
    im = ax.pcolormesh(temp_grid, vmin=-30, vmax=100, cmap='coolwarm')
    ax.set(ylabel="stations", xlabel="day of year")
    # pcolormesh draws row i between y=i and y=i+1, so put each station
    # label at the centre of its row (i + 0.5) rather than on its lower edge.
    ax.set_yticks([row + 0.5 for row in range(n_stations)])
    ax.set_yticklabels(temp_grid.index, fontsize=4)
    ax.tick_params(axis='y', length=0)
    fig.colorbar(im, ax=ax)
    fig.savefig(f"{col}.png")

# Error images

In [None]:
# Observed temperatures laid out as a station x date grid; this is the
# reference that every forecast grid is compared against.
baseline = df.pivot(index='station', columns='DATE', values='TMPF')

# Every renamed column except the observations themselves.
lead_times = [c for c in new_cols.values() if c != 'baseline']

for col in lead_times:
    temp_grid = dfr.pivot(index='station', columns='DATE', values=col)
    # Signed error: forecast minus observation, aligned on station and date.
    error_grid = temp_grid - baseline

    fig, ax = plt.subplots()
    ax.set_title(f'{col} hour (forecast) - temperature (GHCN)')
    im = ax.pcolormesh(error_grid, vmin=-25, vmax=25,  cmap='RdBu_r')
    ax.set(ylabel="stations", xlabel="day of year")
    # Individual station ids are not useful at this scale, so hide them.
    ax.set_yticklabels([])
    ax.tick_params(axis='y', length=0)
    fig.colorbar(im, ax=ax)
    # NOTE(review): the file name reads "ghcn_minius" (sic) although the
    # plotted quantity is forecast minus GHCN; left unchanged so existing
    # output names stay the same.
    fig.savefig(f"ghcn_minius_{col}.png")

# Histograms

In [None]:
# Station x date grid of observed temperatures, used as the reference when
# computing the forecast errors for the histograms.
baseline = df.pivot(index='station', columns='DATE', values='TMPF')

In [None]:
# Overlay the forecast-error histogram of every lead time on a single axes.
fig, ax = plt.subplots()
for col in (c for c in new_cols.values() if c != 'baseline'):
    temp_grid = dfr.pivot(index='station', columns='DATE', values=col)
    # Flatten the station x date error grid into one 1-D sample of errors.
    errors = (temp_grid - baseline).values.ravel()
    ax.hist(errors, label=col, histtype='step')

# Legend sits above the axes, stretched across its full width.
ax.legend(
    bbox_to_anchor=(0., 1.02, 1., .102),
    loc='lower left',
    ncol=5,
    mode="expand",
    borderaxespad=0.,
    title='hours',
)
fig.savefig("hist.png")

The plot above ends up so noisy that it is hard to see the differences between lead times. Instead, we are going to try a technique called small multiples, where each histogram is plotted underneath the previous one, all on the same x and y intervals.

In [None]:
# https://matplotlib.org/matplotblog/posts/create-ridgeplots-in-matplotlib/
fig, ax = plt.subplots(figsize=(5,10), nrows=len(temps[1:]), sharex=True, sharey=True, 
                       constrained_layout=True)
fig.suptitle("Histogram of Forecast Errors")
for ax, col in zip(ax, new_cols.values()):
    if col=='baseline':continue
    temp_grid = dfr[['station', 'DATE', col]].pivot(index='station', columns='DATE', values=col)
    sns.distplot((temp_grid-baseline).values.ravel(), 
                 hist_kws={'histtype':'step'}, ax=ax)
    ax.axvline(x=0, color='darkgrey')
   
    ax.annotate(f'{col} Hrs', (-60,0.1))
    # we more care about the shape then the y values
    ax.set_yticklabels([])
    ax.tick_params(axis='y', length=0)

fig.subplots_adjust(hspace=0)
fig.savefig(f"hist_stacked.png")