In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
# Grid-point coordinates for the NAM forecasts (140x140x2 per the data notes).
# Each coordinate pair is stored in the opposite latitude/longitude order from
# the wind-farm coordinate data, so the last axis is reversed straight after
# loading to make the two sources match.
nam_locations = np.load("nam_locations.npy")[:, :, ::-1]

In [3]:
import os
import shutil
from os import path

import requests

# The 32 shards of the NAM weather forecasting model output (24049x140x140x9)
# Each entry covers the entire 140x140 grid, indicating forecasted value
# for 9 weather variables in each time stamp
# The meaning of the 9 variables that NAM predicts:
# NAM_FIELDS = [
#     "accum_precip",
#     "dewpoint_temperature_2m",
#     "downward_short_wave_flux",
#     "relative_humidity_2m",
#     "sea_level_pressure",
#     "temperature_2m",
#     "total_cloud_cover",
#     "u_wind_80m",
#     "v_wind_80m",
# ]

# Fetch (when not already cached on disk) and load every NAM shard, appending
# each shard's entries to one flat list.
nam_data_dir = "nam_data/"
nam_data_url = "http://storage.googleapis.com/gridmatic/roscoe/"

# open() does not create directories, so make sure the cache directory exists.
os.makedirs(nam_data_dir, exist_ok=True)

all_nam_data = []
for i in range(32):
    data_file_name = "nam_data-000" + "{:02d}".format(i) + "-of-00032.npy"
    data_file_path = nam_data_dir + data_file_name
    if not path.exists(data_file_path):
        # Download the part file. Stream into a temporary ".part" file and
        # rename it only on success, so an interrupted or failed download is
        # never mistaken for a complete shard on the next run.
        partial_path = data_file_path + ".part"
        with requests.get(nam_data_url + data_file_name, stream=True) as r:
            # Fail loudly on HTTP errors instead of saving the error body as .npy.
            r.raise_for_status()
            # r.raw skips requests' content decoding unless explicitly enabled.
            r.raw.decode_content = True
            with open(partial_path, 'wb') as fout:
                shutil.copyfileobj(r.raw, fout)
        os.replace(partial_path, data_file_path)
    all_nam_data.extend(np.load(data_file_path))

In [4]:
# A one-dimensional list of the timestamps for the NAM inputs (24049x1)
# The dimension is the number of 1 HOUR INTERVALS in the time period
# NOTE(review): read_pickle unpickles whatever the file contains, which can
# execute arbitrary code -- only load this file if it comes from a trusted source.
nam_timestamps = pd.read_pickle("nam_timestamps.pkl")

In [5]:
# Pair every NAM timestamp with its forecast entry in one DataFrame and use the
# timestamps as the index.
nam_df = pd.DataFrame(
    {'timestamps': nam_timestamps, 'data_objects': all_nam_data}
).set_index('timestamps')

In [6]:
# NAM has a lot of hours/dates missing
# We initially thought about cutting the ERCOT data down to match, but decided to take advantage of the full NAM dataset
# We therefore generate new NAM data so that it more closely matches the data from ERCOT
# We generate hourly data, as a middle ground between days of missing NAM data and the 15-minute granularity of ERCOT

# OLD SOLUTION WITH PADDING: Just copy one value into all subsequent consecutive hours that are missing
# nam_padded_df = nam_df.resample('H').pad();
# Now we just do linear interpolation, so this can be ignored
# But this is useful to keep as a reference to an earlier technique

# Create entries for the missing hours (initially with a value of NaN).
# The hourly frequency is passed as a Timedelta rather than the 'H' string
# alias, which pandas 2.2 deprecated.
nam_unfilled_df = nam_df.resample(pd.Timedelta(hours=1)).asfreq()

In [7]:
import math
def find_closest_value(adj_time, direction):
    """Find the nearest hour, starting at ``adj_time``, that holds a real value.

    Walks through the module-level ``nam_unfilled_df`` one hour at a time,
    beginning with ``adj_time`` itself, until the first column contains
    something other than a floating-point NaN.

    Args:
        adj_time: Timestamp to start from; must be a label of
            ``nam_unfilled_df.index``.
        direction: ``-1`` to search backwards in time, ``1`` to search forwards.

    Returns:
        A ``(value, hours)`` tuple: the value that was found and the number of
        one-hour steps taken to reach it (0 when ``adj_time`` already has data).

    Raises:
        KeyError: If the search reaches a timestamp that is not in the index,
            i.e. it ran outside the data without finding a value.
    """
    one_hour_step = direction * pd.Timedelta(hours=1)
    hours_walked = 0
    while True:
        try:
            row_position = nam_unfilled_df.index.get_loc(adj_time)
        except KeyError:
            raise KeyError(
                "no NAM row at {} after {} hourly step(s); the search ran "
                "outside the available data".format(adj_time, hours_walked)
            )
        # Scalar access avoids building a whole row Series on every step.
        adj_value = nam_unfilled_df.iat[row_position, 0]
        # isinstance (rather than an exact type match) so that NumPy float NaNs
        # are recognised as missing as well as plain Python float NaNs.
        is_missing = isinstance(adj_value, (float, np.floating)) and math.isnan(adj_value)
        if not is_missing:
            return (adj_value, hours_walked)
        hours_walked += 1
        adj_time += one_hour_step

def generate_new_value(timestamp):
    """Linearly interpolate the value for ``timestamp`` from its neighbours.

    Uses ``find_closest_value`` to get the nearest value (and its distance in
    steps) in each direction, then blends the two, weighting by how far along
    the gap the timestamp sits.  When both distances are 0 the weight is fixed
    at 1, which avoids a division by zero.

    Args:
        timestamp: Timestamp, passed straight through to ``find_closest_value``.

    Returns:
        ``earlier + (later - earlier) * weight``, computed with NumPy.
    """
    earlier_value, hours_back = find_closest_value(timestamp, -1)
    later_value, hours_ahead = find_closest_value(timestamp, 1)

    gap = hours_back + hours_ahead
    if gap == 0:
        weight = 1
    else:
        weight = hours_back / gap

    step_up = np.subtract(later_value, earlier_value) * weight
    return np.add(earlier_value, step_up)

def fill_in_missing_time(slice_of_df):
    """Return a copy of one row with its first entry replaced by a generated value.

    Intended as the callback for ``DataFrame.apply(..., axis=1)``, where each
    row arrives as a Series whose ``name`` is the row's index label.

    Args:
        slice_of_df: Row Series; ``slice_of_df.name`` is used as the timestamp.

    Returns:
        A new Series equal to ``slice_of_df`` except that position 0 holds
        ``generate_new_value(timestamp)``.  The input row is left untouched.
    """
    timestamp = slice_of_df.name
    # Normalise raw NumPy datetimes to pandas Timestamps before the lookup.
    if isinstance(timestamp, np.datetime64):
        timestamp = pd.Timestamp(timestamp)
    # Work on a copy: pandas does not support apply() callbacks that mutate the
    # object they are handed.
    filled_row = slice_of_df.copy()
    filled_row.iloc[0] = generate_new_value(timestamp)
    return filled_row

In [None]:
# Fill the NaN placeholders: axis=1 hands each hourly row to
# fill_in_missing_time, and the returned rows form the interpolated frame.
nam_interpolated_df = nam_unfilled_df.apply(
    fill_in_missing_time,
    axis=1,
)

In [None]:
# Print the final NAM dataset, with generated data
# pprint(nam_interpolated_df[:24])

In [None]:
# Split the NAM data into training and testing.
# Start from an independent copy and tag every row with its 0-based position,
# which the train/test assignment is keyed on.
split_df = nam_interpolated_df.copy(deep=True)
split_df['numerical_index'] = range(len(split_df))

In [None]:
# Every 25th row (positions 0, 25, 50, ...) is held out for testing; all the
# remaining rows are used for training.
nam_training_df = split_df.loc[split_df['numerical_index'].mod(25).ne(0)]
nam_testing_df = split_df.loc[split_df['numerical_index'].mod(25).eq(0)]

In [None]:
# Drop the numerical_index helper column.
# The axis is named via columns= because pandas 2.0 no longer accepts it as a
# positional argument (drop('numerical_index', 1) raises TypeError there).
nam_training_df = nam_training_df.drop(columns='numerical_index')
nam_testing_df = nam_testing_df.drop(columns='numerical_index')

In [None]:
# Print the training and testing datasets out
# pprint(nam_testing_df[:25])
# pprint(nam_training_df[:25])

In [None]:
# Save the split training and testing data to disk
cleaned_data = "cleaned_data/"
training_data_filename = "nam_training_1_of_2.pkl"
testing_data_filename = "nam_testing_2_of_2.pkl"

# Writing fails if the target directory is missing; create it up front so the
# expensively interpolated data is not lost at the very last step.
os.makedirs(cleaned_data, exist_ok=True)

nam_training_df.to_pickle(cleaned_data + training_data_filename)
nam_testing_df.to_pickle(cleaned_data + testing_data_filename)
