In [1]:
%matplotlib notebook

import gc
import os
import shutil
from os import path
from pprint import pprint

import numpy as np
import pandas as pd
import requests

In [2]:
# Locations corresponding to each of the NAM forecasts (140x140x2)
# nam_locations = np.load("nam_locations.npy")

# The NAM grid stores each coordinate pair in the opposite latitude/longitude order from the wind farm coordinate data
# Reversing the last axis flips each pair so that both datasets use the same order
# nam_locations = nam_locations[:,:,::-1]

# np.save("nam_locations_flipped.npy", nam_locations)

In [11]:
# The 32 shards of the NAM weather forecasting model output (24049x140x140x9)
# Each entry covers the entire 140x140 grid, indicating forecasted value
# for 9 weather variables in each time stamp
# The meaning of the 9 variables that NAM predicts:
# NAM_FIELDS = [
#     "accum_precip",
#     "dewpoint_temperature_2m",
#     "downward_short_wave_flux",
#     "relative_humidity_2m",
#     "sea_level_pressure",
#     "temperature_2m",
#     "total_cloud_cover",
#     "u_wind_80m",
#     "v_wind_80m",
# ]

# Make sure the download directory exists so that a fresh checkout can run this cell
os.makedirs("nam_data", exist_ok=True)

all_nam_data = []
for i in range(32):
    data_file_name = "nam_data-000" + "{:02d}".format(i) + "-of-00032.npy"
    data_file_path = "nam_data/" + data_file_name
    if not path.exists(data_file_path):
        # Download the part file to a temporary ".part" name and only move it into
        # place once the transfer has finished, so that an interrupted download is
        # not mistaken for a complete shard the next time this cell runs
        partial_file_path = data_file_path + ".part"
        with requests.get("http://storage.googleapis.com/gridmatic/roscoe/" + data_file_name, stream=True) as r:
            # Fail loudly instead of saving an HTTP error page as a .npy file
            r.raise_for_status()
            # Ask urllib3 to undo any gzip/deflate transfer encoding on the raw stream
            r.raw.decode_content = True
            with open(partial_file_path, 'wb') as fout:
                shutil.copyfileobj(r.raw, fout)
        shutil.move(partial_file_path, data_file_path)
    # extend() iterates over the first axis of the shard, appending one entry at a time
    all_nam_data.extend(np.load(data_file_path))

In [12]:
# A one-dimensional list of the timestamps for the NAM inputs (24049x1)
# The dimension is the number of 1 HOUR INTERVALS in the time period
# NOTE(review): unpickling can execute arbitrary code, so only load a trusted copy of this file
nam_timestamps = pd.read_pickle("nam_timestamps.pkl")

In [13]:
# Pair every timestamp with its forecast entry and index the resulting frame by timestamp
nam_df = pd.DataFrame(
    {'timestamps': nam_timestamps, 'data_objects': all_nam_data}
).set_index('timestamps')

# The inputs are not needed under their own names any more, so drop them and collect
del nam_timestamps, all_nam_data
gc.collect()

7

In [14]:
# The NAM data is missing a lot of hours/dates
# Rather than cutting the ERCOT data down to match, we keep the full NAM dataset and
# generate new NAM entries so that it lines up more closely with the data from ERCOT
# Hourly data is the middle ground between days of missing NAM data and the
# 15-minute granularity of ERCOT

# Earlier technique, kept for reference only: pad each gap by copying the last known
# value into all of the consecutive missing hours that follow it
# nam_padded_df = nam_df.resample('H').pad();
# This has been replaced by linear interpolation, so it can be ignored

# Put the frame on an hourly grid; hours that had no entry get a row holding NaN
nam_unfilled_df = nam_df.resample('H').asfreq()

del nam_df
gc.collect()

11

In [15]:
import math
def find_closest_value(adj_time, direction):
    """Find the nearest non-missing entry at or beyond ``adj_time``.

    Reads the first column of the module-level ``nam_unfilled_df`` and walks
    away from ``adj_time`` one hour at a time until it reaches a row whose
    value is not a float NaN.

    Parameters:
        adj_time: timestamp to start from; it must be a label of
            ``nam_unfilled_df.index``.
        direction: ``-1`` to walk towards earlier rows, ``1`` to walk towards
            later rows.

    Returns:
        A ``(value, steps)`` tuple: the value that was found and the number of
        one-hour steps taken to reach it (``0`` when ``adj_time`` itself
        already holds a value).

    Raises:
        KeyError: if the walk reaches a timestamp that is not in the index,
            e.g. when there is no value left in that direction.
    """
    # Resolve the column, index and step once instead of on every iteration
    first_column = nam_unfilled_df.iloc[:, 0]
    index = nam_unfilled_df.index
    step = direction * pd.Timedelta(hours=1)

    steps_taken = 0
    adj_value = first_column.iloc[index.get_loc(adj_time)]
    # isinstance (rather than an exact type check) also recognises NaN held as a
    # float subclass such as numpy.float64; real entries are not floats at all
    while isinstance(adj_value, float) and math.isnan(adj_value):
        steps_taken += 1
        adj_time += step
        adj_value = first_column.iloc[index.get_loc(adj_time)]
    return (adj_value, steps_taken)

def generate_new_value(timestamp):
    """Linearly interpolate a value for ``timestamp`` from its nearest neighbours.

    Looks up the closest available value at or before ``timestamp`` and the
    closest one at or after it (via ``find_closest_value``), then blends the
    two, weighting by how far away the earlier value is relative to the whole
    gap.

    Parameters:
        timestamp: the timestamp to produce a value for.

    Returns:
        ``earlier + (later - earlier) * weight``, computed with numpy so that
        array-valued entries are interpolated element-wise.
    """
    earlier_value, hours_back = find_closest_value(timestamp, -1)
    later_value, hours_ahead = find_closest_value(timestamp, 1)

    gap = hours_back + hours_ahead
    if gap == 0:
        # Both searches stopped at the timestamp itself, so there is nothing to blend
        weight = 1
    else:
        weight = hours_back / gap

    difference = np.subtract(later_value, earlier_value)
    return np.add(earlier_value, difference * weight)

def fill_in_missing_time(slice_of_df):
    """Fill one row in with an interpolated value when its entry is missing.

    Intended to be used with ``DataFrame.apply(..., axis=1)``: the row arrives
    as a Series whose ``name`` is the row's timestamp and whose first element
    is the entry to check.

    Parameters:
        slice_of_df: the row to process; it is modified in place when its
            first element is a float NaN.

    Returns:
        The same row. Rows that already hold a value are returned untouched,
        which skips two neighbour searches and the interpolation arithmetic
        for them and guarantees that real data is never rewritten.
    """
    value = slice_of_df.iloc[0]
    # Missing hours are marked with a float NaN; anything else is real data
    if not (isinstance(value, float) and math.isnan(value)):
        return slice_of_df

    timestamp = slice_of_df.name
    # The row label may arrive as a raw numpy datetime; the timestamp arithmetic
    # done during the neighbour search expects a pandas Timestamp
    if isinstance(timestamp, np.datetime64):
        timestamp = pd.Timestamp(timestamp)
    slice_of_df.iloc[0] = generate_new_value(timestamp)
    return slice_of_df

In [16]:
# Run fill_in_missing_time over every row to generate data for the NaN entries
nam_interpolated_df = nam_unfilled_df.apply(
    fill_in_missing_time,
    axis='columns',
)

# The unfilled frame is not used under its own name any more
del nam_unfilled_df
gc.collect()

0

In [17]:
# Preview of the final NAM dataset, with generated data
# (the print is left disabled; uncomment it to inspect the first 24 rows)
# pprint(nam_interpolated_df[:24])
gc.collect()

20

In [18]:
# First step of the training/testing split: number the rows 0, 1, 2, ... in a helper column
# The column is added to nam_interpolated_df itself rather than to a copy
row_count = len(nam_interpolated_df)
nam_interpolated_df['numerical_index'] = range(row_count)

In [19]:
# Every 25th row (by numerical_index) goes to the testing set, the rest to the training set
is_testing_row = (nam_interpolated_df['numerical_index'] % 25) == 0
nam_testing_df = nam_interpolated_df[is_testing_row]
nam_training_df = nam_interpolated_df[~is_testing_row]

# Only the two splits are used from here on
del nam_interpolated_df
gc.collect()

60

In [20]:
# Drop the numerical_index helper column now that the split is done
# The column is named via columns= because pandas 2.0 no longer accepts axis as a positional argument
nam_training_df = nam_training_df.drop(columns='numerical_index')
nam_testing_df = nam_testing_df.drop(columns='numerical_index')

In [21]:
# Print the first rows of the testing dataset to confirm that it still holds its values
# after nam_interpolated_df has been deleted
# (swap the comments below to inspect the training dataset instead)
pprint(nam_testing_df[:25])
# pprint(nam_training_df[:25])

                                                                data_objects
timestamps                                                                  
2015-01-01 00:00:00-06:00  [[[0.0, 290.7572, 0.0, 75.0, 101402.0, 295.582...
2015-01-02 01:00:00-06:00  [[[0.0, 290.68527, 0.0, 77.0, 101398.0, 294.99...
2015-01-03 02:00:00-06:00  [[[0.0, 285.07336, 0.0, 59.0, 101692.0, 293.30...
2015-01-04 03:00:00-06:00  [[[0.0, 287.89383, 0.0, 68.0, 101669.0, 294.14...
2015-01-05 04:00:00-06:00  [[[0.0, 288.7345, 0.0, 66.0, 101515.0, 295.621...
2015-01-06 05:00:00-06:00  [[[0.0, 292.64038, 0.0, 80.0, 101687.0, 296.63...
2015-01-07 06:00:00-06:00  [[[0.0, 290.5164, 0.0, 76.0, 101596.0, 295.190...
2015-01-08 07:00:00-06:00  [[[0.0, 292.16656, 0.0, 81.0, 101441.0, 295.68...
2015-01-09 08:00:00-06:00  [[[0.25, 290.54825, 2.375, 73.0, 101470.0, 295...
2015-01-10 09:00:00-06:00  [[[0.0, 289.55475, 194.625, 70.0, 101595.0, 29...
2015-01-11 10:00:00-06:00  [[[0.0, 290.60452, 405.25, 76.0, 101868.0, 295...

In [22]:
# Path pieces used when saving the split training and testing data to disk
# A part file is named: cleaned_data + <filename prefix> + <part number> + extension
cleaned_data = "cleaned_data/"  # output directory (with trailing slash)
training_data_filename = "nam_training_"  # file name prefix for the training parts
testing_data_filename = "nam_testing_"  # file name prefix for the testing parts
extension = ".pkl.gz"  # the parts are written as gzip-compressed pickles

In [31]:
# Write the testing data out in 32 gzip-compressed pickle parts

# Make sure the output directory exists, so that the export does not fail after all of the work above
os.makedirs(cleaned_data, exist_ok=True)

part_length = nam_testing_df.shape[0] // 32
for i in range(32):
    # The last part runs to the end of the frame, so it also takes the rows left over by the integer division
    part_end = None if i == 31 else part_length * (i + 1)
    testing_data_part = nam_testing_df.iloc[part_length * i:part_end]
    testing_data_part.to_pickle(cleaned_data + testing_data_filename + str(i) + extension, compression='gzip')
    # Release each part before slicing the next one
    del testing_data_part
    gc.collect()

In [32]:
# Write the training data out in 32 gzip-compressed pickle parts

# Make sure the output directory exists, so that the export does not fail after all of the work above
os.makedirs(cleaned_data, exist_ok=True)

part_length = nam_training_df.shape[0] // 32
for i in range(32):
    # The last part runs to the end of the frame, so it also takes the rows left over by the integer division
    part_end = None if i == 31 else part_length * (i + 1)
    training_data_part = nam_training_df.iloc[part_length * i:part_end]
    training_data_part.to_pickle(cleaned_data + training_data_filename + str(i) + extension, compression='gzip')
    # Release each part before slicing the next one
    del training_data_part
    gc.collect()