In [1]:
from os import path
import pandas as pd
import numpy as np
import gc

In [2]:
# Obtain the NAM training data (the testing shards are read further down)
# NOTE: must have run the "nam_data_preparation.ipynb" notebook beforehand

# Naming scheme of the prepared data on disk: <cleaned_data><prefix><shard index><extension>
cleaned_data = "cleaned_data/"
training_data_filename = "nam_training_"
testing_data_filename = "nam_testing_"
extension = ".pkl.gz"

# Read the 32 training shards and concatenate them ONCE.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
training_parts = []
for i in range(32):
    training_parts.append(
        pd.read_pickle(cleaned_data + training_data_filename + str(i) + extension, compression='gzip')
    )
nam_training_df = pd.concat(training_parts)

# The shard list is no longer needed once the combined frame exists
del training_parts
gc.collect()

In [3]:
from pprint import pprint

# Show the first five and the last five rows of the NAM training dataset
for row_window in (slice(None, 5), slice(-5, None)):
    pprint(nam_training_df[row_window])

                                                                data_objects
timestamps                                                                  
2015-01-01 01:00:00-06:00  [[[0.0, 290.43628, 0.0, 73.0, 101516.0, 295.77...
2015-01-01 02:00:00-06:00  [[[0.0, 290.8083, 0.0, 74.0, 101502.0, 295.740...
2015-01-01 03:00:00-06:00  [[[0.0, 290.99026, 0.0, 75.0, 101467.0, 295.66...
2015-01-01 04:00:00-06:00  [[[0.0, 290.83884, 0.0, 75.0, 101436.0, 295.67...
2015-01-01 05:00:00-06:00  [[[0.0, 290.83582, 0.0, 75.0, 101412.0, 295.64...
                                                                data_objects
timestamps                                                                  
2018-12-31 17:00:00-06:00  [[[0.0, 283.83853, 268.59998, 58.5307, 101409....
2018-12-31 18:00:00-06:00  [[[0.0, 284.02292, 77.4, 58.640553, 101367.984...
2018-12-31 19:00:00-06:00  [[[0.0, 284.31033, 51.6, 59.009323, 101398.63,...
2018-12-31 20:00:00-06:00  [[[0.0, 284.59775, 25.8, 59.378094, 101429.28,...

In [4]:
# Obtain the ERCOT data (reduced hourly and split into train/test in a 24:1 ratio)

training_expected_output_filename = "ercot_for_nam_training"
testing_expected_output_filename = "ercot_for_nam_testing"

# Full paths of the two cached, NAM-aligned ERCOT files
ercot_training_path = cleaned_data + training_expected_output_filename + extension
ercot_testing_path = cleaned_data + testing_expected_output_filename + extension

# If we haven't already done this split, do it now and save the contents to disk
if not (path.exists(ercot_training_path) and path.exists(ercot_testing_path)):
    print("cleaning ERCOT data for NAM purposes")
    # Get generically cleaned ERCOT data, not specifically set up for use with NAM
    ercot_wind_power = pd.read_pickle("ercot_wind_power.pkl")

    # ERCOT data is every 15 minutes, while the NAM data is hourly
    # We therefore only use the ERCOT data on the hour, when working with NAM
    hourly_ercot = ercot_wind_power[::4]

    # Small change: NAM ends at 21:00 on Dec 31 2018, while ERCOT ends at 23:00
    # So cut out the last two entries of hourly_ercot
    hourly_ercot = hourly_ercot.iloc[:-2]

    # Split the ERCOT data to line up with the NAM data: every 25th hourly row
    # (positions 0, 25, 50, ...) goes to testing, the other 24 go to training.
    # A positional boolean mask avoids writing a helper column onto a sliced frame
    # (SettingWithCopyWarning) and the positional `axis` argument of DataFrame.drop,
    # which pandas 2.0 no longer accepts.
    is_testing_row = np.arange(len(hourly_ercot)) % 25 == 0
    ercot_training_df = hourly_ercot[~is_testing_row]
    ercot_testing_df = hourly_ercot[is_testing_row]
    del hourly_ercot
    gc.collect()

    # Save the ERCOT data to file
    ercot_training_df.to_pickle(ercot_training_path, compression='gzip')
    ercot_testing_df.to_pickle(ercot_testing_path, compression='gzip')
    del ercot_training_df
    del ercot_testing_df
    gc.collect()

# Get the NAM-specific ERCOT training data from disk (the testing half is read later)
ercot_training_df = pd.read_pickle(ercot_training_path, compression='gzip')

In [5]:
# Print the training dataset out
# First five and last five rows, restricted to the first three columns
for row_window in (slice(None, 5), slice(-5, None)):
    pprint(ercot_training_df.iloc[row_window, :3])

resource_code              ANACACHO_ANA  ASTRA_UNIT1  BCATWIND_WIND_1
timestamp                                                            
2015-01-01 01:00:00-06:00           0.0          NaN              0.0
2015-01-01 02:00:00-06:00           0.0          NaN              0.0
2015-01-01 03:00:00-06:00           0.0          NaN              0.0
2015-01-01 04:00:00-06:00           0.0          NaN              0.0
2015-01-01 05:00:00-06:00           0.0          NaN              0.0
resource_code              ANACACHO_ANA  ASTRA_UNIT1  BCATWIND_WIND_1
timestamp                                                            
2018-12-31 17:00:00-06:00      0.000000     33.30522         11.35381
2018-12-31 18:00:00-06:00      0.000000     36.93887         12.76649
2018-12-31 19:00:00-06:00      0.000000     38.98851         23.02475
2018-12-31 20:00:00-06:00      0.582387     39.37746         34.87904
2018-12-31 21:00:00-06:00      7.234724     39.16596         33.18754


In [6]:
# Flatten each 140*140*9 array, and stack those to create a matrix of data points
# Then train on that + the vector of ERCOT data values for a single farm

from sklearn import linear_model

# Can't call method in apply, so instead have a wrapper function to call the method
def flatten_wrapper(np_array):
    """Return the result of calling ``np_array.flatten()``.

    Exists so the method can be handed to ``Series.apply`` as a plain function.
    """
    flattened = np_array.flatten()
    return flattened

def reshape_regression_data(df):
    """Stack the arrays stored in ``df['data_objects']`` into one 2-D matrix.

    Every entry of the ``data_objects`` column is flattened to 1-D (row-major)
    and the flattened rows are stacked along axis 0, so the result has one row
    per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``data_objects`` column whose entries are array-likes that
        all flatten to the same length.

    Returns
    -------
    numpy.ndarray
        New array of shape ``(len(df), flattened_length)``; it does not share
        memory with the arrays stored in ``df``.

    Raises
    ------
    ValueError
        Raised by ``np.stack`` when ``df`` has no rows or the flattened lengths
        differ.
    """
    # np.ravel hands back a view where it can, whereas .flatten() always makes a
    # copy; np.stack copies into the output anyway, so the per-array copy was
    # pure overhead. The intermediate tuple / gc.collect() steps freed nothing
    # because the tuple still referenced every flattened array.
    flattened_rows = [np.ravel(data_object) for data_object in df['data_objects']]
    return np.stack(flattened_rows, axis=0)

def reshape_other_data(other_data):
    """Return ``other_data.values`` reshaped to ``(other_data.shape[0], 1)``.

    Turns a one-dimensional pandas object into a single-column 2-D array.
    """
    row_count = other_data.shape[0]
    column_vector = other_data.values.reshape(row_count, 1)
    return column_vector

def run_linear_regression(data1, data2):
    """Fit a ``linear_model.LinearRegression`` and return what ``fit`` returns.

    ``data1`` is passed to ``fit`` as the first argument (the inputs) and
    ``data2`` as the second (the expected outputs).
    """
    return linear_model.LinearRegression().fit(data1, data2)

In [None]:
# Flatten and stack the NAM training dataframe into a single 2-D NumPy matrix
# (one row per dataframe row)
reshaped_nam_training = reshape_regression_data(nam_training_df)
# Delete the original Pandas dataframe from memory.
# NOTE: after this the cell cannot be re-run without first reloading nam_training_df.
del nam_training_df
gc.collect()

hi
yo
hey
wassup


In [None]:
# Take the first ERCOT column (position 0) as the expected output and reshape it
# into an (n, 1) column array
reshaped_ercot_training_0 =  reshape_other_data(ercot_training_df.iloc[:,0])
# Delete the original Pandas dataframe from memory.
# NOTE: after this the cell cannot be re-run without first reloading ercot_training_df.
del ercot_training_df
gc.collect()

In [None]:
# Fit a linear regression from the flattened NAM matrix to the first ERCOT column.
# NOTE(review): both inputs must have the same number of rows - confirm the NAM and
# ERCOT training splits line up. The ERCOT preview above shows NaN in at least one
# column; verify the column used here contains none before fitting.
nam_training_model = run_linear_regression(reshaped_nam_training, reshaped_ercot_training_0)
# Only the fitted model is used from here on, so release the training arrays
del reshaped_nam_training
del reshaped_ercot_training_0
gc.collect()

In [None]:
# Inspect the coefficients of the fitted linear model
print(nam_training_model.coef_)

In [None]:
# Get the NAM testing information
# Read the 32 testing shards and concatenate them ONCE: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copied the accumulated
# frame on every iteration.
testing_parts = []
for i in range(32):
    testing_parts.append(
        pd.read_pickle(cleaned_data + testing_data_filename + str(i) + extension, compression='gzip')
    )
nam_testing_df = pd.concat(testing_parts)

# The shard list is no longer needed once the combined frame exists
del testing_parts
gc.collect()

In [None]:
# Show the first five and the last five rows of the NAM testing dataset
for row_window in (slice(None, 5), slice(-5, None)):
    pprint(nam_testing_df[row_window])

In [None]:
# Get the ERCOT testing information: the gzip-pickled testing half that the
# ERCOT split cell earlier in this notebook writes to disk
ercot_testing_df = pd.read_pickle(cleaned_data + testing_expected_output_filename + extension, compression='gzip')

In [None]:
# First five and last five testing rows, restricted to the first three columns
for row_window in (slice(None, 5), slice(-5, None)):
    pprint(ercot_testing_df.iloc[row_window, :3])

In [None]:
# Flatten and stack the NAM testing dataframe into a single 2-D NumPy matrix
# (one row per dataframe row)
reshaped_nam_testing = reshape_regression_data(nam_testing_df)
# Delete the original Pandas dataframe from memory.
# NOTE: after this the cell cannot be re-run without first reloading nam_testing_df.
del nam_testing_df
gc.collect()

In [None]:
# Take the first ERCOT testing column (position 0, the same column position used
# for training) and reshape it into an (n, 1) column array
reshaped_ercot_testing_0 = reshape_other_data(ercot_testing_df.iloc[:,0])
# Delete the original Pandas dataframe from memory.
# NOTE: after this the cell cannot be re-run without first reloading ercot_testing_df.
del ercot_testing_df
gc.collect()

In [None]:
# Evaluate the fitted model on the held-out testing data. Per scikit-learn's
# documentation, LinearRegression.score reports the coefficient of determination (R^2).
print(nam_training_model.score(reshaped_nam_testing, reshaped_ercot_testing_0))
# The testing arrays are not used after scoring, so release them
del reshaped_nam_testing
del reshaped_ercot_testing_0
gc.collect()

In [2]:
import pickle
# Save the linear regression model to a file
# NOTE: nam_training_model only exists after the fitting cell has run in this kernel;
# running this cell on a fresh kernel raises NameError.
filename = 'nam_basic_regression_model.sav'
# Use a context manager so the file handle is flushed and closed even if dump() raises
with open(filename, 'wb') as model_file:
    pickle.dump(nam_training_model, model_file)
# to later load the model from disk, run (only unpickle files you trust -
# pickle.load can execute arbitrary code):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

NameError: name 'nam_training_model' is not defined