### Imports and Paths

In [None]:
#imports
#from google.colab import drive
from scipy.io import loadmat
import numpy as np
import math
import os
import time
import datetime
import itertools
import h5py


# The data folder is expected to sit next to the directory the notebook runs from.
cwd = os.getcwd()
pardir = os.path.dirname(cwd)
data_dir = os.path.join(pardir, "data/")

# On Colab, mount Drive and point data_dir at the dataset folder instead:
#drive.mount('/content/drive')
#data_dir = "/content/drive/MyDrive/Stanford-solar-forecasting-dataset/data/"

# data_path is the processed input file; forecast_data_path is the
# forecast-format file this notebook writes to.
data_path, forecast_data_path = (
    os.path.join(data_dir, file_name)
    for file_name in (
        "2017_2019_images_pv_processed.hdf5",
        "2017_2019_images_pv_processed_forecast.hdf5",
    )
)

print(data_path, forecast_data_path, sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import test data


In [None]:
print("Starting test data import...")

# Ground-truth timestamps of the test split (loaded with allow_pickle=True).
times_test = np.load(data_dir+"times_test.npy", allow_pickle=True)

# Read every test array fully into memory and cast it to float32 right away.
with h5py.File(data_path,'r') as f:
    images_log_test = f['test']['images_log'][...].astype('float32')
    pv_log_test = f['test']['pv_log'][...].astype('float32')
    sun_pos_log_test = f['test']['sun_pos'][...].astype('float32')
    # sampi is flattened to one value per sample
    sampi_log_test = f['test']['sampi'][...].astype('float32').reshape(-1)

# Quick sanity check of what was loaded.
for shape_label, loaded in (
    ("times test shape:", times_test),
    ("test images log shape:", images_log_test),
    ("test sun pos log shape:", sun_pos_log_test),
    ("test sampi log shape:", sampi_log_test),
    ("test pv log shape:", pv_log_test),
):
    print(shape_label, loaded.shape)

print('All test data laoded...')

Starting test data import...
times test shape: (14003,)
test images log shape: (14003, 64, 64, 3)
test sun pos log shape: (14003, 2)
test sampi log shape: (14003,)
test pv log shape: (14003,)
All test data laoded...


In [None]:
# Calendar date of every test timestamp, plus the sorted set of distinct dates.
dates_test = np.asarray(list(map(lambda stamp: stamp.date(), times_test)))
all_dates_test = np.unique(dates_test)


## Build forecast-format test data

In [None]:
#add new forecast test times
# For every date keep every second timestamp (indices 0, 2, 4, ... within the day).
# NOTE: the stride of 2 must stay equal to `update_rate`, which is set in a later
# cell and used to build the forecast samples these timestamps belong to.
forecast_time_chunks = [
    times_test[dates_test == date][::2]
    for date in all_dates_test
]

# Concatenate once instead of growing the array inside the loop; fall back to an
# empty slice so the variable is defined even when there are no dates at all.
if forecast_time_chunks:
    forecast_times_test = np.concatenate(forecast_time_chunks, axis=0)
else:
    forecast_times_test = times_test[:0]

np.save(os.path.join(data_dir, "forecast_times_test.npy"), forecast_times_test)

In [None]:
# Forecast-sample settings (all expressed in number of samples).
forecast_lead_time = 15   # length of the look-back pad and offset of the pv target
update_rate = 2           # stride between consecutive forecast anchors within a day
sampling_freq = 1

In [None]:
## creating pv, sun pos sampi and images log test and pv pred test for forecasting task
#
# Within each date, a forecast sample is anchored at index t = 0, update_rate,
# 2*update_rate, ... and consists of
#   * the log window [t - forecast_lead_time, ..., t] (forecast_lead_time + 1 entries),
#     zero-padded where it would reach before the first sample of the day, and
#   * the pv target at t + forecast_lead_time (0 when that is past the end of the day).
# Row k of a date therefore lines up with the k-th entry of that date's
# timestamps taken with stride update_rate.

history_len = forecast_lead_time + 1

# First pass: count the forecast samples so the outputs can be allocated once
# (growing the multi-GB image array with np.concatenate per date is quadratic).
n_samples_per_date = [
    math.ceil(np.count_nonzero(dates_test == date) / update_rate)
    for date in all_dates_test
]
n_forecast_test = sum(n_samples_per_date)

# Zero-initialised so that padded window slots and out-of-day targets are 0.
forecast_images_log_test = np.zeros((n_forecast_test, history_len) + images_log_test.shape[1:], dtype='float32')
forecast_pv_log_test = np.zeros((n_forecast_test, history_len), dtype='float32')
forecast_sun_pos_log_test = np.zeros((n_forecast_test, history_len) + sun_pos_log_test.shape[1:], dtype='float32')
forecast_sampi_log_test = np.zeros((n_forecast_test, history_len), dtype='float32')
forecast_pv_pred_test = np.zeros((n_forecast_test,), dtype='float32')

row = 0  # next output row to fill
for d, date in enumerate(all_dates_test):
    print("date ", d)
    date_mask = (dates_test == date)

    #get all data from date
    images_date = images_log_test[date_mask]
    pv_date = pv_log_test[date_mask]
    sun_pos_date = sun_pos_log_test[date_mask]
    sampi_date = sampi_log_test[date_mask]
    n_date = images_date.shape[0]

    # one sample per anchor t; range() yields exactly ceil(n_date / update_rate) anchors
    for t in range(0, n_date, update_rate):
        first = max(0, t - forecast_lead_time)   # clip the window at the start of the day
        n_valid = t - first + 1                  # real (non-padded) entries in this window

        # real entries go to the END of the window; leading slots stay zero
        forecast_images_log_test[row, -n_valid:] = images_date[first:t + 1]
        forecast_pv_log_test[row, -n_valid:] = pv_date[first:t + 1]
        forecast_sun_pos_log_test[row, -n_valid:] = sun_pos_date[first:t + 1]
        forecast_sampi_log_test[row, -n_valid:] = sampi_date[first:t + 1]

        # pv target forecast_lead_time steps ahead; left at 0 past the end of the day
        target = t + forecast_lead_time
        if target < n_date:
            forecast_pv_pred_test[row] = pv_date[target]

        row += 1

assert row == n_forecast_test, "forecast sample count mismatch"

date  0
date  1
date  2
date  3
date  4
date  5
date  6


KeyboardInterrupt: ignored

In [None]:
# (Re)write the 'test' group of the forecast file.
with h5py.File(forecast_data_path,'a') as f:
    # Drop a previous version of the group only if it exists, so the cell also
    # works on a freshly created file (an unconditional `del` raises KeyError).
    # NOTE(review): deleting a group may not shrink the file on disk - confirm
    # whether repacking is needed after repeated runs.
    if 'test' in f:
        del f['test']

    test = f.create_group('test')

    # create each dataset and write its contents in one call, stored as float32 ('f')
    test.create_dataset('images_log', data=forecast_images_log_test, dtype='f')
    test.create_dataset('pv_log', data=forecast_pv_log_test, dtype='f')
    test.create_dataset('sun_pos_log', data=forecast_sun_pos_log_test, dtype='f')
    test.create_dataset('sampi_log', data=forecast_sampi_log_test, dtype='f')

    test.create_dataset('pv_pred', data=forecast_pv_pred_test, dtype='f')


print('test data written...')

In [None]:
# Report the shape of every forecast-format test array.
for shape_label, forecast_array in (
    ("forecast times test shape:", forecast_times_test),
    ("test images log shape:", forecast_images_log_test),
    ("test sun pos log shape:", forecast_sun_pos_log_test),
    ("test sampi log shape:", forecast_sampi_log_test),
    ("test pv log shape:", forecast_pv_log_test),
    ("test pv pred shape:", forecast_pv_pred_test),
):
    print(shape_label, forecast_array.shape)

## Import trainval data


In [None]:
print("Starting trainval data import...")

# Ground-truth timestamps of the trainval split (loaded with allow_pickle=True).
times_trainval = np.load(data_dir+"times_trainval.npy", allow_pickle=True)

# Read every trainval array fully into memory and cast it to float32 right away.
with h5py.File(data_path,'r') as f:
    images_log_trainval = f['trainval']['images_log'][...].astype('float32')
    pv_log_trainval = f['trainval']['pv_log'][...].astype('float32')
    sun_pos_log_trainval = f['trainval']['sun_pos'][...].astype('float32')
    # sampi is flattened to one value per sample
    sampi_log_trainval = f['trainval']['sampi'][...].astype('float32').reshape(-1)

print('All trainval data laoded...')

In [None]:
# Calendar date of every trainval timestamp, plus the sorted set of distinct dates.
dates_trainval = np.asarray(list(map(lambda stamp: stamp.date(), times_trainval)))
all_dates_trainval = np.unique(dates_trainval)

## Build forecast-format trainval data

In [None]:
#add new forecast trainval times
# For every date keep every second timestamp (indices 0, 2, 4, ... within the day).
# NOTE: the stride of 2 must stay equal to `update_rate`, which is set in a later
# cell and used to build the forecast samples these timestamps belong to.
forecast_time_chunks = [
    times_trainval[dates_trainval == date][::2]
    for date in all_dates_trainval
]

# Concatenate once instead of growing the array inside the loop; fall back to an
# empty slice so the variable is defined even when there are no dates at all.
if forecast_time_chunks:
    forecast_times_trainval = np.concatenate(forecast_time_chunks, axis=0)
else:
    forecast_times_trainval = times_trainval[:0]

np.save(os.path.join(data_dir, "forecast_times_trainval.npy"), forecast_times_trainval)


In [None]:
# Forecast-sample settings (all expressed in number of samples).
forecast_lead_time = 15   # length of the look-back pad and offset of the pv target
update_rate = 2           # stride between consecutive forecast anchors within a day
sampling_freq = 1

In [None]:
## creating pv, sun pos sampi and images log trainval and pv pred trainval for forecasting task
#
# Within each date, a forecast sample is anchored at index t = 0, update_rate,
# 2*update_rate, ... and consists of
#   * the log window [t - forecast_lead_time, ..., t] (forecast_lead_time + 1 entries),
#     zero-padded where it would reach before the first sample of the day, and
#   * the pv target at t + forecast_lead_time (0 when that is past the end of the day).
# Row k of a date therefore lines up with the k-th entry of that date's
# timestamps taken with stride update_rate.

history_len = forecast_lead_time + 1

# First pass: count the forecast samples so the outputs can be allocated once
# (growing the image array with np.concatenate per date is quadratic).
n_samples_per_date = [
    math.ceil(np.count_nonzero(dates_trainval == date) / update_rate)
    for date in all_dates_trainval
]
n_forecast_trainval = sum(n_samples_per_date)

# Zero-initialised so that padded window slots and out-of-day targets are 0.
forecast_images_log_trainval = np.zeros((n_forecast_trainval, history_len) + images_log_trainval.shape[1:], dtype='float32')
forecast_pv_log_trainval = np.zeros((n_forecast_trainval, history_len), dtype='float32')
forecast_sun_pos_log_trainval = np.zeros((n_forecast_trainval, history_len) + sun_pos_log_trainval.shape[1:], dtype='float32')
forecast_sampi_log_trainval = np.zeros((n_forecast_trainval, history_len), dtype='float32')
forecast_pv_pred_trainval = np.zeros((n_forecast_trainval,), dtype='float32')

row = 0  # next output row to fill
for d, date in enumerate(all_dates_trainval):
    print("date ", d)
    date_mask = (dates_trainval == date)

    #get all data from date
    images_date = images_log_trainval[date_mask]
    pv_date = pv_log_trainval[date_mask]
    sun_pos_date = sun_pos_log_trainval[date_mask]
    sampi_date = sampi_log_trainval[date_mask]
    n_date = images_date.shape[0]

    # one sample per anchor t; range() yields exactly ceil(n_date / update_rate) anchors
    for t in range(0, n_date, update_rate):
        first = max(0, t - forecast_lead_time)   # clip the window at the start of the day
        n_valid = t - first + 1                  # real (non-padded) entries in this window

        # real entries go to the END of the window; leading slots stay zero
        forecast_images_log_trainval[row, -n_valid:] = images_date[first:t + 1]
        forecast_pv_log_trainval[row, -n_valid:] = pv_date[first:t + 1]
        forecast_sun_pos_log_trainval[row, -n_valid:] = sun_pos_date[first:t + 1]
        forecast_sampi_log_trainval[row, -n_valid:] = sampi_date[first:t + 1]

        # pv target forecast_lead_time steps ahead; left at 0 past the end of the day
        target = t + forecast_lead_time
        if target < n_date:
            forecast_pv_pred_trainval[row] = pv_date[target]

        row += 1

assert row == n_forecast_trainval, "forecast sample count mismatch"

In [None]:
# (Re)write the 'trainval' group of the forecast file.
with h5py.File(forecast_data_path,'a') as f:
    # Drop a previous version of the group only if it exists, so the cell also
    # works on a freshly created file (an unconditional `del` raises KeyError).
    # NOTE(review): deleting a group may not shrink the file on disk - confirm
    # whether repacking is needed after repeated runs.
    if 'trainval' in f:
        del f['trainval']

    trainval = f.create_group('trainval')

    # create each dataset and write its contents in one call, stored as float32 ('f')
    trainval.create_dataset('images_log', data=forecast_images_log_trainval, dtype='f')
    trainval.create_dataset('pv_log', data=forecast_pv_log_trainval, dtype='f')
    trainval.create_dataset('sun_pos_log', data=forecast_sun_pos_log_trainval, dtype='f')
    trainval.create_dataset('sampi_log', data=forecast_sampi_log_trainval, dtype='f')

    trainval.create_dataset('pv_pred', data=forecast_pv_pred_trainval, dtype='f')


print('trainval data written...')


In [None]:
# Report the shape of every forecast-format trainval array.
for shape_label, forecast_array in (
    ("forecast times trainval shape:", forecast_times_trainval),
    ("trainval images log shape:", forecast_images_log_trainval),
    ("trainval sun pos log shape:", forecast_sun_pos_log_trainval),
    ("trainval sampi log shape:", forecast_sampi_log_trainval),
    ("trainval pv log shape:", forecast_pv_log_trainval),
    ("trainval pv pred shape:", forecast_pv_pred_trainval),
):
    print(shape_label, forecast_array.shape)