In [None]:
import pickle
import numpy as np
import pandas as pd

In [7]:
# Read the pickled dictionary that holds all of the data used in this notebook.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only run this against a file from a trusted source.
with open('../data/subx_all_data.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [8]:
# Show which entries the loaded dictionary provides.
print(data.keys())

# Pull the three arrays out of the dictionary under short names.
x = data['x']
y = data['y']
rand_inds = data['rand_inds']

# x holds the ensemble rainfall predictions, described by the original notes as
# 3 day (overlapping) averages.
# Before use with the proposed method the author computed its log+1 transform
# (i.e. x = log(1 + x)) and also standardized it. The data in the pickle has
# been neither log-transformed nor standardized.
# Axes of x, in order:
#   0: weekly model simulations (939)
#   1: models in the ensemble (11)
#   2: numerical model rainfall predictions (7)
#      NOTE(review): the original notes label this axis "7 day averages" while
#      the summary above says 3 day averages - confirm which one is meant.
#   3: spatial height (29)
#   4: spatial width (59)
print('x shape: ', x.shape)

# y holds the observed 3 day average rainfall 10 days in advance.
# No further transformations were applied before use with the proposed method.
# Axes of y, in order:
#   0: weekly model simulations (939)
#   1: axis to ignore (dropped with y[:, 0] where y is used)
#   2: spatial height (29)
#   3: spatial width (59)
print('y shape: ', y.shape)


# Ensemble mean: keep the last entry along axis 2 of x, then average over the
# model axis (axis 1) while ignoring NaNs.
last_step_preds = x[:, :, -1]
ens_mean = np.nanmean(last_step_preds, axis=1)

# Mean squared error of the ensemble mean against y (NaNs ignored).
squared_err = np.square(ens_mean - y[:, 0])
print('Ensemble mean error: ', np.nanmean(squared_err))

dict_keys(['rand_inds', 'y', 'x'])
x shape:  (939, 11, 7, 29, 59)
y shape:  (939, 1, 29, 59)
Ensemble mean error:  15.622858


In [9]:
# Reorder x and y along axis 0 using the index column selected by `seed`.
# `seed` is a 0-based column index into rand_inds, so seed = 4 picks the column
# at index 4. Presumably each column is a precomputed random ordering of the
# samples - confirm against how rand_inds was generated.
seed = 4
shuffle_order = rand_inds[:, seed]
shuffled_x = x[shuffle_order]
shuffled_y = y[shuffle_order]

In [10]:
# Divide the shuffled samples (axis 0) into consecutive train, validation and
# test sets: the first train_size samples, the next val_size, then the rest.
train_size = 450
val_size = 250
test_size = shuffled_x.shape[0] - train_size - val_size
assert test_size >= 0, 'train_size + val_size exceeds the number of samples'

# Slice with explicit boundaries. The test set starts where validation ends
# instead of at -test_size: a[-0:] is the same as a[0:], so a test_size of 0
# would otherwise return every sample as "test" data.
val_end = train_size + val_size
x_train, x_val, x_test = shuffled_x[:train_size], shuffled_x[train_size:val_end], shuffled_x[val_end:]
y_train, y_val, y_test = shuffled_y[:train_size], shuffled_y[train_size:val_end], shuffled_y[val_end:]


print('training data: ', x_train.shape, y_train.shape)
print('validation data: ', x_val.shape, y_val.shape)
print('testing data: ', x_test.shape, y_test.shape)

training data:  (450, 11, 7, 29, 59) (450, 1, 29, 59)
validation data:  (250, 11, 7, 29, 59) (250, 1, 29, 59)
testing data:  (239, 11, 7, 29, 59) (239, 1, 29, 59)


In [12]:
# Summary statistics over every individual value in the test targets.
# describe() leaves NaN entries out, so `count` can be smaller than y_test.size.
y_test_flat = y_test.flatten()
df = pd.DataFrame(y_test_flat)
df.describe()

Unnamed: 0,0
count,221356.0
mean,2.054091
std,4.032431
min,0.0
25%,0.008871
50%,0.417186
75%,2.333072
max,125.454285
