# Weather --> CNN --> LSTM --> Streamflow

# Preamble

In [1]:
#first: are you working in colab?
colab = 1

if colab:
    
    #mount drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    #define path to google drive data
    dataPath = '/content/drive/My Drive/Colab Notebooks/T_P_F_pca_lstm/'

    #download required libraries that are not already in colab
    !pip install geopandas
    
else:
    
    dataPath = ''

ModuleNotFoundError: No module named 'google.colab'

In [None]:
#import required libraries

import pandas as pd
import numpy as np
#from netCDF4 import Dataset
import keras
from keras.layers import Conv2D, MaxPooling2D, Dense, LSTM, Flatten, TimeDistributed, Dropout, Input
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from keras import Model, Sequential, regularizers
import keras
import pickle
import matplotlib.pyplot as plt
import geopandas as gpd
from sklearn.cluster import KMeans
from scipy import interpolate
import time
import seaborn as sns

In [None]:
#define functions that we'll use

def nse(y_obs, y_model):

  """
  NSE = nse(y_obs, y_model)

  Nash-Sutcliffe efficiency: 1 - sum((model - obs)^2) / sum((obs - mean(obs))^2)

  y_obs, y_model --> array-likes (numpy arrays, lists, ...) holding the same number of values; 
                     both are flattened to N x 1 columns in their stored order before comparison
  returns --> scalar NSE (1 is a perfect fit); if y_obs is constant the denominator is zero and
              numpy returns inf/nan rather than raising
  raises --> ValueError if the two inputs do not hold the same number of values
  """

  #np.asarray lets plain sequences through (a bare .reshape only exists on arrays)
  y_model = np.asarray(y_model).reshape((-1,1))
  y_obs = np.asarray(y_obs).reshape((-1,1))

  #a length-1 input would otherwise broadcast silently against the other series
  if y_model.shape != y_obs.shape:
    raise ValueError('y_obs and y_model must contain the same number of values')

  residual_ss = np.sum((y_model - y_obs)**2)
  total_ss = np.sum((y_obs - np.mean(y_obs))**2)
  return 1 - residual_ss / total_ss

def nse_rolling(y_obs, y_model, window, stride = 1):

  """
  NSE_rolling = nse_rolling(y_obs, y_model, window, stride)

  y_obs, y_model --> array-likes of the same length (1 x N or N x 1) where N is the number of observations in time
  window --> this is the length of time over which to compute NSE, which will roll across the total time period
  stride --> default stride = 1; length of step to take when rolling (i.e. stride = 365 computes yearly NSE with no overlap)
  returns --> list with one NSE value per full window, in time order (empty if the series is shorter than window)
  """

  NSE_rolling = []

  y_model = np.asarray(y_model).reshape((-1,1))
  y_obs = np.asarray(y_obs).reshape((-1,1))

  #the last admissible start index is len - window, so the exclusive stop is len - window + 1;
  #stopping at len - window skipped the final full window (and gave no window at all when len == window)
  startInds = range(0, len(y_model) - window + 1, stride)
  for startInd in startInds:
    y_model_window = y_model[startInd:startInd+window]
    y_obs_window = y_obs[startInd:startInd+window]
    NSE_rolling.append(nse(y_obs_window, y_model_window))

  return NSE_rolling

def plot_AB(prov='AB', provshapes_filename='/content/drive/My Drive/Colab Notebooks/cnn_lstm_era/PROVINCE.SHP'):

    """
    plot borders of alberta on the current matplotlib axes (black line)

    prov --> currently unused: the polygon drawn is always the first record (index 0) of the shapefile
    provshapes_filename --> path of the province shapefile; defaults to the original Google Drive location,
                            pass another path when running outside colab

    example:
    import geopandas as gpd
    import matplotlib.pyplot as plt
    plot_AB()
    plt.show()
    """

    provIndex=0
    provshapes = gpd.read_file(provshapes_filename)
    provPoly = provshapes['geometry'][provIndex]

    #only the outer ring of the polygon is drawn
    lonBorder,latBorder = provPoly.exterior.coords.xy

    plt.plot(lonBorder,latBorder,'k')

# Load data and preprocess

In [None]:
#load data

def _load_pickle(filename):
    """Unpickle dataPath + filename and close the file handle afterwards."""
    #NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
    with open(dataPath + filename,'rb') as pickle_in:
        return pickle.load(pickle_in)

flowDict = _load_pickle('flowDict.pickle')
tempDict = _load_pickle('tempDict_AB_ERA5.pickle')
precDict = _load_pickle('precDict_AB_ERA5.pickle')
spcHDict = _load_pickle('spcHDict_AB_ERA5.pickle')
ssrdDict = _load_pickle('ssrdDict_AB_ERA5.pickle')

#unpack data

#coordinates of the flow stations and of the ERA grid
stationLat = flowDict['stationLat']
stationLon = flowDict['stationLon']
eraLat = tempDict['latERA']
eraLon = tempDict['lonERA']

#date stamps of the flow record and of the ERA record
flowDays = flowDict['windowDays']
flowMonths = flowDict['windowMonths']
flowYears = flowDict['windowYears']
eraDays = tempDict['daysERA']
eraMonths = tempDict['monthsERA']
eraYears = tempDict['yearsERA']

#fields
F = flowDict['all_flowwindow_norm_NF'] #normalized discharge with nans filled (NF)
T = tempDict['T']
Tmax = tempDict['Tmax']
Tmin = tempDict['Tmin']
P = precDict['P']
H = spcHDict['H']
S = ssrdDict['S']

In [None]:
#set nan values to mean values of field (ie: out of province values)

def _fill_nan(field, fill_value):
    """Return a copy of field in which every NaN is replaced by fill_value."""
    filled = np.copy(field)
    filled[np.isnan(filled)] = fill_value
    return filled

#field means ignoring NaNs; computed once and reused as the fill values below
meanT = np.nanmean(T)
meanTmax = np.nanmean(Tmax)
meanTmin = np.nanmean(Tmin)
meanP = np.nanmean(P)
meanH = np.nanmean(H)
meanS = np.nanmean(S)

Tall = _fill_nan(T, meanT)
T = Tall

Tmaxall = _fill_nan(Tmax, meanTmax)
Tmax = Tmaxall

Tminall = _fill_nan(Tmin, meanTmin)
Tmin = Tminall

Pall = _fill_nan(P, meanP)
P = Pall

Hall = _fill_nan(H, meanH)
H = Hall

Sall = _fill_nan(S, meanS)
S = Sall

In [None]:
#make data have same time range
#the common record starts in the later of the two first years
startYear = max(int(np.min(eraYears)),int(np.min(flowYears)))

#position of the first entry of that year in each record
indStartERA = min(np.argwhere(eraYears==startYear))[0]
indStartFlow = min(np.argwhere(flowYears==startYear))[0]

#trim the flow record, drop singleton axes and transpose it
#NOTE(review): the slice is taken along the first axis *before* the transpose -- confirm that axis is time
F = np.asarray(np.transpose(np.squeeze(F[indStartFlow:])))

#trim every ERA field along its first axis
T, Tmax, Tmin, P, H, S = [np.asarray(field[indStartERA:]) for field in (T, Tmax, Tmin, P, H, S)]

##just alberta
#T = T[:,:15,29:]
#P = P[:,:15,29:]
#H = H[:,:15,28:]
#S = S[:,:15,28:]

#trim the date stamps the same way
flowDays, flowMonths, flowYears = [stamps[indStartFlow:] for stamps in (flowDays, flowMonths, flowYears)]
eraDays, eraMonths, eraYears = [stamps[indStartERA:] for stamps in (eraDays, eraMonths, eraYears)]

In [None]:
#sanity check: shapes of the flow array and of the T, P, H, S fields after trimming
print(*[np.shape(arr) for arr in (F, T, P, H, S)])

In [None]:
#prep data: standardize

#indices of testing/training
trainStartYear = 1987
trainFinYear = 2005
testStartYear = 2006
testFinYear = 2010

trainInds = np.squeeze(np.argwhere((flowYears>=trainStartYear) & (flowYears<=trainFinYear)))
testInds = np.squeeze(np.argwhere((flowYears>=testStartYear) & (flowYears<=testFinYear)))

def _standardize(field, train_idx):
    """Return (float32 standardized field, mean, std), with mean/std taken over field[train_idx] only."""
    train_mean = np.mean(field[train_idx])
    train_std = np.std(field[train_idx])
    return np.single((field - train_mean)/train_std), train_mean, train_std

#standardize variables individually (normalize wrt training period), then save as 32-bit rather than 64-bit for space
#NOTE(review): trainInds comes from flowYears but also indexes the ERA fields -- this assumes both
#records are aligned entry-for-entry after trimming; confirm
Tnorm, Tmean_train, Tstd_train = _standardize(T, trainInds)
Tmaxnorm, Tmaxmean_train, Tmaxstd_train = _standardize(Tmax, trainInds)
#Tminnorm is now built from Tmin (it used to be overwritten with a float32 copy of Tnorm)
Tminnorm, Tminmean_train, Tminstd_train = _standardize(Tmin, trainInds)
Pnorm, Pmean_train, Pstd_train = _standardize(P, trainInds)
Hnorm, Hmean_train, Hstd_train = _standardize(H, trainInds)
Snorm, Smean_train, Sstd_train = _standardize(S, trainInds)

#flow statistics skip the first 366 training entries
Fnorm, Fmean_train, Fstd_train = _standardize(F, trainInds[366:])

In [None]:
#normalize flow
#rescale every station (column of F) in place: subtract the column minimum and divide by
#(2 * column std - column minimum)
#NOTE(review): Fnorm, which supplies the training/testing targets, is computed from F in the
#preceding cell, so this rescaling does not reach the targets; it also modifies F in place,
#so running the cell twice rescales twice

n_station_columns = np.shape(F)[1]
for station in range(n_station_columns):
    station_flow = F[:,station]
    minF = station_flow.min()
    maxF = 2 * station_flow.std()
    F[:,station] = (station_flow - minF) / (maxF - minF)

In [None]:
#construct train and test predictor/target tensors

#target data: one row per sample (day) and one column per station, i.e. the same
#(n_samples x n_stations) layout as the output of the network's final Dense layer and the same
#leading dimension as x_train / x_test (stacking station by station gave the transpose,
#whose first axis counts stations instead of samples)
#NOTE(review): the training targets run from index 366 to len(trainInds)+366, i.e. 366 entries
#past the last training index -- confirm this overlap with the test years is intended
y_train = np.copy(Fnorm[366:len(trainInds)+366,:])
y_test = Fnorm[testInds[1:],:]

#first, make (n_time x n_lon x n_lat x n_vars) tensor
#stacking along a new last axis takes the dimensions from the data instead of hard-coding them
x_intermediate = np.stack([Tnorm, Pnorm, Hnorm, Snorm], axis=-1).astype('single', copy=False)
x_train_intermediate = x_intermediate[trainInds]
x_test_intermediate = x_intermediate[testInds]

In [None]:
#now, convert x_intermediate into (n_time x 365 x n_lon x n_lat x n_vars) tensor
#dimensions are read from x_intermediate rather than repeated as literals
n_time, n_rows, n_cols, n_vars = x_intermediate.shape
window_days = 365 #number of consecutive days of weather fed to the network per sample

#NOTE(review): np.empty leaves these arrays uninitialized; the window-filling cells below write
#into x_train and x_test only -- confirm x is populated before it is used
x = np.empty((n_time-window_days,window_days,n_rows,n_cols,n_vars),dtype='single')
x_train = np.empty((len(trainInds),window_days,n_rows,n_cols,n_vars),dtype='single')
x_test = np.empty((len(testInds)-1,window_days,n_rows,n_cols,n_vars),dtype='single')

In [None]:
#training samples 0-999: sample k holds the 365 consecutive entries of x_intermediate starting at k
for sample in range(0, 1000):
    x_train[sample] = x_intermediate[sample:sample + 365]

In [None]:
#training samples 1000-1999: sample k holds the 365 consecutive entries of x_intermediate starting at k
for sample in range(1000, 2000):
    x_train[sample] = x_intermediate[sample:sample + 365]

In [None]:
#training samples 2000-2999: sample k holds the 365 consecutive entries of x_intermediate starting at k
for sample in range(2000, 3000):
    x_train[sample] = x_intermediate[sample:sample + 365]

In [None]:
#training samples 3000-3999: sample k holds the 365 consecutive entries of x_intermediate starting at k
for sample in range(3000, 4000):
    x_train[sample] = x_intermediate[sample:sample + 365]

In [None]:
#remaining training samples (4000 up to the number of training indices)
n_train_samples = len(trainInds)
for sample in range(4000, n_train_samples):
    x_train[sample] = x_intermediate[sample:sample + 365]

In [None]:
#test samples 0-999: sample k holds the 365 entries of x_intermediate that end just before
#position k + len(trainInds)
for sample in range(0, 1000):
    window_end = sample + len(trainInds)
    x_test[sample] = x_intermediate[window_end - 365:window_end]

In [None]:
#test samples 1000 onward: sample ii holds the 365 entries of x_intermediate that end just before
#position ii + len(trainInds)
#the stop is clamped to the number of rows in x_test: x_test has len(testInds)-1 rows, which for a
#daily 2006-2010 test period is fewer than 2000, so a fixed range(1000,2000) runs past the end
#of x_test and raises IndexError
for ii in range(1000,min(2000,len(x_test))):
    x_test[ii] = x_intermediate[ii+len(trainInds)-365:ii+len(trainInds)]

In [None]:
#test samples from 2000 up to the last row of x_test (nothing to do when there are fewer than 2000 rows)
last_test_sample = len(testInds) - 1
for sample in range(2000, last_test_sample):
    window_end = sample + len(trainInds)
    x_test[sample] = x_intermediate[window_end - 365:window_end]

In [None]:
#CNN model
print('Building model...')

def _td_conv(filters, kernel_size, **td_kwargs):
    """Conv2D (relu, 'same' padding, channels_last) wrapped in TimeDistributed so it is applied to every time step."""
    return TimeDistributed(
        Conv2D(filters = filters, kernel_size = kernel_size, activation='relu',
               data_format='channels_last', padding='same'),
        **td_kwargs)

model = Sequential()

#only the first layer declares the input: 365 time steps of 15 x 14 grids with 4 channels
model.add(_td_conv(8, (3,3), input_shape=(365,15,14,4)))
model.add(_td_conv(8, (3,3)))
model.add(TimeDistributed(MaxPooling2D(pool_size = 2)))
model.add(_td_conv(16, (2,2)))
model.add(_td_conv(16, (2,2)))
model.add(TimeDistributed(MaxPooling2D(pool_size = 2)))
model.add(_td_conv(32, (2,2)))
model.add(_td_conv(32, (2,2)))

#one feature vector per time step
model.add(TimeDistributed(Flatten()))

#LSTM model with time-distributed CNN as input
#every LSTM returns its full sequence; the last sequence is flattened before the dense layer
model.add(LSTM(40, return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(LSTM(40, return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(LSTM(40, return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(Flatten())

#194 linear outputs per sample
model.add(Dense(194, activation = 'linear'))

#compile
print('Compiling model...')
#learning_rate is the current keyword for the step size (lr is the deprecated alias)
model.compile(loss=keras.losses.MSE,
              optimizer=keras.optimizers.Adam(learning_rate=0.005),
              metrics=['mae'])

model.summary()

#stop training once val_loss has not improved for 3 consecutive epochs;
#this only has an effect when fit() is given validation data
es = EarlyStopping(monitor='val_loss',
                   mode='min',
                   verbose=1,
                   patience = 3)

In [None]:
#train model

trainModel = 1

batch_size = 256
epochs = 40

#fraction of the training samples held out for validation; without validation data there is no
#val_loss, so the EarlyStopping callback (which monitors val_loss) can never trigger and
#history.history has no 'val_loss' entry for the loss plot
#keras takes the held-out samples from the end of x_train/y_train, before shuffling
validation_split = 0.2

if trainModel == 1:

  history = model.fit(x_train,y_train,
                      shuffle = True,
                      epochs = epochs,
                      batch_size = batch_size,
                      validation_split = validation_split,
                      verbose = 1,
                      callbacks = [es])

In [None]:
#save model

saveModel = 0
glacierStations = 0

if saveModel == 1:

  #number of stations = width of the model's output layer
  num_stations = model.output_shape[-1]

  #the file name records station count, architecture tag and the epochs setting
  if glacierStations == 1:
    modelTag = 'CNN_LSTM_DENSE_glacierStations_'
  else:
    modelTag = 'CNN_LSTM_DENSE_'

  modelName = str(num_stations) + '_stations_' + modelTag + str(epochs) + '_epochs'
  model.save(modelName + '.h5')

In [None]:
#load model

loadModel = 0

if loadModel == 1:

  #folder holding the saved models; kept in its own variable so the dataPath used by the
  #data-loading cell is not overwritten
  modelPath = '/content/drive/My Drive/Colab Notebooks/cnn_lstm_era/'

  #number of stations = width of the output layer of the model built above
  num_stations = model.output_shape[-1]

  if glacierStations == 1:
    modelTag = 'CNN_LSTM_DENSE_glacierStations_'
  else:
    modelTag = 'CNN_LSTM_DENSE_'

  modelName = str(num_stations) + '_stations_' + modelTag + str(epochs) + '_epochs'
  model = load_model(modelPath + modelName + '.h5')

  #model = load_model(dataPath + str(num_stations) + '_stations_LSTM_DO_LSTM_DENSE.h5')

In [None]:
#keep a copy of the current model
#clone_model rebuilds the architecture with freshly initialized weights, so the current weights
#are copied over explicitly
#NOTE(review): assumes model_bulk is meant to be a snapshot of the trained model -- confirm
model_bulk = keras.models.clone_model(model)
model_bulk.set_weights(model.get_weights())

In [None]:
#plot loss

saveIt = 0

if trainModel:

  loss = history.history['loss']
  #val_loss is only recorded when fit() was given validation data
  val_loss = history.history.get('val_loss')
  #x axis in epochs, 1-based; kept separate from the epochs setting that the
  #save/load cells put into the model file name
  epoch_axis = range(1, len(loss) + 1)

  plt.plot(epoch_axis, loss, 'y', label='Training')
  if val_loss is not None:
    plt.plot(epoch_axis, val_loss, 'r', label='Validation')
  plt.title('Training and Validation Loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend()

  if saveIt:
    plt.savefig('loss.png')

In [None]:
#predict streamflow with trained model

y_testPredict = model.predict(x_test, batch_size = 8192, verbose = 1)

#x is allocated with np.empty and never written to by the window-filling cells, so predicting
#on it directly would feed uninitialized memory to the network
#instead, the same windows (window k = entries k .. k+window-1 of x_intermediate) are built
#a chunk at a time and predicted chunk by chunk, which also avoids holding them all in memory
predict_chunk = 256
n_windows, window_len = x.shape[0], x.shape[1]

predicted_chunks = []
for chunk_start in range(0, n_windows, predict_chunk):
    chunk_stop = min(chunk_start + predict_chunk, n_windows)
    x_chunk = np.stack([x_intermediate[ii:ii+window_len] for ii in range(chunk_start, chunk_stop)])
    predicted_chunks.append(model.predict(x_chunk, batch_size = predict_chunk, verbose = 0))

y_predict = np.concatenate(predicted_chunks, axis = 0)

In [None]:
#compute NSE

#bring observations and predictions into the same (n_days x n_stations) layout:
#model.predict returns one row per sample and one column per output unit, while the targets
#may have been stacked station by station (the transpose)
pred_eval = np.asarray(y_testPredict)
obs_eval = np.asarray(y_test)
if obs_eval.shape != pred_eval.shape:
  obs_eval = obs_eval.T
assert obs_eval.shape == pred_eval.shape, 'y_test and y_testPredict do not cover the same days/stations'

num_stations = pred_eval.shape[1]
obs_per_station_test = pred_eval.shape[0]
window = 366

#overall NSE over every day and station
NSE = nse(obs_eval, pred_eval)

#rolling NSE over the station-by-station concatenation of the series (station 0's days, then station 1's, ...)
NSE_rolling = nse_rolling(obs_eval.T.ravel(), pred_eval.T.ravel(), window, stride = 365)

#one NSE per station (column)
NSE_station = [nse(obs_eval[:,kk], pred_eval[:,kk]) for kk in range(num_stations)]

#fixed two-decimal formatting (cutting str(value) to 4 characters mangles negative or very small values)
print('Overall NSE = ' + '{:.2f}'.format(NSE))
print('Mean Station NSE = ' + '{:.2f}'.format(np.mean(NSE_station)))
print('Median Station NSE = ' + '{:.2f}'.format(np.median(NSE_station)))

In [None]:
#visualize stations' performance: distribution of the per-station NSE values in 5 bins

sns.set(color_codes=True)
#NOTE(review): distplot is deprecated in newer seaborn releases (histplot/displot replace it);
#kept here because the installed seaborn version is not known
ax = sns.distplot(NSE_station, bins = 5)
ax.set_xlabel('Station NSE')
ax.set_ylabel('Density')
ax.set_title('Distribution of Station NSE')
plt.show()

In [None]:
#plot model vs observed scatter plot

saveIt = 0

#pair every observation with the prediction for the same day and station: model.predict returns
#(n_days x n_stations), while the targets may have been stacked station by station (the transpose);
#scatter flattens its inputs, so both must share one layout
pred_scatter = np.asarray(y_testPredict)
obs_scatter = np.asarray(y_test)
if obs_scatter.shape != pred_scatter.shape:
  obs_scatter = obs_scatter.T

plt.figure(figsize = (8,8))

plt.scatter(obs_scatter.ravel(), pred_scatter.ravel(), alpha = 0.1)
plt.xlabel('observation')
plt.ylabel('model')
plt.title('Model Results: NSE = ' + '{:.2f}'.format(NSE))

if saveIt:
  plt.savefig('obs_vs_model.png')

In [None]:
#plot time series of model and observations 

saveIt = 0

#build one long series per variable, station after station (station 0's days, then station 1's, ...):
#model.predict returns (n_days x n_stations), while the targets may have been stacked station by
#station (the transpose), so both are first brought to (n_days x n_stations)
pred_series = np.asarray(y_testPredict)
obs_series = np.asarray(y_test)
if obs_series.shape != pred_series.shape:
  obs_series = obs_series.T
obs_series = obs_series.T.ravel()
pred_series = pred_series.T.ravel()

plt.figure(figsize = (12,8))

#top panel: observed vs modelled flow, zoomed to a fixed slice of the concatenated series
plt.subplot(2,1,1)
plt.plot(obs_series, label = 'Observed')
plt.plot(pred_series, label = 'Modelled')
plt.legend()
plt.xlabel('Day')
plt.ylabel('Normalized Streamflow')
plt.title('Model Results: NSE = ' + '{:.2f}'.format(NSE))
plt.xlim((4*365*4,4*365*5))
plt.ylim((0,3))

#bottom panel: rolling NSE, one point per window, zoomed to windows 16-19
plt.subplot(2,1,2)
plt.scatter(range(len(NSE_rolling)),NSE_rolling)
plt.ylim((0,1))
plt.xlim((15.5,19.5))
plt.xlabel('Window index')
plt.ylabel('NSE')
plt.title('Rolling NSE: Window = ' + str(window) + ' Days')

plt.tight_layout()

if saveIt:
  plt.savefig('modelled_time_series.png')

plt.show()