In [1]:
#import modules
import numpy as np
import pandas as pd
import time
from keras import backend as K 
from keras.engine.training import Model
from tensorflow.keras.utils import to_categorical, plot_model
from keras.models import Sequential, load_model
from sklearn.preprocessing import MinMaxScaler
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers import Dropout, Activation, Flatten
from keras.layers import LSTM, Dense, Input
from keras.optimizers import SGD
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [3]:
#import data
df = pd.read_csv('./Sub0-RAW.csv')
df.head()

Unnamed: 0,Dates,Year,Month,Day,Hours,Precipitation,Temperature,Outflow
0,10/1/1980,1980,10,1,0:00:00,0.0,102.992,5.0
1,10/1/1980,1980,10,1,1:00:00,0.0,97.79,5.0
2,10/1/1980,1980,10,1,2:00:00,0.0,92.588,5.0
3,10/1/1980,1980,10,1,3:00:00,0.0,87.404,5.0
4,10/1/1980,1980,10,1,4:00:00,0.0,85.172,5.0


In [4]:
#define train and label column
train_cols = ["Outflow"]
label_cols = ["Outflow(t+1)"]

In [5]:
#filter the columns that are used for training and testing
filtered_df = df.filter(["Precipitation","Temperature","Outflow"], axis=1)

In [6]:
#To convert from hourly data to daily data
oflow = filtered_df['Outflow'].tolist()
tem = filtered_df['Temperature'].tolist()
precp = filtered_df['Precipitation'].tolist()

#Summing up the outflow data
lengthOfData = len(oflow)

#summing 24 hours outflow
totalOutflow = []
for i in range(0, lengthOfData, 24):
  totalOutflow.append(sum(oflow[i:i+24]))

#averaging 24 hours temperature
averageTemperature = []
for i in range(0, lengthOfData, 24):
  averageTemperature.append((sum(tem[i:i+24]))/24)

#summing 24 hours precipitation
totalPrecipitation = []
for i in range(0, lengthOfData, 24):
  totalPrecipitation.append(sum(precp[i:i+24]))

In [7]:
#new dataframe for daily data
new_df = pd.DataFrame({'Precipitation': totalPrecipitation, 'Temperature': averageTemperature, 
                       'Outflow': totalOutflow}, columns=['Precipitation', 'Temperature', 'Outflow'])

In [8]:
new_df['ExtremeOrNot'] = np.where(new_df['Outflow'] > 5000, 1, 0)

In [9]:
#dataframe for network 1 without extreme values
df_net1 = new_df[new_df['Outflow'] <= 5000]

In [10]:
#label of network for discriminator network
labels_discriminator = ["ExtremeOrNot"]

In [11]:
#Min Max scalar normalizing
xtrain_min_max_scaler = MinMaxScaler(feature_range = (0, 1))
ytrain_min_max_scaler = MinMaxScaler(feature_range = (0, 1))
xtest_min_max_scaler = MinMaxScaler(feature_range = (0, 1))
ytest_min_max_scaler = MinMaxScaler(feature_range = (0, 1))
#Min Max scalar normalizing for extreme values
xtrain_min_max_scaler_ext = MinMaxScaler(feature_range = (0, 1))
ytrain_min_max_scaler_ext = MinMaxScaler(feature_range = (0, 1))
xtest_min_max_scaler_ext = MinMaxScaler(feature_range = (0, 1))
ytest_min_max_scaler_ext = MinMaxScaler(feature_range = (0, 1))

In [12]:
#function to shift the time_series data for getting labels
def lag_seq(df, n_seq):
    for i in range(n_seq):
        df['Outflow(t+%d)' %(i+1)] = new_df['Outflow'].shift(-(i+1))
    return df

#calling function to create lag dataframe for network 3
lag_df = lag_seq(new_df, 1)
lag_df.dropna(inplace=True)

#calling function to create lag dataframe for network 1
lag_df_net1 = lag_seq(df_net1, 1)
lag_df_net1.dropna(inplace=True)

#Splitting training and test data for network 3
df_train, df_test = train_test_split(lag_df, train_size=0.8, test_size=0.2, shuffle=False)
#splitting training and test data for network 1
df_train_net1, df_test_net1 = train_test_split(lag_df_net1, train_size=0.8, test_size=0.2, shuffle=False)
# train data for network1
x_train1 = df_train_net1.loc[:,train_cols].values
y_train1 = df_train_net1.loc[:,label_cols].values
#train for network3
x_train3 = df_train.loc[:,train_cols].values
y_train3 = df_train.loc[:,labels_discriminator].values  #for discriminator
#test data for network 1
x_test1 = df_test_net1.loc[:,train_cols].values
y_test1 = df_test_net1.loc[:,label_cols].values 
#test data for network3
x_test3 = df_test.loc[:,train_cols].values
y_test3 = df_test.loc[:,labels_discriminator].values    #for discriminator

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [13]:
# building timeseries data with given timesteps
def timeseries(X, Y, Y_actual, time_steps, out_steps):
    input_size_0 = X.shape[0] - time_steps
    input_size_1 = X.shape[1]
    X_values = np.zeros((input_size_0, time_steps, input_size_1))
    Y_values = np.zeros((input_size_0,))
    Y_values_actual = np.zeros((input_size_0,))
    
    for i in tqdm_notebook(range(input_size_0)):
        X_values[i] = X[i:time_steps+i]
        Y_values[i] = Y[time_steps+i-1, 0]
        Y_values_actual[i] = Y_actual[time_steps+i-1, 0]
        
    print("length of time-series i/o",X_values.shape,Y_values.shape)
    return X_values, Y_values, Y_values_actual

In [14]:
# checking mse for train set that are greater than threshold
x_train = df_train.loc[:,train_cols].values
y_train = df_train.loc[:,label_cols].values
x_test = df_test.loc[:,train_cols].values
y_test = df_test.loc[:,label_cols].values

threshold = 5000
timesteps = 18
train_values_above_thres = []
train_labels_above_theres = []

for i in range(timesteps, x_train.shape[0]):
    if (y_train[i] > threshold):
        train_values_above_thres.append(x_train[i-timesteps:i])
        train_labels_above_theres.append(y_train[i]) 
        
X_train_abv_thres = np.array(train_values_above_thres)
Y_train_abv_thres = np.array(train_labels_above_theres)

tsamples, ta, tb = X_train_abv_thres.shape
x_train_for_normalization = X_train_abv_thres.reshape((tsamples,ta*tb)) # since normalization requires 2d array
x_train_for_normalization.shape

X_Train_abv_thres = xtrain_min_max_scaler_ext.fit_transform(x_train_for_normalization)
Y_Train_abv_thres = ytrain_min_max_scaler_ext.fit_transform(Y_train_abv_thres)

X_Train_abv_thres = X_Train_abv_thres.reshape((tsamples, ta, tb))

In [15]:
# checking mse for test set that are greater than threshold
test_values_above_thres = []
test_labels_above_theres = []

for i in range(timesteps, x_test.shape[0]):
    if (y_test[i] > threshold):
        test_values_above_thres.append(x_test[i-timesteps:i])
        test_labels_above_theres.append(y_test[i]) 
        
X_test_abv_thres = np.array(test_values_above_thres)
Y_test_abv_thres = np.array(test_labels_above_theres)

tsamples, ta, tb = X_test_abv_thres.shape
x_test_for_normalization = X_test_abv_thres.reshape((tsamples,ta*tb)) # since normalization requires 2d array
x_test_for_normalization.shape

X_Test_abv_thres = xtest_min_max_scaler_ext.fit_transform(x_test_for_normalization)
Y_Test_abv_thres = ytest_min_max_scaler_ext.fit_transform(Y_test_abv_thres)

X_Test_abv_thres = X_Test_abv_thres.reshape((tsamples, ta, tb))

In [16]:
#data for Network1
def data_processing(x_train, y_train, x_test, y_test): 
   
    #Normalizing training data
    x_train_nor = xtrain_min_max_scaler.fit_transform(x_train)
    y_train_nor = ytrain_min_max_scaler.fit_transform(y_train)

    # Normalizing test data
    x_test_nor = xtest_min_max_scaler.fit_transform(x_test)
    y_test_nor = ytest_min_max_scaler.fit_transform(y_test)
    
    # Saving actual train and test y_label
    y_train_actual = y_train
    y_test_actual = y_test
    
    #Building timeseries
    X_Train, Y_Train, Y_train_actual = timeseries(x_train_nor, y_train_nor, y_train_actual, time_steps=18, out_steps=1)
    X_Test, Y_Test, Y_test_actual = timeseries(x_test_nor, y_test_nor, y_test_actual, time_steps=18, out_steps=1)
    
    return X_Train, Y_Train, Y_train_actual, X_Test, Y_Test, Y_test_actual

In [40]:
# data for Network3

#Normalizing training data
   
x_train_nor = xtrain_min_max_scaler.fit_transform(x_train3)

# Normalizing test data
x_test_nor = xtest_min_max_scaler.fit_transform(x_test3)

#SMOTE algorithm
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_sample(x_train_nor, y_train3.ravel())
X_test_res, y_test_res = sm.fit_sample(x_test_nor, y_test3.ravel())

# Saving actual train and test y_label
y_train_actual = y_train_res
y_test_actual = y_test_res


In [41]:
y_train3.shape

(6184, 1)

In [42]:
sum(y_train_res == 0)

6094

In [43]:
time_steps = 18
input_size_0 = X_train_res.shape[0] - time_steps
input_size_1 = X_train_res.shape[1]

In [44]:
X_values = np.zeros((input_size_0, time_steps, input_size_1))
Y_values = np.zeros((input_size_0,))
Y_values_actual = np.zeros((input_size_0,))

In [45]:
for i in tqdm_notebook(range(input_size_0)):
    X_values[i] = X_train_res[i:time_steps+i]
    Y_values[i] = y_train_res[time_steps+i-1]
    Y_values_actual[i] = y_train_actual[time_steps+i-1]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12170.0), HTML(value='')))




In [46]:
Y_values.shape

(12170,)

In [47]:
y_train3.shape

(6184, 1)