In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.optimizers import Adam
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import pandas as pd
import datetime as dt

from sklearn.preprocessing import StandardScaler

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

**Data manipulation functions**

In [2]:
def format_multiple_bins_for_LSTM(
    dataframe,
    target_column_index,
    history_size,
    target_size, 
    step
):

    data = []
    labels = []
    
    spatial_bins = dataframe.groupby(['lat', 'lon'])
    
    for bin_name, spatial_bin in spatial_bins:
        
        spatial_bin = spatial_bin.sort_index()
        
        spatial_bin = np.array(spatial_bin.values)
        target = spatial_bin[:, target_column_index]
        
        bin_data = []
        bin_labels = []
    
        start_index = history_size
        end_index = len(spatial_bin) - target_size

        for i in range(start_index, end_index):
            indices = range(i - history_size, i, step)
            bin_data.append(spatial_bin[indices])
            bin_labels.append(target[i + target_size])

        data.append(np.array(bin_data))
        labels.append(np.array(bin_labels))

    return data, labels

In [3]:
data_file = '../data/training_data/1992-2015_training_data_added_features.csv'

# Datatypes for dataframe loading
dtypes = {
    'lat': float,
    'lon': float,
    'weather_bin_year': int,
    'weather_bin_month': int,
    'weather_bin_day': int,
    'air.2m': float,
    'apcp': float,
    'rhum.2m': float,
    'dpt.2m': float,
    'pres.sfc': float,
    'uwnd.10m': float,
    'vwnd.10m': float,
    'veg': float,
    'vis': float,
    'ignition': float,
    'mean.air.2m': float,
    'mean.apcp': float,
    'mean.rhum.2m': float,
    'mean.dpt.2m': float,
    'mean.pres.sfc': float,
    'mean.uwnd.10m': float,
    'mean.vwnd.10m': float,
    'mean.veg': float,
    'mean.vis': float,
    'max.air.2m': float,
    'max.apcp': float,
    'max.rhum.2m': float,
    'max.dpt.2m': float,
    'max.pres.sfc': float,
    'max.uwnd.10m': float,
    'max.vwnd.10m': float,
    'max.veg': float,
    'max.vis': float,
    'min.air.2m': float,
    'min.apcp': float,
    'min.rhum.2m': float,
    'min.dpt.2m': float,
    'min.pres.sfc': float,
    'min.uwnd.10m': float,
    'min.vwnd.10m': float,
    'min.veg': float,
    'min.vis': float,
    'total_fires': float

}

# Features to use during training 
features = [
    'lat',
    'lon',
    'weather_bin_year',
    'weather_bin_month',
    'weather_bin_day',
    'veg',
    'ignition',
    'mean.air.2m',
    'mean.apcp',
    'mean.rhum.2m',
    'mean.dpt.2m',
    'mean.pres.sfc',
    'mean.uwnd.10m',
    'mean.vwnd.10m',
    'mean.veg',
    'mean.vis',
    'total_fires'
]

features_to_scale = [
    'weather_bin_year',
    'weather_bin_month',
    'veg',
    'mean.air.2m',
    'mean.apcp',
    'mean.rhum.2m',
    'mean.dpt.2m',
    'mean.pres.sfc',
    'mean.uwnd.10m',
    'mean.vwnd.10m',
    'mean.vis',
    'total_fires'
]

In [4]:
raw_data = pd.read_csv(data_file, index_col=0, parse_dates=True, dtype=dtypes)

In [5]:
raw_data.head()

Unnamed: 0,lat,lon,weather_bin_year,weather_bin_month,weather_bin_day,air.2m,apcp,rhum.2m,dpt.2m,pres.sfc,...,min.air.2m,min.apcp,min.rhum.2m,min.dpt.2m,min.pres.sfc,min.uwnd.10m,min.vwnd.10m,min.veg,min.vis,total_fires
1992-05-21,32.68389,-117.1809,1992,5,21,292.07899,0.019531,77.730052,287.287171,100208.78575,...,287.399184,0.0,44.981636,285.914549,100190.64825,-2.124688,-0.066462,15.6,9910.28,1592.0
1992-07-02,32.68389,-117.1809,1992,7,2,292.275739,0.000757,75.398328,287.056254,100366.05925,...,287.689669,0.0,46.80591,286.502401,100122.39125,-0.49009,-0.863408,15.6,9742.057125,1592.0
1992-07-30,32.68389,-117.1809,1992,7,30,294.410629,0.003906,76.250808,289.455551,100466.27125,...,289.904944,0.0,50.541946,289.185196,100422.2775,-0.668172,0.388485,15.6,17817.148625,1592.0
1993-03-11,32.68389,-117.1809,1993,3,11,287.692444,0.00024,86.818202,285.193045,100778.05575,...,283.378929,0.0,63.503744,282.942706,100647.46825,-2.139523,-3.294209,15.7,8705.802,1592.0
1993-04-19,32.68389,-117.1809,1993,4,19,289.10293,0.001942,82.173999,285.493621,100517.83125,...,284.245581,0.0,50.257054,284.049155,100189.34375,-1.8477,-3.232248,15.7,8783.02375,1592.0


In [6]:
# Pull out columns of intrest
data = raw_data[features]

In [7]:
# Sort by date time index
# one_bin_training_data = one_bin_training_data.sort_index()
data = data.sort_index()

In [8]:
# Drop unnecessary columns
# one_bin_training_data = one_bin_training_data.drop(['lat', 'lon', 'weather_bin_day'], axis=1)
data = data.drop(['weather_bin_day'], axis=1)

In [9]:
# Scale data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[features_to_scale])
data[features_to_scale] = scaled_features

In [10]:
# Split data up into training, testing and validation sets
test_data = data.tail(int(len(data)*0.1))
leftover_data = data.iloc[:-int(len(data)*0.1)]

validation_data = data.tail(int(len(leftover_data)*0.3))
training_data = data.iloc[:-int(len(leftover_data)*0.3)]

In [11]:
target_column_index = 5
past_history = 3
future_target = 1
step = 1

x_train, y_train = format_multiple_bins_for_LSTM(
    training_data, 
    target_column_index, 
    past_history,
    future_target, 
    step,
)

x_validation, y_validation = format_multiple_bins_for_LSTM(
    validation_data, 
    target_column_index, 
    past_history,
    future_target, 
    step,
)

x_test, y_test = format_multiple_bins_for_LSTM(
    test_data, 
    target_column_index, 
    past_history,
    future_target, 
    step,
)

In [12]:
sample_sizes = []

for sample in y_train:
    sample_sizes.append(len(sample))
    
smallest_sample = min(sample_sizes)

y_train_reshaped = []

for i in range(smallest_sample):
    y = []
    for j in range(len(y_train)):
        try:
            y.append(y_train[j][i])
        except:
            print("Index out of range")
    
    y_train_reshaped.append(np.array(y))
    
trimmed_x_training = []    
    
for sample in x_train:
    trimmed_sample = sample[-smallest_sample:,:]
    trimmed_x_training.append(trimmed_sample)

In [13]:
sample_sizes = []

for sample in y_validation:
    sample_sizes.append(len(sample))
    
smallest_sample = min(sample_sizes)
y_validation_reshaped = []

for i in range(smallest_sample):
    y = []
    for j in range(len(y_validation)):
        try:
            y.append(y_validation[j][i])
        except:
            print("Index out of range")
    
    y_validation_reshaped.append(np.array(y))
    
trimmed_x_validation = []    
    
for sample in x_validation:
    trimmed_sample = sample[-smallest_sample:,:]
    trimmed_x_validation.append(trimmed_sample)

In [14]:
sample_sizes = []

for sample in y_test:
    sample_sizes.append(len(sample))
    
smallest_sample = min(sample_sizes)
y_test_reshaped = []

for i in range(smallest_sample):
    y = []
    for j in range(len(y_test)):
        try:
            y.append(y_test[j][i])
        except:
            print("Index out of range")
    
    y_test_reshaped.append(np.array(y))
    
trimmed_x_test = []    
    
for sample in x_test:
    trimmed_sample = sample[-smallest_sample:,:]
    trimmed_x_test.append(trimmed_sample)

In [15]:
x_train_sample = trimmed_x_training[:2]
y_train_sample = np.array(y_train_reshaped)[:,:2]

x_train_a = x_train_sample[0]
x_train_b = x_train_sample[1]

In [16]:
x_validation_sample = trimmed_x_validation[:2]
y_validation_sample = np.array(y_validation_reshaped)[:,:2]

x_validation_a = x_validation_sample[0]
x_validation_b = x_validation_sample[1]

In [17]:
x_train_sample = trimmed_x_training[:2]
y_train_sample = np.array(y_train_reshaped)[:,:2]

x_train_a = x_train_sample[0]
x_train_b = x_train_sample[1]

In [18]:
lstm_units = 30
dense_units = 30
l2_lambda = 0.1
learning_rate = 0.1

initial_bias = -1.4
output_bias = tf.keras.initializers.Constant(initial_bias)

weight_for_0 = 0.5 
weight_for_1 = 13

class_weight = {0: weight_for_0, 1: weight_for_1}

evaluation_interval = 100
epochs = 10

metrics = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

In [19]:
input_a = keras.Input(shape=x_train_a.shape[-2:])
input_b = keras.Input(shape=x_train_a.shape[-2:])

LSTM_a = keras.layers.LSTM(lstm_units)(input_a)
LSTM_b = keras.layers.LSTM(lstm_units)(input_b)

merged = keras.layers.concatenate([LSTM_a, LSTM_b])

hidden_output = keras.layers.Dense(dense_units)(merged)

output = keras.layers.Dense(2)(hidden_output)

model = keras.Model(inputs=[input_a, input_b], outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(lr=learning_rate), 
    loss=keras.losses.BinaryCrossentropy(),
    #metrics=metrics
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3, 16)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 3, 16)]      0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 30)           5640        input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 30)           5640        input_2[0][0]                    
______________________________________________________________________________________________

In [20]:
print(x_train_a.shape)
print(x_train_b.shape)
print(y_train_sample.shape)

(6395, 3, 16)
(6395, 3, 16)
(6395, 2)


In [21]:
print(x_validation_a.shape)
print(x_validation_b.shape)
print(y_validation_sample.shape)

(2362, 3, 16)
(2362, 3, 16)
(2362, 2)


In [22]:
model.fit(
    [x_train_a, x_train_b], 
    y_train_sample,
    #batch_size=64, 
    epochs=5,
    validation_data=([x_validation_a, x_validation_b], y_validation_sample),
    class_weight=class_weight
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 6395 samples, validate on 2362 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f66a3e19da0>