## Model Training
#### (Use this notebook to train the neural network)

In [1]:
#import libraries
import pandas as pd 
from datetime import datetime, timedelta
from glob import glob 
import tensorflow as tf
from keras import Input, Model
from keras.layers import Dense
import plotly.express as px

In [2]:
#get pressure data
df = []
for f in glob('data/tank_header_pressure_*.csv'):
    df.append(pd.read_csv(f))

df = pd.concat(df)
df.drop(columns=["TagType"], inplace=True)
df.timestamp = pd.to_datetime(df.timestamp)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2246256 entries, 0 to 246255
Data columns (total 3 columns):
 #   Column        Dtype         
---  ------        -----         
 0   timestamp     datetime64[ns]
 1   FACILITY_ID   int64         
 2   pressure_osi  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 68.6 MB


In [3]:
# known data to build training set from
knowns = [
    (10085941,[("2021-10-19 22:42:00","2021-10-28 14:57:00")]),
    (10085510,[("2022-02-14 16:43:21","2022-05-13 15:28:13")]),
    (10086098,[("2022-07-24 21:12:00","2022-09-21 16:57:54")]),
    (10085544,[("2022-10-13 20:40:25","2022-11-27 16:55:14")]),
    (10111756,[("2022-11-12 12:00:00","2023-02-23 12:00:00")]),
    (10085464,[("2022-10-13 00:00:01","2022-10-13 00:00:00")]),#not open
    (10085694,[("2022-10-13 00:00:01","2022-11-27 00:00:00")]),#not open
    (10086083,[("2022-11-23 23:33:29","2022-12-09 14:01:30")]),
    (10085465,[("2022-05-01 07:23:24","2022-06-09 09:10:06"),("2022-10-05 06:58:31","2022-12-23 22:49:09")]),
    (10167456,[("2022-10-13 00:00:01","2022-11-27 00:00:00")]),#not open
    (10086084,[("2021-11-28 17:35:41","2022-01-27 05:04:22"),("2022-06-02 18:53:01","2022-06-23 19:44:12")]),
    (10085615,[("2022-10-13 00:00:01","2022-11-27 00:00:00")]),#not open
    (10094669,[("2023-01-31 06:04:39","2023-02-02 17:42:06"),("2023-02-11 01:24:57","2023-02-11 13:42:51")]),
    (10085637,[("2022-10-13 00:00:01","2022-11-27 00:00:00")]),#not open
]

In [4]:
#build training dataframe
COLUMNS = ["isOpen", "facilityMean", "facilitySD", "localMean", "localSD"]

all_data = []
for facility_id, times in knowns:
    #filter dataframe by specific facility id
    df_facility = df[df.FACILITY_ID == facility_id].copy()
    #time the hatch was opened and closed
    openTimes = []
    for open, closed in times:
        openTimes.append((pd.to_datetime(pd.Timestamp(open)), pd.to_datetime(pd.Timestamp(closed))))
    #get facility info
    facilityMean = df_facility["pressure_osi"].mean()
    facilitySD = df_facility["pressure_osi"].std()
    #time to start getting data
    tStart = df_facility["timestamp"].min().round("4H") + timedelta(days=2)
    tEnd = df_facility["timestamp"].max().round("4H")
    #how often to add a new datapoint
    tFreq = timedelta(hours=4)

    t = tStart
    while t < tEnd:
        #get local info
        isOpen = False
        for open, closed in openTimes:
            if open < t < closed: isOpen = True
        #data only from last 2 days
        dfLocal = df_facility[(df_facility.timestamp > t-timedelta(days=2))&
                              (df_facility.timestamp < t)]
        localMean = dfLocal["pressure_osi"].mean()
        localSD = dfLocal["pressure_osi"].std()

        #append a new datapoint
        all_data.append((isOpen, facilityMean, facilitySD, localMean, localSD))

        t += tFreq

#convert the list of datapoints to a dataframe
df_all_data = pd.DataFrame(all_data, columns=COLUMNS)
#drop any rows with a NaN
df_all_data.dropna(inplace=True)

df_all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21001 entries, 0 to 21082
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   isOpen        21001 non-null  bool   
 1   facilityMean  21001 non-null  float64
 2   facilitySD    21001 non-null  float64
 3   localMean     21001 non-null  float64
 4   localSD       21001 non-null  float64
dtypes: bool(1), float64(4)
memory usage: 840.9 KB


In [5]:
#convert the dataframe to a tensorflow dataset
text = df_all_data[COLUMNS[1:]]
targets = df_all_data[COLUMNS[0]]
dataset_all = tf.data.Dataset.from_tensor_slices((text, targets))
dataset_all

<TensorSliceDataset element_spec=(TensorSpec(shape=(4,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.bool, name=None))>

In [6]:
VALIDATION_SET_SIZE = 0.2
BATCH_SIZE = 8
LEN = len(dataset_all)

#shuffle dataset
dataset_all = dataset_all.shuffle(LEN)

#split dataset into training and validation datasets
split = int(LEN * VALIDATION_SET_SIZE)
dataset_training = dataset_all.skip(split).take(LEN-split)
dataset_validation = dataset_all.take(split)

#batch datasets
dataset_training = dataset_training.batch(BATCH_SIZE)
dataset_validation = dataset_validation.batch(BATCH_SIZE)

print("Training set:", dataset_training)
print("Training set size:", len(dataset_training))
print("Validation set:", dataset_validation)
print("Validation set size:", len(dataset_validation))

Training set: <BatchDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.bool, name=None))>
Training set size: 2101
Validation set: <BatchDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.bool, name=None))>
Validation set size: 525


In [7]:
#function used to determine model accuracy (lower is better)
def error(true, pred):
    #diff is the exact difference between true and predicted
    true = tf.cast(true, float)
    diff = tf.subtract(true, pred)
    diff = tf.abs(diff)
    #incentive punishes the model 3x more for false negative
    incentive = tf.subtract(true, pred)
    incentive = tf.multiply(incentive, true)
    incentive = tf.multiply(incentive, 4)
    diff = tf.add(diff, incentive)
    return diff

In [8]:
tf.keras.backend.clear_session()

#set up the neural network
inputs = Input(shape=(4,))                      # input layer
x = Dense(8)(inputs)                            # dense layer
outputs = Dense(1, activation="sigmoid")(x)     # output layer

model = Model(inputs, outputs)

#compile the neural network
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001), loss=error)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 8)                 40        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 49
Trainable params: 49
Non-trainable params: 0
_________________________________________________________________


In [9]:
#train the neural network
history = model.fit(dataset_training, validation_data=dataset_validation, epochs=30)

#plot the model's progress
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'Epoch', 'value': 'Error'}, 
    title='Training History')
fig

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [10]:
#verify model on a known test case and 4 known times
test_facility = 10085460
test_times = ['2022-08-17 15:14:00','2022-08-23 12:00:00','2022-09-05 19:45:00','2022-07-01 02:03:00']
df_460 = df[df.FACILITY_ID == test_facility].copy()
facilityMean = df_460["pressure_osi"].mean()
facilitySD = df_460["pressure_osi"].std()

preds = []
for t in test_times:
    time = pd.to_datetime(pd.Timestamp(t))
    dfLocal = df_460[(df_460.timestamp > time-timedelta(days=1))&(df_460.timestamp < time+timedelta(days=1))]
    localMean = dfLocal["pressure_osi"].mean()
    localSD = dfLocal["pressure_osi"].std()
    
    testInput = tf.constant([[facilityMean, facilitySD, localMean, localSD]])
    preds.append(model.predict(testInput)[0][0].round(20))

print("this should be close to 1 ->", preds[0])
print("this should be close to 1 ->", preds[1])
print("this should be close to 0 ->", preds[2])
print("this should be close to 0 ->", preds[3])

this should be close to 1 -> 0.9988559
this should be close to 1 -> 0.9988765
this should be close to 0 -> 1.4796717e-06
this should be close to 0 -> 5.633556e-05


In [11]:
#save the model to 'models/model'
model.save("models/model")

INFO:tensorflow:Assets written to: models/model\assets
