 This notebook loads a previously trained model and uses it to generate predictions for new input data.

In [1]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing
from keras.models import load_model
from datafunctions import convert_data, normalise, gen_sequence

# Seed NumPy's global RNG so NumPy-based randomness in this notebook is repeatable.
np.random.seed(1010)
# NOTE(review): this only binds a Python variable named PYTHONHASHSEED; it does
# not set the PYTHONHASHSEED environment variable, which has to be exported
# before the interpreter starts to influence hash randomisation. The name is
# kept so that any other cell referring to it keeps working.
PYTHONHASHSEED = 0

model_path = "./finfanOUT/bin_model.h5"

# Load the trained model. Fail right here with a clear message when the file is
# missing, instead of silently skipping the load and hitting a NameError on
# `estimator` in a later cell.
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"Trained model not found at {model_path!r}")
estimator = load_model(model_path)


 Set up the data that predictions will be generated for.

In [2]:
# Settings in this cell should mirror the ones the model was originally trained with.
sequence_length = 50
minmax_scaler = preprocessing.MinMaxScaler()

# Columns fed to the model: the non-sensor variables first, then every sensor column.
sensor_cols = ["speed"]
sequence_cols = ["status", *sensor_cols]

# Tab-separated, header-less file holding the data to predict on.
test_df2 = pd.read_csv(
    "./finfanIn/Data_Extract_Train2.txt",
    sep="\t",
    header=None,
    low_memory=False,
)
test_dfs = [test_df2]

In [3]:
# Convert every raw dataframe in `test_dfs` and merge the results into a single
# dataframe. Per the original notes for this cell, `convert_data` puts a frame
# into the expected format (cleaning it, merging the time columns and tagging it
# with an id column -- here the frame's position in `test_dfs`), and `normalise`
# rescales it with the given scaler (min-max, i.e. into the 0..1 range).
# NOTE(review): the same `minmax_scaler` instance is handed to `normalise` for
# every frame; whether it is re-fitted on each call is not visible here -- confirm
# it matches how the model's training data was scaled.
if not test_dfs:
    # Without this guard an empty list would only surface as an obscure error in
    # a later cell.
    raise ValueError("test_dfs is empty: there is no data to generate predictions on")

# Build all frames first and concatenate once; growing a dataframe with
# pd.concat inside a loop re-copies the accumulated rows on every iteration.
normalised_frames = [
    normalise(convert_data(raw_frame, frame_id), minmax_scaler)
    for frame_id, raw_frame in enumerate(test_dfs)
]
test_df = pd.concat(normalised_frames, copy=False)
print(test_df)


                      time   speed  status  id
0      2018-02-08 11:37:30  2086.0     0.0   0
1      2018-02-08 11:38:30  2085.0     0.0   0
2      2018-02-08 11:39:30  2086.0     0.0   0
3      2018-02-08 11:40:30  2085.0     0.0   0
4      2018-02-08 11:42:30  2086.0     0.0   0
...                    ...     ...     ...  ..
289615 2018-07-29 07:12:58    -3.0     1.0   0
289616 2018-07-29 15:04:29    -3.0     1.0   0
289617 2018-07-30 06:47:29    -3.0     1.0   0
289618 2018-07-30 09:36:29    -3.0     1.0   0
289619 2018-07-30 09:38:59    -3.0     1.0   0

[289620 rows x 4 columns]
                       time     speed  status  id  time_norm
0       1518089850000000000  0.993343     0.0   0   0.182789
1       1518089910000000000  0.992867     0.0   0   0.182793
2       1518089970000000000  0.993343     0.0   0   0.182796
3       1518090030000000000  0.992867     0.0   0   0.182799
4       1518090150000000000  0.993343     0.0   0   0.182806
...                     ...       ...     .

In [4]:
# Build the model input from the formatted data above: for every distinct id,
# collect the chunks that gen_sequence yields for that id's rows, then join all
# of them into one float32 array.
unit_ids = test_df["id"].unique()
seq_gen = (
    list(
        gen_sequence(
            test_df[test_df["id"] == unit_id],
            sequence_length,
            sequence_cols,
        )
    )
    for unit_id in unit_ids
)

per_unit_sequences = list(seq_gen)
seq_array = np.concatenate(per_unit_sequences).astype(np.float32)

In [5]:
# Run the model over every input sequence. Each prediction is a number between
# 0 and 1: the model's confidence that the sequence lies within the window
# until failure.
prediction_batch_size = 200
predictions = estimator.predict(
    seq_array,
    batch_size=prediction_batch_size,
    verbose=1,
)
print(predictions)


[[0.14063083]
 [0.1406382 ]
 [0.14063159]
 ...
 [0.02087164]
 [0.02087165]
 [0.02087165]]
