# LSTM Multiphase Model Training

Note: Make sure to run this notebook inside the project's virtualenv.

## Loading the Data
The code below loads data and labels from `/research/rih-cs/datasets/elvo-multiphase`.

The data for each phase is stored under `/research/rih-cs/datasets/elvo-multiphase/preprocessed`.

In [1]:
import os
import pathlib
import typing

import numpy as np

In [2]:
import  logging

def configure_logger():
    """Configure the root logger to emit INFO-level records through a stream handler.

    The function is idempotent: the handler it installs is tagged with a
    fixed name, and a second call detects that handler and returns without
    adding another one. Without this guard, re-running the cell (or calling
    the function twice) would make every log record print once per call.
    """
    handler_name = 'notebook-stream-handler'

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Skip installation when our own handler is already attached.
    if any(h.get_name() == handler_name for h in root_logger.handlers):
        return

    handler = logging.StreamHandler()
    handler.set_name(handler_name)
    formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

## Train / Test / Val Split
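We iterate through the `pos` directory of phase1 and record, for each file, its position in the sorted listing under the train, test, or validation split it belongs to. Negative samples are not matched by name; later cells select them by fixed position ranges in the sorted `neg` listings.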


In [3]:
# Identifiers assigned to the training split. load_training_data() compares each
# entry against the file name (extension stripped) of the files in phase1/pos/.
TRAIN_DATA = ['P25', 'P48', 'P62', 'P72', 'P144', 'P149', 'P1', 'P4', 'P16', 'P21', 'P32', \
'P36', 'P38', 'P52', 'P59', 'P88', 'P89', 'P118', 'P164', 'P232', 'P255', 'P266', 'P280', \
'P289', 'P73', 'P78', 'P120', 'P142', 'P126', 'P145', 'P147', 'P3', 'P5', 'P6', 'P15', \
'P17', 'P22', 'P28', 'P29', 'P34', 'P57', 'P58', 'P61', 'P66', 'P68', 'P70', 'P77', 'P80', \
'P85', 'P87', 'P94', 'P102', 'P106', 'P107', 'P110', 'P125', 'P127', 'P130', 'P134', 'P135', \
'P141', 'P150', 'P152', 'P153', 'P158', 'P163', 'P166', 'P179', 'P180', 'P181', 'P182', 'P185', \
'P207', 'P209', 'P210', 'P216', 'P218', 'P222', 'P224', 'P225', 'P231', 'P8', 'P13', 'P18', \
'P24', 'P33', 'P40', 'P43', 'P44', 'P47', 'P51', 'P53', 'P56', 'P63', 'P67', 'P69', 'P81', \
'P100', 'P101', 'P111', 'P117', 'P124', 'P146', 'P168', 'P184', 'P187', 'P188', 'P208', 'P212', \
'P248', 'P112', 'P2', 'P10', 'P20', 'P26', 'P46', 'P60', 'P79', 'P93', 'P95', 'P98', 'P116', 'P121', \
'P136', 'P143', 'P148', 'P160', 'P189', 'P203', 'P71', 'P97', 'P140', 'P84', 'P92', 'P131', 'P7', \
'P42', 'P129', 'P137', 'P154', 'P159', 'P176', 'P201', 'P213', 'P9', 'P11', 'P12', 'P19', 'P23', \
'P27', 'P30', 'P31', 'P35', 'P39', 'P45', 'P54', 'P55', 'P64', 'P65', 'P74', 'P91', 'P96', 'P99', \
'P104', 'P105', 'P108', 'P109', 'P113', 'P114', 'P119', 'P122', 'P123', 'P128', 'P132', 'P133', 'P139', \
'P151', 'P155', 'P156', 'P157', 'P165', 'P169', 'P173', 'P174', 'P177', 'P183', 'P186', 'P190', 'P192', \
'P193', 'P194', 'P197', 'P199', 'P200', 'P202', 'P205', 'P14', 'P41', 'P49', 'P75', 'P83', 'P86', 'P90', \
'P103', 'P167', 'P171', 'P196', 'P198', 'P204', 'P214', 'P254', 'P191'] 

# Identifiers assigned to the test split (matched the same way as TRAIN_DATA).
TEST_DATA = ['P252', 'P265', 'P162', 'P170', 'P172', 'P178', 'P195', 'P221', 'P253', 'P234', 'P236', 'P237', \
'P241', 'P262', 'P272', 'P277', 'P282', 'P284', 'P285', 'P288', 'P291', 'P293', 'P296', 'P220', 'P228', \
'P246', 'P250', 'P270', 'P273', 'P283', 'P302', 'P268', 'P292', 'P226', 'P245', 'P263', 'P269', 'P286', \
'P217', 'P219', 'P233', 'P244', 'P206', 'P211', 'P215', 'P223', 'P227', 'P235', 'P243', 'P257', 'P258', \
'P260', 'P261', 'P267', 'P275', 'P278', 'P264', 'P274', 'P276', 'P279', 'P242']

# Identifiers assigned to the validation split (matched the same way as TRAIN_DATA).
VAL_DATA = ['P271', 'P259', 'P238', 'P281', 'P229', 'P240', 'P297', 'P309', 'P310', 'P50', 'P76', 'P230', \
'P304', 'P305', 'P306', 'P307', 'P308', 'P300', 'P290', 'P298', 'P299', 'P249', 'P239', 'P294', 'P301', \
'P303', 'P161', 'P256', 'P37', 'P287', 'P295', 'P82', 'P247'] 

In [4]:
# Root directory of the preprocessed scans. Later cells build paths by plain string
# concatenation (e.g. data_path + 'phase1/pos/'), so the trailing slash is required.
data_path = '/research/rih-cs/datasets/elvo-multiphase/preprocessed/'

In [5]:
# LENGTH, WIDTH, HEIGHT = (3, 230, 230)
# Positions, within the sorted phase1/pos/ listing, of the files in each split.
TRAIN_INDICES = []
TEST_INDICES = []
VAL_INDICES = []

# Phase1 arrays loaded for each split, in the same order as the index lists.
# Usage: np.stack(train_arrays)
train_arrays = []
test_arrays = []
val_arrays = []

def load_training_data():
    """
    Sort the phase1 positive files into the train / test / validation splits.

    Every file in ``data_path + 'phase1/pos/'`` is visited in sorted order.
    Its name, with the extension stripped, is looked up in TRAIN_DATA,
    TEST_DATA and VAL_DATA (in that order). On a match the array is loaded
    and appended to the corresponding ``*_arrays`` list, and the file's
    position in the sorted listing is appended to the corresponding
    ``*_INDICES`` list. Files that belong to no split are reported through
    ``logging.info`` and are not loaded.

    The function returns None; results are delivered through the
    module-level lists. Those lists are emptied first, so calling the
    function again rebuilds the splits instead of appending duplicates.
    """
    # Reset in place so that other references to these lists stay valid.
    for bucket in (TRAIN_INDICES, TEST_INDICES, VAL_INDICES,
                   train_arrays, test_arrays, val_arrays):
        bucket.clear()

    # Checked in order; the first split that lists the name wins.
    splits = (
        (TRAIN_DATA, train_arrays, TRAIN_INDICES),
        (TEST_DATA, test_arrays, TEST_INDICES),
        (VAL_DATA, val_arrays, VAL_INDICES),
    )

    pos_dir = data_path + 'phase1/pos/'
    for i, filename in enumerate(sorted(os.listdir(pos_dir))):
        matching_name = os.path.splitext(filename)[0]
        for split_names, split_arrays, split_indices in splits:
            if matching_name in split_names:
                # Load only files that are actually used by a split.
                split_arrays.append(np.load(pos_dir + filename))
                split_indices.append(i)
                break
        else:
            # INFO records are only visible once the root logger is set to
            # INFO (e.g. by calling configure_logger()).
            logging.info(
                f'training file {filename}, {matching_name} is not found.')

In [6]:
# Populate the split index lists and array lists from phase1/pos/.
load_training_data()
# Author's note: as of 12/02/2018 there are 406 positive datasets.

In [7]:
# Number of phase1 positive files that fell into the training split.
len(TRAIN_INDICES)

95

In [8]:
# Number of phase1 positive files that fell into the test split.
len(TEST_INDICES)

36

In [9]:
# Number of phase1 positive files that fell into the validation split.
len(VAL_INDICES)

22

## Processing into one input for LSTM

In [33]:
# Set up the multiple (three) parallel phases as input for the LSTM model
# Doc: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/

# Buffer of shape (n_samples, n_phases, n_features). With the current split the
# rows are: positives i = 0-94, negatives i = 95-161. The function below
# re-allocates it if the actual sample count differs from 162.
lstm_input = np.zeros((162, 3, 3 * 230 * 230))

phase1_full_path = data_path + 'phase1/pos/'
phase2_full_path = data_path + 'phase2/pos/'
phase3_full_path = data_path + 'phase3/pos/'

# Author's note: there are 96 files in each neg directory, split by position in the
# sorted listing: 0-66 (training); 67-86 (testing); 87-95 (validation)
neg_phase1_full_path = data_path + 'phase1/neg/'
neg_phase2_full_path = data_path + 'phase2/neg/'
neg_phase3_full_path = data_path + 'phase3/neg/'

def create_lstm_training_input():
    """
    Build the LSTM training tensor and return it.

    Row order is: one row per entry of TRAIN_INDICES (positive samples, read
    from the ``pos`` directories), followed by the first 67 files (listing
    positions 0-66) of the ``neg`` directories. Each row holds three
    "time steps", one per phase, and each time step is the phase array forced
    to (3, 230, 230) with ``np.resize`` and then flattened.

    The result is stored in the module-level ``lstm_input`` (re-allocated when
    its shape does not match the number of samples) and also returned. The
    number of rows written is printed.

    Notes:
        * The same listing position is used for all three phase directories,
          which assumes the sorted listings line up file-for-file
          -- TODO confirm.
        * TODO (from the original author): check if the resize is good.
          ``np.resize`` does not interpolate; it truncates or repeats the
          flattened data to reach the requested size.
    """
    global lstm_input

    phase_shape = (3, 230, 230)
    n_features = phase_shape[0] * phase_shape[1] * phase_shape[2]
    n_train_neg = 67  # listing positions 0-66 of the neg directories

    pos_dirs = (phase1_full_path, phase2_full_path, phase3_full_path)
    neg_dirs = (neg_phase1_full_path, neg_phase2_full_path, neg_phase3_full_path)
    pos_listings = [sorted(os.listdir(directory)) for directory in pos_dirs]
    neg_listings = [sorted(os.listdir(directory)) for directory in neg_dirs]

    # TRAIN_INDICES is selected in the google spreadsheet based on the data's location
    samples = [(pos_dirs, pos_listings, index) for index in TRAIN_INDICES]
    samples += [(neg_dirs, neg_listings, neg_index)
                for neg_index in range(n_train_neg)]

    # Size the buffer from the data so a changed split can neither overflow
    # it nor leave all-zero rows at the end.
    expected_shape = (len(samples), len(pos_dirs), n_features)
    if lstm_input.shape != expected_shape:
        lstm_input = np.zeros(expected_shape)

    i = 0
    for directories, listings, index in samples:
        for phase, (directory, files) in enumerate(zip(directories, listings)):
            phase_arr = np.load(directory + files[index])
            lstm_input[i, phase] = np.resize(phase_arr, phase_shape).reshape(n_features)
        i += 1

    print(i)
    return lstm_input
    

## LSTM input for validation 

In [51]:
# Set up the multiple (three) parallel phases as input for the LSTM model
# Doc: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/

# Buffer of shape (n_samples, n_phases, n_features). With the current split the
# rows are: positives i = 0-21, negatives i = 22-30 (taken from neg listing
# positions 87-95). The function below re-allocates it if the actual sample
# count differs from 31.
lstm_val_input = np.zeros((31, 3, 3 * 230 * 230))

phase1_full_path = data_path + 'phase1/pos/'
phase2_full_path = data_path + 'phase2/pos/'
phase3_full_path = data_path + 'phase3/pos/'

# Author's note: there are 96 files in each neg directory, split by position in the
# sorted listing: 0-66 (training); 67-86 (testing); 87-95 (validation)
neg_phase1_full_path = data_path + 'phase1/neg/'
neg_phase2_full_path = data_path + 'phase2/neg/'
neg_phase3_full_path = data_path + 'phase3/neg/'

def create_lstm_val_input():
    """
    Build the LSTM validation tensor and return it.

    Row order is: one row per entry of VAL_INDICES (positive samples, read
    from the ``pos`` directories), followed by the 9 files at listing
    positions 87-95 of the ``neg`` directories. Each row holds three
    "time steps", one per phase, and each time step is the phase array forced
    to (3, 230, 230) with ``np.resize`` and then flattened.

    The result is stored in the module-level ``lstm_val_input`` (re-allocated
    when its shape does not match the number of samples) and also returned.
    The number of rows written is printed.

    Notes:
        * The same listing position is used for all three phase directories,
          which assumes the sorted listings line up file-for-file
          -- TODO confirm.
        * ``np.resize`` does not interpolate; it truncates or repeats the
          flattened data to reach the requested size.
    """
    global lstm_val_input

    phase_shape = (3, 230, 230)
    n_features = phase_shape[0] * phase_shape[1] * phase_shape[2]
    val_neg_positions = range(87, 96)  # the last 9 of the 96 negative files

    pos_dirs = (phase1_full_path, phase2_full_path, phase3_full_path)
    neg_dirs = (neg_phase1_full_path, neg_phase2_full_path, neg_phase3_full_path)
    pos_listings = [sorted(os.listdir(directory)) for directory in pos_dirs]
    neg_listings = [sorted(os.listdir(directory)) for directory in neg_dirs]

    # VAL_INDICES holds positions in the sorted phase1/pos/ listing.
    samples = [(pos_dirs, pos_listings, index) for index in VAL_INDICES]
    samples += [(neg_dirs, neg_listings, neg_index)
                for neg_index in val_neg_positions]

    # Size the buffer from the data so a changed split can neither overflow
    # it nor leave all-zero rows at the end.
    expected_shape = (len(samples), len(pos_dirs), n_features)
    if lstm_val_input.shape != expected_shape:
        lstm_val_input = np.zeros(expected_shape)

    j = 0
    for directories, listings, index in samples:
        for phase, (directory, files) in enumerate(zip(directories, listings)):
            phase_arr = np.load(directory + files[index])
            lstm_val_input[j, phase] = np.resize(phase_arr, phase_shape).reshape(n_features)
        j += 1

    print(j)
    return lstm_val_input
    

In [34]:
# Build the training tensor; the function also prints how many samples it wrote.
lstm_training_input = create_lstm_training_input()

162


In [37]:
# Spot-check the sample at row 161 (the last of the 162 rows reported above):
# one flattened row per phase.
lstm_training_input[161]

array([[ -990.,  -988.,  -987., ...,  -920.,  -864.,  -874.],
       [-1024., -1024., -1024., ..., -1010., -1008., -1000.],
       [-1024., -1024., -1024., ..., -1011., -1007., -1010.]])

In [38]:
# create_lstm_training_input() returns the module-level lstm_input it filled,
# so this rebinding points at the same array.
lstm_input = lstm_training_input

In [52]:
# Build the validation tensor; the function also prints how many samples it wrote.
lstm_val_input = create_lstm_val_input()

31


## Build LSTM Model

In [10]:
from keras.layers import Input, BatchNormalization, Dense, Flatten, Embedding
from keras.layers.recurrent import RNN, LSTM 
from keras.models import Model, Sequential

Using TensorFlow backend.
  return f(*args, **kwds)


In [53]:
model = Sequential()
# Input tensors are (n_samples, n_steps, n_features); the three phases are the steps.
print(lstm_input.shape)
num_samples = lstm_input.shape[0]
num_steps = lstm_input.shape[1]
num_features = lstm_input.shape[2]
num_classes = 2

# Labels: 1 = positive, 0 = negative. Both input tensors are built with the
# positives first (one row per entry of TRAIN_INDICES / VAL_INDICES) followed
# by the negatives, so the label counts are derived from the data instead of
# being hard-coded.
num_train_pos = len(TRAIN_INDICES)
pos_y = np.ones((num_train_pos,))
neg_y = np.zeros((num_samples - num_train_pos,))
y_train = np.concatenate((pos_y,neg_y))
x_train = lstm_input

num_val_pos = len(VAL_INDICES)
val_pos_y = np.ones((num_val_pos,))
val_neg_y = np.zeros((lstm_val_input.shape[0] - num_val_pos,))
y_val = np.concatenate((val_pos_y,val_neg_y))
x_val = lstm_val_input

model.add(LSTM(32, input_shape=(num_steps, num_features)))
# A single output unit needs a sigmoid: softmax normalises across the units of
# the layer, so with one unit it always outputs 1.0 and nothing can be learned.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, 
          batch_size=18, epochs=10, 
          validation_data=(x_val, y_val))

(162, 3, 158700)
Train on 162 samples, validate on 31 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd137a41ef0>

In [81]:
# Playground: Example from https://keras.io/getting-started/sequential-model-guide/
# NOTE: this cell rebinds model, num_classes, x_train, y_train, x_val and y_val,
# replacing the objects created for the ELVO model in the previous cell.
data_dim = 16
timesteps = 8
num_classes = 10

# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(32,# return_sequences=True,
               input_shape=(timesteps, data_dim)))  # return_sequences is commented out, so this returns a single vector of dimension 32
# model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
# model.add(LSTM(32))  # return a single vector of dimension 32
model.add(Dense(10, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Generate dummy training data (unseeded uniform random values, so results differ between runs)
x_train = np.random.random((1000, timesteps, data_dim))
y_train = np.random.random((1000, num_classes))

# Generate dummy validation data
x_val = np.random.random((100, timesteps, data_dim))
y_val = np.random.random((100, num_classes))

model.fit(x_train, y_train,
          batch_size=64, epochs=5,
          validation_data=(x_val, y_val))

Train on 1000 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8bd850bda0>