# LSTM Multiphase Model Training

Note: Make sure to run the notebook in virtualenv. 

## Loading the Data
The code below loads data and labels from `/research/rih-cs/datasets/elvo-multiphase`.

Each phase data is stored under `/research/rih-cs/datasets/elvo-multiphase/preprocessed`.

In [1]:
import os
import pathlib
import typing

import numpy as np

In [2]:
import  logging

def configure_logger():
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

## Train / Test / Val Split
We will iterate through the pos and neg directory of phase1 to get the index of our train/test/val set. 


In [3]:
TRAIN_DATA = ['P25', 'P48', 'P62', 'P72', 'P144', 'P149', 'P1', 'P4', 'P16', 'P21', 'P32', \
'P36', 'P38', 'P52', 'P59', 'P88', 'P89', 'P118', 'P164', 'P232', 'P255', 'P266', 'P280', \
'P289', 'P73', 'P78', 'P120', 'P142', 'P126', 'P145', 'P147', 'P3', 'P5', 'P6', 'P15', \
'P17', 'P22', 'P28', 'P29', 'P34', 'P57', 'P58', 'P61', 'P66', 'P68', 'P70', 'P77', 'P80', \
'P85', 'P87', 'P94', 'P102', 'P106', 'P107', 'P110', 'P125', 'P127', 'P130', 'P134', 'P135', \
'P141', 'P150', 'P152', 'P153', 'P158', 'P163', 'P166', 'P179', 'P180', 'P181', 'P182', 'P185', \
'P207', 'P209', 'P210', 'P216', 'P218', 'P222', 'P224', 'P225', 'P231', 'P8', 'P13', 'P18', \
'P24', 'P33', 'P40', 'P43', 'P44', 'P47', 'P51', 'P53', 'P56', 'P63', 'P67', 'P69', 'P81', \
'P100', 'P101', 'P111', 'P117', 'P124', 'P146', 'P168', 'P184', 'P187', 'P188', 'P208', 'P212', \
'P248', 'P112', 'P2', 'P10', 'P20', 'P26', 'P46', 'P60', 'P79', 'P93', 'P95', 'P98', 'P116', 'P121', \
'P136', 'P143', 'P148', 'P160', 'P189', 'P203', 'P71', 'P97', 'P140', 'P84', 'P92', 'P131', 'P7', \
'P42', 'P129', 'P137', 'P154', 'P159', 'P176', 'P201', 'P213', 'P9', 'P11', 'P12', 'P19', 'P23', \
'P27', 'P30', 'P31', 'P35', 'P39', 'P45', 'P54', 'P55', 'P64', 'P65', 'P74', 'P91', 'P96', 'P99', \
'P104', 'P105', 'P108', 'P109', 'P113', 'P114', 'P119', 'P122', 'P123', 'P128', 'P132', 'P133', 'P139', \
'P151', 'P155', 'P156', 'P157', 'P165', 'P169', 'P173', 'P174', 'P177', 'P183', 'P186', 'P190', 'P192', \
'P193', 'P194', 'P197', 'P199', 'P200', 'P202', 'P205', 'P14', 'P41', 'P49', 'P75', 'P83', 'P86', 'P90', \
'P103', 'P167', 'P171', 'P196', 'P198', 'P204', 'P214', 'P254', 'P191'] 

TEST_DATA = ['P252', 'P265', 'P162', 'P170', 'P172', 'P178', 'P195', 'P221', 'P253', 'P234', 'P236', 'P237', \
'P241', 'P262', 'P272', 'P277', 'P282', 'P284', 'P285', 'P288', 'P291', 'P293', 'P296', 'P220', 'P228', \
'P246', 'P250', 'P270', 'P273', 'P283', 'P302', 'P268', 'P292', 'P226', 'P245', 'P263', 'P269', 'P286', \
'P217', 'P219', 'P233', 'P244', 'P206', 'P211', 'P215', 'P223', 'P227', 'P235', 'P243', 'P257', 'P258', \
'P260', 'P261', 'P267', 'P275', 'P278', 'P264', 'P274', 'P276', 'P279', 'P242']

VAL_DATA = ['P271', 'P259', 'P238', 'P281', 'P229', 'P240', 'P297', 'P309', 'P310', 'P50', 'P76', 'P230', \
'P304', 'P305', 'P306', 'P307', 'P308', 'P300', 'P290', 'P298', 'P299', 'P249', 'P239', 'P294', 'P301', \
'P303', 'P161', 'P256', 'P37', 'P287', 'P295', 'P82', 'P247'] 

In [4]:
data_path = '/research/rih-cs/datasets/elvo-multiphase/preprocessed/'

In [5]:
LENGTH, WIDTH, HEIGHT = (3, 230, 230)
TRAIN_INDICES = []
TEST_INDICES = []
VAL_INDICES = []

# Usage: np.stack(train_arrays)
train_arrays = []
test_arrays = []
val_arrays = []

def load_training_data(): 
    """
    Returns 4D matrix of training data
    Data is in the form (n_samples, 1, w, h). 
    Samples are sorted respectively according to the specs in TRAIN_DATA, TEST_DATA, VAL_DATA
    """

    phase1_pos_files = sorted(os.listdir(data_path + 'phase1/pos/'))
    for i, filename in enumerate(phase1_pos_files):
        arr = np.load(data_path + 'phase1/pos/' + filename)
        if arr.shape == (LENGTH, WIDTH, HEIGHT):
            matching_name = os.path.splitext(filename)[0] 
            if matching_name in TRAIN_DATA:
                train_arrays.append(arr)
                TRAIN_INDICES.append(i)
            elif matching_name in TEST_DATA: 
                test_arrays.append(arr)
                TEST_INDICES.append(i)
            elif matching_name in VAL_DATA: 
                val_arrays.append(arr)
                VAL_INDICES.append(i)
            else: 
                logging.info(
                f'training file {filename}, {matching_name} is not found.')
        else:
            logging.info(
                f'training file {filename} has incorrect shape {arr.shape}')

In [11]:
load_training_data()
# 12/02/2018 has 406 positive dataset

In [39]:
# check the indices 
TRAIN_INDICES

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 29,
 32,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 46,
 47,
 48,
 49,
 51,
 52,
 53,
 54,
 55,
 56,
 59,
 60,
 62,
 65,
 82,
 99,
 100,
 114,
 116,
 117,
 120,
 121,
 124,
 125,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 141,
 142,
 143,
 144,
 146,
 147,
 148,
 149,
 151,
 152,
 153,
 154,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 29,
 32,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 46,
 47,
 48,
 49,
 51,
 52,
 53,
 54,
 55,
 56,
 59,
 60,
 62,
 65,
 82,
 99,
 100,
 114,
 116,
 117,
 120,
 121,
 124,
 125,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 141,
 142,
 143,
 144,
 146,
 147,
 148,
 149,
 151,
 152,
 153,
 154]

In [70]:
# check the shape of train array output
train_shape = np.stack(train_arrays).shape
n_train = train_shape[0]
train_shape

(172, 3, 230, 230)

## Processing into one input for LSTM

In [59]:
# How would the data after concatenation look like
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
c = np.array([[15, 16], [17, 18]])

np.concatenate((a, b, c), axis=1)

array([[ 1,  2,  5,  6, 15, 16],
       [ 3,  4,  7,  8, 17, 18]])

In [72]:
# TODO: set up the multiple (three) parallel phases as input for the LSTM model 
# Doc: https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/
# For example, TRAIN_INDICES has train indices for each phase; train_arrays has values in each phase. 
# To merge them into LSTM's input, do the following steps:
# 1. define these data as a matrix of 3 columns (phases) with n rows 
# 2. data = data.reshape(1, n, 3)
# 3. Check the shape by print(data.shape)

# train shape = (172, 3, 230, 230)
lstm_input = np.zeros((172, 3))

phase1_full_path = data_path + 'phase1/pos/'
phase2_full_path = data_path + 'phase2/pos/'
phase3_full_path = data_path + 'phase3/pos/'

def create_lstm_training_input(): 
    phase1_pos_files = sorted(os.listdir(phase1_full_path))
    phase2_pos_files = sorted(os.listdir(phase2_full_path))
    phase3_pos_files = sorted(os.listdir(phase3_full_path))
    
    i = 0 
    for index in TRAIN_INDICES: 
        phase1_arr = np.load(phase1_full_path + phase1_pos_files[index])
        print(phase1_arr.shape)
        phase2_arr = np.load(phase2_full_path + phase2_pos_files[index])
        print(phase2_arr.shape)
        phase3_arr = np.load(phase3_full_path +phase3_pos_files[index])
        print(phase3_arr.shape)

        # TODO: ValueError: all the input array dimensions except for the concatenation axis must match exactly
        # For example: for P100.npy, (3, 230, 230), (3, 197, 174), (3, 199, 174)
        lstm_input[i] = np.concatenate((phase1_arr, phase2_arr, phase3_arr), axis=1)
        i += 1 
   
    # reshape to (1, ... )
    lstm_training_input = lstm_input.reshape(1, n_train, 3) 
    
    return lstm_training_input
    

In [73]:
lstm_training_input = create_lstm_training_input()

(3, 230, 230)
(3, 197, 174)
(3, 199, 174)


ValueError: all the input array dimensions except for the concatenation axis must match exactly

## Build LSTM Model

In [9]:
from keras.layers import Input, BatchNormalization, Dense, Flatten, Embedding
from keras.layers.recurrent import RNN, LSTM 
from keras.models import Model, Sequential

Using TensorFlow backend.
  return f(*args, **kwds)


In [10]:
model = Sequential()
model.add(LSTM(32, input_shape=(n_train, 3)))
model.add(Dense(1))

NameError: name 'n' is not defined