In [None]:
import pickle
import warnings
from os.path import exists
from datetime import datetime
import numpy
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.layers import Embedding, TextVectorization
from functions.LSTMModel import LSTMModel
from functions.WindowGenerator import WindowGenerator

from constants import (
    BATCH_SIZE,
    DATA_FILEPATH,
    EMBEDDING_DIM,
    GLOVE_300D_FILEPATH,
    MAX_SEQUENCE_LENGTH,
    MAX_VOCAB_SIZE,
    NUM_EPOCHS,
    REARRANGED_DATA_FILEPATH,
    REARRANGED_SINGLE_INPUT_WINDOWED_DATA_FILEPATH,
    REARRANGED_SINGLE_INPUT_WINDOWED_LABEL_FILEPATH,
    TEST_TRAIN_SPLIT,
)

warnings.simplefilter(action="ignore", category=FutureWarning)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
AUTOTUNE = tf.data.AUTOTUNE


def main():

    """
    1 - Load datafile
    2 - Rearrange datafile if needed, if present skip
    3 - Create TextVectorization
    4 - Window Data
    5 - Section into test/train/split
    6 - Embed vectors
    7 - Fit and Train Model
    """

    """
    1. Load Datafile
    """

    if not exists(DATA_FILEPATH):
        raise ValueError("No datafile supplied.")

    for _ in tqdm(range(0, 100), ncols=100, desc="Loading data.."):
        df = pd.read_csv(DATA_FILEPATH, delimiter="\t", encoding="latin-1")
    print(f"------Loading {DATA_FILEPATH} is completed ------")

    doy = []  # Calc the day of the year for each entry in file
    for index in range(len(df)):
        d1 = datetime.strptime(df.iloc[index].date, "%Y-%m-%d %H:%M:%S")
        day_of_year = d1.timetuple().tm_yday  # returns 1 for January 1st
        doy.append(day_of_year)
    df["day_of_year"] = doy

    print(f"Total EHRs: {len(df.index)}")
    print(f"Average EHR character length: {df.ehr.apply(len).mean()}")

    """
    2. Create rearranged datafile, if needed
    """

    # New dataframe to hold changed data shape - want to have columns equal to every day of the year, with each row indicating a specific patient. EHR entries are located in each cell
    if not exists(REARRANGED_DATA_FILEPATH):
        doy = list(range(0, 365))  # Unsuprisingly, there are 365 days in a year
        ts_df = pd.DataFrame(
            columns=doy
        )  # add 365 day of year columns to the new dataframe
        max_patient_num: int = len(
            df.index
        )  # Assumption is that this is Z set i.e. {0, ..., 365}
        for i in tqdm(range(max_patient_num), desc="Rearranging patient data"):
            rows = df.loc[df.patient_id == i]
            for index, row in rows.iterrows():
                ts_df.at[i, row.day_of_year] = row.ehr
        print("------ Patient data restructuring is completed ------")
        ts_df.to_csv(REARRANGED_DATA_FILEPATH, index=False)

    time_series_df = pd.read_csv(REARRANGED_DATA_FILEPATH)

    """
    3. Create TextVectorization object
    """

    X_train_text = df.ehr
    text_vectorization: TextVectorization = TextVectorization(
        output_mode="int",
        split="whitespace",
        max_tokens=MAX_VOCAB_SIZE,
        output_sequence_length=MAX_SEQUENCE_LENGTH,
        
    )
    text_vectorization.adapt(X_train_text)

    """
    4. Window data with WindowGenerator
    """

    if not exists(REARRANGED_SINGLE_INPUT_WINDOWED_LABEL_FILEPATH) and not exists(
        REARRANGED_SINGLE_INPUT_WINDOWED_LABEL_FILEPATH
    ):
        w1 = WindowGenerator(input_width=30, output_width=30, save_windows=True)
        w1.window_multi_input_sequence(time_series_df)
    with open(REARRANGED_SINGLE_INPUT_WINDOWED_DATA_FILEPATH, "rb") as f:
        loaded_dataset = pickle.load(f)

    with open(REARRANGED_SINGLE_INPUT_WINDOWED_LABEL_FILEPATH, "rb") as f:
        loaded_labels = pickle.load(f)

    print("------ Windowed Data Loaded ------")

    """
    5. Section Data into test/train/split
    """

    # print(loaded_dataset.shape)
    return loaded_dataset, text_vectorization
loaded_dataset, tv =main()

In [None]:


arr = numpy.array(loaded_dataset)
arr[pd.isnull(arr)] = '<EMPTY>'
input_samples = []
for index, item in enumerate(arr):
    # print(f"seqeunce : {index}")
    # print(f"Seq Len : {len(item)}")
    time_seq = []
    for timestep_index, timestep in enumerate(item): 
        # print(f"Timestep index: {timestep_index}")
        # print(f"Timestep Value: {timestep}")
        # print(f"Timestep Value: {tv(timestep)}")
        time_seq.append(tv(timestep))
    input_samples.append(time_seq)

In [None]:
test = numpy.array(input_samples)

In [None]:
test.shape

In [None]:
vocab = tv.get_vocabulary()

In [None]:

embeddings_index = {}

f = open(GLOVE_300D_FILEPATH, encoding="UTF-8")
for line in tqdm(f, ncols=100, desc="Loading Glove Embeddings."):
        values = line.split()
        word = values[0]
        coefs = numpy.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
f.close()

print(f"Found {len(embeddings_index)} word vectors.")




vocabulary = tv.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
embedding_matrix = numpy.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tqdm(word_index.items(), desc="Embedding Matrix."):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

"""
7. Fit and train models
"""

embedding_layer = Embedding(
        MAX_VOCAB_SIZE,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )


In [12]:
from keras.layers import TimeDistributed, Bidirectional, LSTM, Embedding
import tensorflow as tf
from constants import (
    BATCH_SIZE,
    DATA_FILEPATH,
    EMBEDDING_DIM,
    GLOVE_300D_FILEPATH,
    MAX_SEQUENCE_LENGTH,
    MAX_VOCAB_SIZE,
    NUM_EPOCHS,
    REARRANGED_DATA_FILEPATH,
    REARRANGED_SINGLE_INPUT_WINDOWED_DATA_FILEPATH,
    REARRANGED_SINGLE_INPUT_WINDOWED_LABEL_FILEPATH,
    TEST_TRAIN_SPLIT,
)

inp = tf.keras.Input(shape=(30, MAX_SEQUENCE_LENGTH))    
x = TimeDistributed(Embedding(200, 300)(x))

# embedding_layer = Embedding(
#         MAX_VOCAB_SIZE,
#         EMBEDDING_DIM,
#         input_length=MAX_SEQUENCE_LENGTH,
#         trainable=False,
#     )

# x = TimeDistributed(embedding_layer)(x)

# #x1 shape : (batch, article_num, word_num, 50)
# x1 = TimeDistributed(Bidirectional(LSTM(50, return_sequences = True)))(x)


AttributeError: Exception encountered when calling layer "embedding_6" (type Embedding).

'str' object has no attribute 'base_dtype'

Call arguments received:
  • inputs=<keras.layers.wrappers.TimeDistributed object at 0x000002D837D25450>

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedding_sequences = embedding_layer(sequence_input)

        x = SpatialDropout1D(0.2)(embedding_sequences)
        x = Conv1D(64, 5, activation="relu")(x)
        x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)
        x = Dense(512, activation="relu")(x)
        x = Dropout(0.5)(x)
        x = Dense(512, activation="relu")(x)
        outputs = Dense(1, activation="sigmoid")(x)
        model = Model(sequence_input, outputs)
        return model