In [None]:
from os.path import exists
import tqdm
from datetime import datetime
import string
import pandas as pd
import tensorflow as tf
from keras.layers import TextVectorization
from typing import Tuple
import numpy as np
import pandas
import tqdm
import pickle
import numpy
from sklearn.model_selection import train_test_split



In [None]:
%run constants.py
tf.random.set_seed(
    SEED
)
np.random.seed(SEED)


In [None]:
def load_datafile():
    if not exists(DATA_FILEPATH):
        raise ValueError("No datafile supplied.")

    for _ in tqdm.tqdm(range(0, 100), ncols=100, desc="Loading data.."):
        df = pd.read_csv(DATA_FILEPATH, delimiter="\t", encoding="latin-1")
    print(f"------Loading {DATA_FILEPATH} is completed ------")

    doy = []  # Calc the day of the year for each entry in file
    for index in range(len(df)):
        d1 = datetime.strptime(df.iloc[index].date, "%Y-%m-%d %H:%M:%S")
        day_of_year = d1.timetuple().tm_yday  # returns 1 for January 1st
        doy.append(day_of_year)
    df["day_of_year"] = doy

    print(f"Total EHRs: {len(df.index)}")
    print(f"Average EHR character length: {df.ehr.str.split().apply(len).mean()}")
    return df

def refactor_dataframe(df):
    # New dataframe to hold changed data shape - want to have columns equal to every day of the year, with each row indicating a specific patient. EHR entries are located in each cell
    doy = list(range(0, 365))  # Unsuprisingly, there are 365 days in a year
    ts_df = pd.DataFrame(
            columns=doy
        )  # add 365 day of year columns to the new dataframe
    max_patient_num: int = len(
            df.index
        )  # Assumption is that this is Z set i.e. {0, ..., 365}
    for i in tqdm.tqdm(range(max_patient_num), desc="Rearranging patient data"):
            rows = df.loc[df.patient_id == i]
            for index, row in rows.iterrows():
                ts_df.at[i, row.day_of_year] = row.ehr
    print("------ Patient data restructuring is completed ------")
    ts_df.to_csv(REARRANGED_DATA_FILEPATH, index=False)

    time_series_df = pd.read_csv(REARRANGED_DATA_FILEPATH)
    return time_series_df

def custom_standardization(input_string):
    """ Remove html line-break tags and handle punctuation """
    no_uppercased = tf.strings.lower(input_string, encoding='utf-8')
    no_stars = tf.strings.regex_replace(no_uppercased, "\*", " ")
    no_repeats = tf.strings.regex_replace(no_stars, "devamını oku", "")    
    no_html = tf.strings.regex_replace(no_repeats, "<br />", "")
    no_digits = tf.strings.regex_replace(no_html, "\w*\d\w*","")
    no_punctuations = tf.strings.regex_replace(no_digits, f"([{string.punctuation}])", r" ")

    return no_punctuations
    

def clamp(minimum: int, x: int, maximum: int):
    """Clamps an integer between a min/max"""
    return max(minimum, min(x, maximum))



class WindowGenerator:
    """
    Class to generate timestep'd data
    """

    def __init__(self, input_width: int, output_width: int, save_windows: bool):
        """Init Parmas
        Args:
            input_width (int): The timesteps forming the input sequence
            output_width (int): The timesteps forming the output sequence
        """
        self.input_width: int = input_width
        self.output_width: int = output_width
        self.total_window_size: int = input_width + output_width
        self.minimum_day_of_year: int = 0
        self.maximum_day_of_year: int = 365
        self.save_windows: bool = save_windows

    def window_datafile(
        self, data: pandas.DataFrame
    ) -> Tuple[np.ndarray, np.ndarray]:
        data = data.head(100)
        sequence: list = []
        labels: list = []
        # non_null_indexes = list(
        #     zip(*np.where(data.notnull()))
        # )  # Get indexes of df where values which are not null

        for index, row in data.iterrows():
            for column in row.index[row.notnull()]:
                column = int(column)
                lower_bound = clamp(
                    self.minimum_day_of_year,
                    column - self.input_width,
                    self.maximum_day_of_year,
                )

                upper_bound = clamp(
                    0,
                    column + self.input_width,
                    self.maximum_day_of_year,
                )

                visit_index = column + 1

                input_sequence = data.iloc[index, lower_bound + 1 : visit_index]
                input_sequence = input_sequence.to_numpy()
                test_input = input_sequence

                output_sequence = data.iloc[index, visit_index : upper_bound + 1]
                output_sequence = output_sequence.to_numpy()
                if len(input_sequence) < self.input_width:
                    input_sequence = self._pad_timeseries(sequence=input_sequence)
                if len(input_sequence) != TIME_STEP:
                    raise ValueError(
                        f"Input sequence has incorrect length :{len(input_sequence)} when compared to timestep window: {TIME_STEP -1}"
                    )
                sequence.append(input_sequence)

                label = self._categorize_output_sequence(
                    output_sequence=output_sequence
                )
                labels.append(label)
        if self.save_windows:
            self.save_frames(output_labels=np.array(labels), input_sequence=sequence)

        return sequence, np.array(labels)

    def _pad_timeseries(self, sequence):
        pad_nan_delta = self.input_width - len(sequence)
        if pad_nan_delta > 0:
            sequence = np.pad(
                sequence,
                (pad_nan_delta, 0),
                "constant",
                constant_values=EMPTY_TIMESTEP_TOKEN,
            )
        return sequence

    def save_frames(self, output_labels, input_sequence):
        print("------Saving windows for reuse ------")
        with open(REARRANGED_INPUT_WINDOWED_DATA_FILEPATH, "wb") as f:
            pickle.dump(input_sequence, f)
        with open(REARRANGED_INPUT_WINDOWED_LABEL_FILEPATH, "wb") as f:
            pickle.dump(output_labels, f)

    def _categorize_output_sequence(self, output_sequence: pandas.DataFrame) -> bool:
        """Categorise output sequence to binary
        Classification is based on if output sequence is not null in the output width
        Args:
            output_sequence (pandas.DataFrame): Sequence to classify
        Returns:
            bool: 0 = no revisit, 1 = revisit
        """
        try:
            np.isnan(np.sum(output_sequence))
            return 0
        except:
            return 1


def generate_windows(time_series_df):

    w1 = WindowGenerator(
                input_width=TIME_STEP, output_width=TIME_STEP, save_windows=True
            )
    loaded_dataset, loaded_labels = w1.window_datafile(time_series_df)
   
    print("------ Windowed Data Loaded ------")
    return loaded_dataset, loaded_labels


def vectorize_data_multi_timestep(text_vectorization, loaded_dataset):
    arr = numpy.array(loaded_dataset)
    arr[pd.isnull(arr)] = EMPTY_TIMESTEP_TOKEN
    input_samples = []
    for _, item in enumerate(
        tqdm.tqdm(arr, desc="Vectoring multi timestep"),
    ):
        time_seq = []
        for _, timestep in enumerate(item):
            time_seq.append(text_vectorization(timestep))
        input_samples.append(time_seq)
    test = numpy.array(input_samples)
    return test

def embed_vectors(text_vectorization):
    embeddings_index = {}

    f = open(GLOVE_300D_FILEPATH)
    for line in tqdm.tqdm(f, ncols=100, desc="Loading Glove Embeddings."):
        values = line.split()
        word = values[0]
        coefs = numpy.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
    f.close()

    print(f"Found {len(embeddings_index)} word vectors.")

    vocabulary = text_vectorization.get_vocabulary()
    word_index = dict(zip(vocabulary, range(len(vocabulary))))
    embedding_matrix = numpy.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM))

    for word, i in tqdm.tqdm(word_index.items(), desc="Embedding Matrix."):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [4]:
df = load_datafile()

if not LOAD_FROM_SAVE: 
    time_series_df = refactor_dataframe(df)
    loaded_ds, loaded_labels = generate_windows(time_series_df)
else:
    with open(REARRANGED_INPUT_WINDOWED_DATA_FILEPATH, "rb") as f:
        loaded_ds = pickle.load(f)
    with open(REARRANGED_INPUT_WINDOWED_LABEL_FILEPATH, "rb") as f:
        loaded_labels = pickle.load(f)

Loading data..: 100%|█████████████████████████████████████████████| 100/100 [00:11<00:00,  8.76it/s]


------Loading /home/aaron/timeseries_nlp/data/data.csv is completed ------
Total EHRs: 23907
Average EHR character length: 99.45509683356339


In [None]:
df_test = pd.DataFrame(loaded_ds)

X_train, X_test, y_train, y_test = train_test_split(df_test, loaded_labels, test_size=TEST_TRAIN_SPLIT, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATION_SPLIT, random_state=SEED)

In [None]:
print(f"Training data shape:  {X_train.shape, y_train.shape} ")
print(f"Validation data shape:  {X_val.shape, y_val.shape} ")
print(f"Testing data shape:  {X_test.shape, y_test.shape} ")
with open(X_TRAIN_INPUT_SAVE_FILE_PRE_VEC, "wb") as f:
        pickle.dump(X_train, f)
with open(X_TEST_INPUT_SAVE_FILE_PRE_VEC, "wb") as f:
        pickle.dump(X_test, f)
with open(X_VAL_INPUT_SAVE_FILE_PRE_VEC, "wb") as f:
        pickle.dump(X_val, f)

In [None]:
#Shave off the training corpora for fine tuning glove embeddings with it
train_corpora = X_train[29].str.split()
train_corpora = train_corpora.tolist()
flat_list_train_corpora = [x for xs in train_corpora for x in xs]
flat_list_train_corpora = list(set(flat_list_train_corpora))
flat_list_train_corpora

In [None]:
len(flat_list_train_corpora)

In [None]:
def create_textvectorisation(lst):
    text_vectorization: TextVectorization = TextVectorization(
        output_mode="int",
        split="whitespace",
        max_tokens=MAX_VOCAB_SIZE,
        output_sequence_length=MAX_SEQUENCE_LENGTH,
        standardize=custom_standardization
    )
    text_vectorization.adapt(lst)
    return text_vectorization


text_vectorization = create_textvectorisation(flat_list_train_corpora)

X_train_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_train)
X_test_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_test)
X_val_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_val)
y_train = numpy.array(y_train)
y_test = numpy.array(y_test)
y_val = numpy.array(y_val)

In [None]:
embedding_matrix = embed_vectors(text_vectorization)
vocab = text_vectorization.get_vocabulary()

In [None]:
print(f"Training data shape:  {X_train_vec_ds.shape, y_train.shape} ")
print(f"Validation data shape:  {X_val_vec_ds.shape, y_val.shape} ")
print(f"Testing data shape:  {X_test_vec_ds.shape, y_test.shape} ")

In [None]:
with open(X_TRAIN_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(X_train_vec_ds, f)
with open(Y_TRAIN_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(y_train, f)
with open(X_TEST_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(X_test_vec_ds, f)
with open(Y_TEST_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(y_test, f)
with open(X_VAL_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(X_val_vec_ds, f)
with open(Y_VAL_INPUT_SAVE_FILE, "wb") as f:
        pickle.dump(y_val, f)
with open(EMBEDDING_MATRIX_SAVE_FILE, "wb") as f:
        pickle.dump(embedding_matrix, f)
with open(VOCAB_SAVE_FILE, "wb") as f:
        pickle.dump(vocab, f)


In [None]:
#Pre training GLOVE on the training data..!?!

import csv
from mittens import Mittens
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed


pre_glove = glove2dict(GLOVE_300D_FILEPATH)

sw = list(_stop_words.ENGLISH_STOP_WORDS)

train_corpora_for_glove = [token.lower() for token in flat_list_train_corpora if (token.lower() not in sw)]
oov = [token for token in flat_list_train_corpora if token not in pre_glove.keys()]


corp_vocab = list(set(oov))
brown_doc = [' '.join(train_corpora_for_glove)]


cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(brown_doc)
Xc = (X.T * X)
Xc.setdiag(0)
coocc_ar = Xc.toarray()

mittens_model = Mittens(n=EMBEDDING_DIM, max_iter=1000)

new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict= pre_glove)

newglove = dict(zip(corp_vocab, new_embeddings))
f = open(FINE_TUNED_GLOVE_300D_FILEPATH,"wb")
pickle.dump(newglove, f)
f.close()
for key, value in newglove.items():
     print(key, '->', value)