In [25]:
import os
import pickle
import re
import string
from datetime import datetime
from os.path import exists
from typing import Tuple

import numpy
import numpy as np
import pandas
import pandas as pd
import tensorflow as tf
import tqdm
from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split

In [26]:
%run constants.py
tf.random.set_seed(
    SEED
)
np.random.seed(SEED)

In [27]:


def clamp(minimum: int, x: int, maximum: int):
    """Restrict ``x`` to the range bounded by ``minimum`` and ``maximum``.

    The upper bound is applied first and the lower bound second, so the
    lower bound wins if the two bounds are given in the wrong order.
    """
    capped = maximum if maximum < x else x
    return capped if capped > minimum else minimum



class WindowGenerator:
    """
    Class to generate timestep'd data

    For every non-null cell of a wide table, emits a fixed-width input
    window taken from the same row plus a binary label derived from the
    cells that follow it.
    """

    def __init__(self, input_width: int, output_width: int, save_windows: bool):
        """Init Params
        Args:
            input_width (int): The timesteps forming the input sequence
            output_width (int): The timesteps forming the output sequence
            save_windows (bool): When True, ``window_datafile`` pickles the
                generated windows and labels through ``save_frames``
        """
        self.input_width: int = input_width
        self.output_width: int = output_width
        self.total_window_size: int = input_width + output_width
        # Bounds applied to every window edge before slicing a row.
        self.minimum_day_of_year: int = 0
        self.maximum_day_of_year: int = 365
        self.save_windows: bool = save_windows

    def window_datafile(
        self, data: pandas.DataFrame
    ) -> Tuple[list, np.ndarray]:
        """Build one (input window, label) pair per non-null cell of ``data``.

        The label of the column holding a non-null cell is converted with
        ``int()`` and used as a positional anchor within the row:

        * input  = positions ``[anchor - input_width + 1, anchor)``,
          left-padded to ``input_width`` entries;
        * output = positions ``[anchor, anchor + output_width]``, reduced to
          a 0/1 label by ``_categorize_output_sequence``.

        Both window edges are clamped to
        ``[minimum_day_of_year, maximum_day_of_year]``.

        NOTE(review): the column label is used directly as an ``.iloc``
        position, so labels are assumed to be integer-like and aligned with
        column positions - confirm the offset against the input file layout.

        Args:
            data (pandas.DataFrame): Wide table, one row per series.
        Returns:
            Tuple[list, np.ndarray]: list of input windows (one array each)
            and the array of matching labels.
        Raises:
            ValueError: If a window does not have exactly ``input_width``
                entries after padding.
        """
        windows: list = []
        labels: list = []
        # iterrows() yields index *labels* while .iloc expects *positions*;
        # counting rows explicitly keeps this correct for any index.
        for row_position, (_, row) in enumerate(data.iterrows()):
            for column in row.index[row.notnull()]:
                visit_index = int(column)
                lower_bound = clamp(
                    self.minimum_day_of_year,
                    visit_index - self.input_width,
                    self.maximum_day_of_year,
                )
                # The look-ahead window is sized by output_width.
                upper_bound = clamp(
                    self.minimum_day_of_year,
                    visit_index + self.output_width,
                    self.maximum_day_of_year,
                )

                input_sequence = data.iloc[
                    row_position, lower_bound + 1 : visit_index
                ].to_numpy()
                output_sequence = data.iloc[
                    row_position, visit_index : upper_bound + 1
                ].to_numpy()

                # No-op when the window is already at least input_width long.
                input_sequence = self._pad_timeseries(sequence=input_sequence)
                if len(input_sequence) != self.input_width:
                    raise ValueError(
                        f"Input sequence has incorrect length: {len(input_sequence)} "
                        f"when compared to input width: {self.input_width}"
                    )
                windows.append(input_sequence)
                labels.append(
                    self._categorize_output_sequence(output_sequence=output_sequence)
                )

        label_array = np.array(labels)
        if self.save_windows:
            self.save_frames(output_labels=label_array, input_sequence=windows)

        return windows, label_array

    def _pad_timeseries(self, sequence):
        """Left-pad ``sequence`` with EMPTY_TIMESTEP_TOKEN up to ``input_width``.

        Sequences that already have ``input_width`` or more entries are
        returned unchanged.
        """
        pad_nan_delta = self.input_width - len(sequence)
        if pad_nan_delta > 0:
            sequence = np.pad(
                sequence,
                (pad_nan_delta, 0),
                "constant",
                constant_values=EMPTY_TIMESTEP_TOKEN,
            )
        return sequence

    def save_frames(self, output_labels, input_sequence):
        """Pickle the windows and their labels to the configured file paths."""
        print("------Saving windows for reuse ------")
        with open(REARRANGED_INPUT_WINDOWED_DATA_FILEPATH, "wb") as f:
            pickle.dump(input_sequence, f)
        with open(REARRANGED_INPUT_WINDOWED_LABEL_FILEPATH, "wb") as f:
            pickle.dump(output_labels, f)

    def _categorize_output_sequence(self, output_sequence: np.ndarray) -> int:
        """Categorise output sequence to binary
        Classification is based on if output sequence is not null in the output width
        Args:
            output_sequence (np.ndarray): Sequence to classify
        Returns:
            int: 0 = no revisit, 1 = revisit
        """
        # Test for non-null entries explicitly instead of relying on np.sum
        # raising on string cells; numeric entries are classified too.
        has_entry = np.asarray(pandas.notnull(output_sequence)).any()
        return int(has_entry)


def generate_windows(time_series_df):
    """Window ``time_series_df`` with TIME_STEP-wide input and output windows.

    The generator is created with ``save_windows=True``, so the windows and
    labels are also pickled as a side effect.

    Returns:
        tuple: (windows, labels) as produced by ``WindowGenerator.window_datafile``.
    """
    window_builder = WindowGenerator(
        input_width=TIME_STEP,
        output_width=TIME_STEP,
        save_windows=True,
    )
    windows, labels = window_builder.window_datafile(time_series_df)

    print("------ Windowed Data Loaded ------")
    return windows, labels


def vectorize_data_multi_timestep(text_vectorization, loaded_dataset):
    """Apply ``text_vectorization`` to every timestep of every sample.

    Args:
        text_vectorization: Callable applied to each individual timestep value.
        loaded_dataset: Array-like of samples, each a sequence of timesteps.
    Returns:
        numpy.ndarray: Array built from the per-sample lists of vectorised timesteps.
    """
    samples = numpy.array(loaded_dataset)
    # Null cells are replaced by the empty-timestep token before vectorising.
    samples[pd.isnull(samples)] = EMPTY_TIMESTEP_TOKEN
    vectorised_samples = [
        [text_vectorization(timestep) for timestep in sample]
        for sample in tqdm.tqdm(samples, desc="Vectoring multi timestep")
    ]
    return numpy.array(vectorised_samples)

def embed_vectors(text_vectorization):
    """Build an embedding matrix for the vectoriser's vocabulary.

    Reads the vector file at GLOVE_300D_FILEPATH (one token followed by its
    coefficients per line) and copies the vector of every vocabulary word
    that appears in it into the matching row of the matrix.

    Args:
        text_vectorization: Adapted layer exposing ``get_vocabulary()``.
    Returns:
        numpy.ndarray: Matrix of shape (MAX_VOCAB_SIZE, EMBEDDING_DIM); rows
        for words without a pretrained vector stay all-zero.
    """
    embeddings_index = {}

    # Context manager closes the handle even if a line fails to parse.
    # Explicit encoding avoids depending on the platform default
    # (assumes the vector file is UTF-8 - TODO confirm for the file in use).
    with open(GLOVE_300D_FILEPATH, encoding="utf-8") as glove_file:
        for line in tqdm.tqdm(glove_file, ncols=100, desc="Loading Glove Embeddings."):
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = numpy.asarray(values[1:], dtype="float32")

    print(f"Found {len(embeddings_index)} word vectors.")

    vocabulary = text_vectorization.get_vocabulary()
    word_index = {word: position for position, word in enumerate(vocabulary)}
    embedding_matrix = numpy.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM))

    for word, i in tqdm.tqdm(word_index.items(), desc="Embedding Matrix."):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [28]:
# Read the rearranged table and discard its first column before windowing
# (presumably a saved index column - confirm against the file).
time_series_df = pd.read_csv(REARRANGED_DATA_FILEPATH).iloc[:, 1:]
loaded_ds, loaded_labels = generate_windows(time_series_df)

In [None]:
df_test = pd.DataFrame(loaded_ds)

# Hold out the test split first, then carve the validation split out of
# what remains; both splits reuse SEED.
X_train, X_test, y_train, y_test = train_test_split(
    df_test,
    loaded_labels,
    test_size=TEST_TRAIN_SPLIT,
    random_state=SEED,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALIDATION_SPLIT,
    random_state=SEED,
)

In [None]:
print(f"Training data shape:  {X_train.shape, y_train.shape} ")
print(f"Validation data shape:  {X_val.shape, y_val.shape} ")
print(f"Testing data shape:  {X_test.shape, y_test.shape} ")

# Persist the splits as they are at this point, before cleaning/vectorising.
pre_vec_outputs = (
    (X_TRAIN_INPUT_SAVE_FILE_PRE_VEC, X_train),
    (X_TEST_INPUT_SAVE_FILE_PRE_VEC, X_test),
    (X_VAL_INPUT_SAVE_FILE_PRE_VEC, X_val),
)
for save_path, split_frame in pre_vec_outputs:
    with open(save_path, "wb") as f:
        pickle.dump(split_frame, f)

In [None]:
# flat_list_train_corpora is first assigned in a later cell, so evaluating
# its length here raised NameError on Restart & Run All. Inspect the corpus
# size after the cell that builds it instead.

In [None]:
def create_textvectorisation(lst):
    """Create a TextVectorization layer and adapt it to ``lst``.

    The layer splits on whitespace, emits integer token ids, keeps at most
    MAX_VOCAB_SIZE tokens and outputs sequences of MAX_SEQUENCE_LENGTH ids.

    Args:
        lst: Data passed to ``adapt`` to build the vocabulary.
    Returns:
        TextVectorization: The adapted layer.
    """
    vectoriser: TextVectorization = TextVectorization(
        max_tokens=MAX_VOCAB_SIZE,
        output_sequence_length=MAX_SEQUENCE_LENGTH,
        split="whitespace",
        output_mode="int",
    )
    vectoriser.adapt(lst)
    return vectoriser


def clean_df(df):
    """Return a lower-cased, tag-, punctuation- and digit-free copy of ``df``.

    Every column is cast with ``astype(str)`` and lower-cased, then three
    regex substitutions run in order: ``<...>`` tags are removed, each
    non-word character becomes a space, and each token containing a digit
    becomes a space.

    NOTE(review): because of ``astype(str)`` missing values are stringified
    rather than kept as nulls (e.g. NaN -> "nan" on object-dtype pandas) -
    verify this is intended.
    """
    cleaned = df.apply(lambda column: column.astype(str).str.lower())
    substitutions = (
        (r'<[^<>]*>', ''),
        (r'[^\w]', ' '),
        (r'\w*\d\w*', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement, regex=True)
    return cleaned

X_train = clean_df(X_train)
X_val = clean_df(X_val)
X_test = clean_df(X_test)

# Shave off the training corpora for fine tuning glove embeddings with it.
# NOTE(review): column 29 is hard-coded - presumably the last timestep of
# each window (TIME_STEP - 1); confirm against constants.py.
train_corpora = X_train[29].str.split().tolist()
# sorted(set) instead of list(set): string hashing is randomised per
# process, so plain set order would differ between runs despite the seeds.
flat_list_train_corpora = sorted(
    {token for tokens in train_corpora for token in tokens}
)
print(f"Unique training tokens: {len(flat_list_train_corpora)}")
text_vectorization = create_textvectorisation(flat_list_train_corpora)

X_train_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_train)
X_test_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_test)
X_val_vec_ds = vectorize_data_multi_timestep(text_vectorization, X_val)
y_train = numpy.array(y_train)
y_test = numpy.array(y_test)
y_val = numpy.array(y_val)

In [None]:
# The commands below fetch the GloVe archive if it is not already on disk:
#   !wget http://nlp.stanford.edu/data/glove.6B.zip
#   !unzip glove*.zip

vocab = text_vectorization.get_vocabulary()
embedding_matrix = embed_vectors(text_vectorization)

In [None]:
# Report the (features, labels) shapes of each split after vectorisation.
print(f"Training data shape:  {X_train_vec_ds.shape, y_train.shape} ")
print(f"Validation data shape:  {X_val_vec_ds.shape, y_val.shape} ")
print(f"Testing data shape:  {X_test_vec_ds.shape, y_test.shape} ")

In [None]:
# Pickle every artefact produced above, in the same order as before:
# vectorised splits with their labels, then the embedding matrix and vocabulary.
artefacts_to_save = (
    (X_TRAIN_INPUT_SAVE_FILE, X_train_vec_ds),
    (Y_TRAIN_INPUT_SAVE_FILE, y_train),
    (X_TEST_INPUT_SAVE_FILE, X_test_vec_ds),
    (Y_TEST_INPUT_SAVE_FILE, y_test),
    (X_VAL_INPUT_SAVE_FILE, X_val_vec_ds),
    (Y_VAL_INPUT_SAVE_FILE, y_val),
    (EMBEDDING_MATRIX_SAVE_FILE, embedding_matrix),
    (VOCAB_SAVE_FILE, vocab),
)
for save_path, artefact in artefacts_to_save:
    with open(save_path, "wb") as f:
        pickle.dump(artefact, f)


In [None]:
# Persist the de-duplicated training tokens next to the other data files.
# Uses the `os` module, which must be imported in the notebook's import cell
# rather than relied on as a side effect of `%run constants.py`.
TRAIN_CORPORA: str = os.path.join(DATA_DIR, "train_corpora.pkl")
with open(TRAIN_CORPORA, "wb") as f:
    pickle.dump(flat_list_train_corpora, f)