In [None]:
import pickle
import pprint

import numpy as np
import pandas as pd

from configuration import Config, FilePaths

# Switch read further down the notebook: when True, the extracted input
# windows are rewritten by add_time_since_last_vist before being saved.
output_adjusted_timesteps: bool = True
# Project path registry; this notebook reads REARRANGED_DATA and writes
# EHR_STREAM and TARGET_VALUES through it.
FILE_PATHS = FilePaths()

In [None]:
# Load every column as text: later cells compare cell values against string
# timestamps and call str.isdigit() on them.
rearranged_df_as_read = pd.read_csv(
    filepath_or_buffer=FILE_PATHS.REARRANGED_DATA,
    dtype=str,
)

In [None]:
# Derive the working frame from the untouched raw read: index rows by patient
# id and discard every row that contains a missing value.
rearranged_df = rearranged_df_as_read.set_index("Patient Id").dropna()
rearranged_df

In [None]:
def fit_data_to_timesteps(
    df,
    verbose: bool = True,
    time_step=None,
    max_timestep: int = 365,
) -> tuple[list[list], list[bool]]:
    """Cut each patient row into (input window, target flag) samples.

    Every cell whose value is not one of the timestamp strings
    ``"1"`` .. ``str(max_timestep)`` is treated as a visit. For each visit an
    input window of ``time_step`` cells is sliced from the row, followed by a
    target window of ``time_step`` cells that starts one cell after the input
    window ends. The target flag is ``True`` when every cell in the target
    window is a digit string.

    Rows are processed in their positional order, so the output order is
    reproducible between runs and duplicate index labels are handled.

    NOTE(review): the column label is converted with ``int(colname)`` and used
    directly as a positional offset into the row; this assumes column labels
    are integer strings that line up with positions -- TODO confirm.
    NOTE(review): a target of ``True`` means the target window holds only
    digit strings (no visit token); confirm this is the intended polarity.

    Args:
        df (DataFrame): Patient dataframe, one row per patient, with string
            cell values and integer-like column labels.
        verbose (bool, optional): Print progress for every patient and visit.
            Defaults to True. When False the function prints nothing.
        time_step (int, optional): Window length. Defaults to
            ``Config.time_step`` when omitted.
        max_timestep (int, optional): Upper bound used to clamp window
            positions and to build the set of timestamp strings.
            Defaults to 365.

    Returns:
        tuple[list[list], list[bool]]: ``(ehr_stream, target_values)`` where
        ``ehr_stream[i]`` is the (padded) input window of sample ``i`` and
        ``target_values[i]`` is its target flag. Samples whose target window
        is empty are left out of both lists.
    """
    window = int(Config.time_step) if time_step is None else int(time_step)
    # A set gives constant-time membership checks for the per-cell test below.
    time_stamps = {str(day) for day in range(1, max_timestep + 1)}

    def _clamp(position: int) -> int:
        """Restrict a window boundary to the range 0..max_timestep."""
        return max(0, min(position, max_timestep))

    ehr_stream: list[list] = []
    target_values: list[bool] = []

    # Walk rows by position rather than via set(df.index) + get_loc: set order
    # is arbitrary and get_loc does not return an int for duplicated labels.
    for element in range(len(df)):
        row_slice = df.iloc[element]
        if verbose:
            print("-" * 56)
            print(f"Patient: {element}")
            print("-" * 56)

        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for colname, colvalue in row_slice.items():
            if colvalue in time_stamps:
                # Plain timestamp cell, not a visit.
                continue

            input_start = _clamp(int(colname))
            input_end = _clamp(int(colname) + window)
            target_start = _clamp(input_end + 1)
            target_end = _clamp(input_end + 1 + window)

            if verbose:
                print("Visit found")
                print(f"Input - Start Timestep: {input_start}")
                print(f"Input - End timestep: {input_end}")
                print(f"Target - Start timestep: {target_start}")
                print(f"Target - End timestep: {target_end}")

            data_stream = row_slice.iloc[input_start:input_end].to_list()
            target_stream = row_slice.iloc[target_start:target_end].to_list()

            # Pad short input windows up to the window length.
            if len(data_stream) < window:
                data_stream.extend(["<pad>"] * (window - len(data_stream)))
                if verbose:
                    print("Input too short - Timestep padded")

            if verbose:
                print(f"The length of target stream: {len(target_stream)}")

            include = bool(target_stream)
            if include:
                target = all(value.isdigit() for value in target_stream)
            else:
                # No cells left for a target window: drop this sample.
                target = False
                if verbose:
                    print("No valid target time_stream (No values): Exluding from dataset")

            if verbose:
                print(f"Target: {target}")
                print("_" * 56)

            if include:
                ehr_stream.append(data_stream)
                target_values.append(target)

    return ehr_stream, target_values

# Build the per-visit input windows and their boolean targets from the
# cleaned patient frame.
ehr_stream, target_values = fit_data_to_timesteps(df=rearranged_df)


In [None]:
def add_time_since_last_vist(
    time_stream: list[list], verbose: bool = False
) -> list[list]:
    """Rewrite each timestream so digit entries count steps since the last visit.

    Example:
        from ['Visit', '1', '2', '3', 'Visit', '5', '6']
        to   ['Visit', '1', '2', '3', 'Visit', '1', '2']

    Any entry that is not a digit string (a visit token, or a ``"<pad>"``
    marker) is copied through unchanged and resets the counter. Digit entries
    that come before the first non-digit entry are counted from the start of
    the stream, so every output list has the same length as its input list.

    Args:
        time_stream (list[list]): Unadjusted patient timestreams; every inner
            element must be a string.
        verbose (bool, optional): Print a line per processed element.
            Defaults to False.

    Returns:
        list[list]: A new list of adjusted timestreams; the input is not
        modified.
    """
    adjusted_time_stream: list[list] = []
    # Read from the argument itself, not from a notebook-level variable.
    for inner_stream in time_stream:
        adjusted_inner_lst: list = []
        count = 0
        for inner_index, inner_element in enumerate(inner_stream):
            if inner_element.isdigit():
                count += 1
                if verbose:
                    print(f"Times since last visit {count}")
                adjusted_inner_lst.append(str(count))
            else:
                if verbose:
                    print(f"index of {inner_index} is a string, count reset")
                adjusted_inner_lst.append(inner_element)
                count = 0
        adjusted_time_stream.append(adjusted_inner_lst)
    return adjusted_time_stream


# Optionally replace absolute timestamps with "steps since last visit";
# when the switch is off the stream is left exactly as it was.
ehr_stream = (
    add_time_since_last_vist(ehr_stream)
    if output_adjusted_timesteps
    else ehr_stream
)

In [None]:
# Preview the first few samples (index, input window, target). The loop is
# bounded by the amount of data so a small dataset does not raise IndexError.
pp = pprint.PrettyPrinter(indent=4)
for i in range(min(5, len(ehr_stream), len(target_values))):
    pp.pprint(i)
    pp.pprint(ehr_stream[i])
    pp.pprint(target_values[i])

In [None]:
# Convert both Python lists to numpy arrays before they are written to disk.
ehr_stream, target_values = np.array(ehr_stream), np.array(target_values)

In [None]:
# Persist both arrays with pickle, one file per array.
for destination, payload in (
    (FILE_PATHS.EHR_STREAM, ehr_stream),
    (FILE_PATHS.TARGET_VALUES, target_values),
):
    with open(destination, "wb") as f:
        pickle.dump(payload, f)
