<a href="https://colab.research.google.com/github/anhle/AI-Healthcare/blob/master/AI_EHR/Ex/Lesson_3_lesson_concepts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is the code for walking through the lesson examples, provided for your reference.

## Code for Building Synthetic Dataset

In [0]:
import pandas as pd
import numpy as np

In [0]:
#build synthetic line level example
# Sizes of the synthetic dataset: line records, distinct encounters, distinct patients.
NUMBER_RECORDS = 100000
NUMBER_ENCOUNTERS = 7800
NUMBER_PATIENTS = 1000

# Seed NumPy's global RNG so that the cells below, which all draw from
# np.random, produce the same values on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

In [0]:
# Synthetic code sets for diagnosis, procedure, medication, and lab codes.
# Diagnosis codes are numbered 1..99999; the other code sets are numbered from 0.
dx_code_list = ["dx_code_{}".format(i) for i in range(1, 100000)]
procedure_code_list = ["procedure_code_{}".format(i) for i in range(73000)]
medication_code_list = ["medication_code_{}".format(i) for i in range(10000)]
lab_code_list = ["lab_code_{}".format(i) for i in range(10000)]

In [0]:
# One identifier per synthetic patient / encounter, numbered from 1.
patient_id_list = ["udacity_health_patient_id_{}".format(i) for i in range(1, NUMBER_PATIENTS + 1)]
encounter_id_list = ["udacity_health_encounter_id_{}".format(i) for i in range(1, NUMBER_ENCOUNTERS + 1)]

In [0]:
def random_value_selection(field_value_list, number_records):
    """Draw ``number_records`` values from ``field_value_list`` with random weights.

    The per-value sampling probabilities are themselves random: a single draw
    from a flat Dirichlet distribution (every concentration parameter is 1)
    over the candidate values.

    Returns the array produced by ``np.random.choice``.
    """
    concentration = np.ones(len(field_value_list))
    # size=1 gives a (1, k) array; keep its only row as the probability vector
    sampling_weights = np.random.dirichlet(concentration, size=1)[0]
    return np.random.choice(field_value_list, number_records, p=sampling_weights)

In [0]:
#patient_values = random_value_selection(patient_id_list, NUMBER_RECORDS)
encounter_values = random_value_selection(encounter_id_list, NUMBER_RECORDS)

In [0]:
# Assign every encounter to one randomly chosen patient, then look up the
# patient of each generated line record through its encounter.
patients_for_encounters = random_value_selection(patient_id_list, NUMBER_ENCOUNTERS)
encounter_patient_mapping = dict(zip(encounter_id_list, patients_for_encounters))
patient_values = [encounter_patient_mapping[encounter_id] for encounter_id in encounter_values]

In [0]:
# Give every encounter one randomly chosen diagnosis code, then propagate
# that code to each line record of the encounter.
dx_for_encounters = random_value_selection(dx_code_list, NUMBER_ENCOUNTERS)
dx_value_mapping = dict(zip(encounter_id_list, dx_for_encounters))
dx_values = [dx_value_mapping[encounter_id] for encounter_id in encounter_values]

In [0]:
procedure_values = random_value_selection(procedure_code_list, NUMBER_RECORDS)
medication_values = random_value_selection(medication_code_list, NUMBER_RECORDS)
lab_values = random_value_selection(lab_code_list, NUMBER_RECORDS)

In [0]:
# Each line record keeps exactly one of its procedure / medication / lab codes.
# The slot to keep (0 = procedure, 1 = medication, 2 = lab) is drawn with
# random Dirichlet-distributed probabilities; the other two slots become NaN.
triplet_slot_probs = np.random.dirichlet(np.ones(3), size=1)[0]
triplet_prob_choice = np.random.choice([0, 1, 2], NUMBER_RECORDS, p=triplet_slot_probs)
line_triplet_values = list(zip(procedure_values, medication_values, lab_values, triplet_prob_choice))
selected_procedure_values = [
    proc if slot == 0 else np.nan for proc, _med, _lab, slot in line_triplet_values
]
selected_medication_values = [
    med if slot == 1 else np.nan for _proc, med, _lab, slot in line_triplet_values
]
selected_lab_values = [
    lab if slot == 2 else np.nan for _proc, _med, lab, slot in line_triplet_values
]

In [0]:
# Add label: one binary label per patient (label 1 is drawn with probability
# 0.12), propagated to every line record that belongs to the patient.
patient_labels = np.random.choice([0, 1], NUMBER_PATIENTS, replace=True, p=[0.88, 0.12])
patient_label_mapping = dict(zip(patient_id_list, patient_labels))
label_values = [patient_label_mapping[patient_id] for patient_id in patient_values]

In [0]:
line_df = pd.DataFrame({ "ENCOUNTER_ID": encounter_values,
                        "PATIENT_ID": patient_values,
                        "PRINCIPAL_DIAGNOSIS_CODE": dx_values,
                        "PROCEDURE_CODE": selected_procedure_values,
                        "MEDICATION_CODE": selected_medication_values,
                        "LAB_CODE": selected_lab_values,
                        "LABEL": label_values
                       })

In [0]:
#line_df.to_csv("./data/SYNTHETIC_EHR_DATASET.csv", index=False)

## 1. Converting Line to Encounter Representation

### Load Synthetic EHR Line Dataset

In [0]:
ehr_line_df = pd.read_csv("https://raw.githubusercontent.com/anhle/AI-Healthcare/master/AI_EHR/Ex/data/SYNTHETIC_EHR_DATASET.csv")

In [0]:
ehr_line_df.head()

In [0]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_100']

In [0]:
# Note: this dataset is for illustrative purposes only and for practicing key skills;
# the data representation and the combinations of codes are not indicative of the real thing.

### Convert Line to Encounter Representation

In [0]:
# grouping fields 
grouping_field_list = ['ENCOUNTER_ID', 'PATIENT_ID', 'PRINCIPAL_DIAGNOSIS_CODE']
non_grouped_field_list = [c for c in ehr_line_df.columns if c not in grouping_field_list]

Create a new dataframe that groups the line-level data by the encounter-level fields ("ENCOUNTER_ID", "PATIENT_ID", and "PRINCIPAL_DIAGNOSIS_CODE"). Again you can use the groupby() and agg() methods.

In [0]:
# Collapse the line-level rows into one row per encounter: every non-grouping
# column becomes the list of that encounter's non-missing values.
# pd.notna() replaces the identity check `y is not np.nan`, which only filters
# the np.nan singleton itself; a NaN that is a separate float object (e.g. a
# value iterated out of a float column) would slip through the identity check.
encounter_df = (
    ehr_line_df
    .groupby(grouping_field_list)[non_grouped_field_list]
    .agg(lambda x: [y for y in x if pd.notna(y)])
    .reset_index()
)

In [0]:
encounter_df[0:5]

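Inspect and compare the data again by selecting a single encounter (the cells below filter on "ENCOUNTER_ID") and comparing the "encounter", "principal_diagnosis", and "procedure_codes" fields between the line-level and encounter-level dataframes. You should see the non-grouping codes represented as arrays/lists for each encounter.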

In [0]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

In [0]:
encounter_df[encounter_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

## 2. Converting Encounter to Longitudinal Representation

In [0]:
encounter_df.head()

In [0]:
patient_grouping_field_list = ["PATIENT_ID"]
non_patient_agg_field_list = [c for c in encounter_df.columns if c not in patient_grouping_field_list]

In [0]:
# Roll the encounter-level rows up to one row per patient: each remaining
# column becomes the list of that patient's per-encounter values.
long_df = (
    encounter_df
    .groupby(patient_grouping_field_list)[non_patient_agg_field_list]
    .agg(lambda col: [item for item in col if item is not np.nan])
    .reset_index()
)

In [0]:
long_df.head()

In [0]:
example_patient_history = long_df[long_df['PATIENT_ID']=='udacity_health_patient_id_310']

In [0]:
example_patient_history

In [0]:
list(example_patient_history['ENCOUNTER_ID'].values)

In [0]:
list(example_patient_history['PRINCIPAL_DIAGNOSIS_CODE'].values)

In [0]:
list(example_patient_history['PROCEDURE_CODE'].values)

## 3. How to Split Dataset at Patient Level

#### ***Objective:*** 
- Split dataset at patient level into train and test partitions
- Validate that split was done correctly

#### Dataset Splitting Tests
- Patient data in only one partition
- Total unique number of patients across all partitions = total number unique patients in original full dataset
- Total number of rows original dataset = sum of rows across splits

In [0]:
PATIENT_ID_FIELD = 'PATIENT_ID'
TEST_PERCENTAGE = 0.2

In [0]:
def split_dataset_patient_level(df, key, test_percentage=0.2, random_state=None):
    """Split ``df`` into train and test partitions without splitting any ``key`` value.

    Rows are shuffled, the distinct values of ``df[key]`` are taken in their
    shuffled order, and the first ``round(n_unique * (1 - test_percentage))``
    of them go to train. Every row follows its key, so all rows sharing a key
    end up in the same partition.

    Args:
        df: DataFrame to split.
        key: Name of the column whose values must not be shared across partitions.
        test_percentage: Fraction (0 to 1) of distinct key values assigned to test.
        random_state: Optional seed for a reproducible shuffle. When None,
            NumPy's global random state is used.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``test_percentage`` is outside the range [0, 1].
    """
    # Out-of-range fractions would make sample_size negative or too large,
    # and slicing would then silently produce a meaningless split.
    if not 0 <= test_percentage <= 1:
        raise ValueError(
            "test_percentage must be between 0 and 1, got {}".format(test_percentage)
        )

    if random_state is None:
        row_order = np.random.permutation(len(df))
    else:
        row_order = np.random.RandomState(random_state).permutation(len(df))
    df = df.iloc[row_order]

    unique_values = df[key].unique()
    sample_size = round(len(unique_values) * (1 - test_percentage))

    # Test rows are the complement of the train rows, so every row lands in
    # exactly one partition.
    in_train = df[key].isin(unique_values[:sample_size])
    train = df[in_train].reset_index(drop=True)
    test = df[~in_train].reset_index(drop=True)
    return train, test

In [0]:
train_df, test_df = split_dataset_patient_level(encounter_df, PATIENT_ID_FIELD, TEST_PERCENTAGE)

In [0]:
assert len(set(train_df[PATIENT_ID_FIELD].unique()).intersection(set(test_df[PATIENT_ID_FIELD].unique()))) == 0
print("Test passed for patient data in only one partition")

In [0]:
assert (train_df[PATIENT_ID_FIELD].nunique()  + test_df[PATIENT_ID_FIELD].nunique()) == encounter_df[PATIENT_ID_FIELD].nunique()
print("Test passed for number of unique patients being equal!")

In [0]:
assert len(train_df)  + len(test_df) == len(encounter_df)
print("Test passed for number of total rows equal!")

## 4. ETL with TF Dataset API and Pandas

NOTE: In some cases you may need to preprocess the Pandas DataFrame to remove mixed types. In particular, handle null values by either imputing them or removing the affected rows (we will later impute with zero for numerical features).

In [0]:
import tensorflow as tf

In [0]:
swiss_dataset_path = "./data/processed_swiss.csv"
swiss_df = pd.read_csv(swiss_dataset_path)
selected_col_list = ['age', 'thalach', 'cp', 'num_label']
subset_swiss_df = swiss_df[selected_col_list]

In [0]:
swiss_df.head()

In [0]:
subset_swiss_df.head()

In [0]:
#adapted from https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(df, predictor,  batch_size=32):
    """Turn a pandas DataFrame into a shuffled, batched ``tf.data.Dataset``.

    The column named by ``predictor`` is split off as the label; the remaining
    columns are passed as a dict of features. The input frame is copied first,
    so the caller's DataFrame is left unchanged.

    Returns a dataset of ``(features, labels)`` batches of size ``batch_size``,
    shuffled with a buffer as large as the frame.
    """
    features = df.copy()
    labels = features.pop(predictor)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [0]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'num_label'
sample_tf_ds = df_to_dataset(subset_swiss_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

In [0]:
sample_feature_batch = next(iter(sample_tf_ds))[0]
sample_feature_batch

In [0]:
# Take the label part of the first batch from a new iterator over the dataset.
# NOTE(review): df_to_dataset shuffles the dataset, so this second
# next(iter(...)) presumably yields a different batch than the one used for
# sample_feature_batch -- confirm before treating the two as a matched pair.
sample_label_batch = next(iter(sample_tf_ds))[1]
sample_label_batch

## 5. Building Numerical Feature with TF Feature Column API

In [0]:
subset_swiss_df.head()

In [0]:
# Summary statistics of the age column; the mean and standard deviation are
# reported here and kept as age_mean / age_std.
age_summary = subset_swiss_df['age'].describe()
age_mean, age_std = age_summary['mean'], age_summary['std']
print("Mean age:{}\nStandard Deviation Age:{}".format(age_mean, age_std))

In [0]:
import functools
def normalize_numeric_with_zscore(col, mean, std):
    """Return the z-score of ``col``: its offset from ``mean`` in units of ``std``."""
    centered = col - mean
    return centered / std

def create_tf_numeric_feature(col, MEAN, STD,   default_value=0):
    """Build a TF numeric feature column for ``col`` that z-score normalizes its input.

    ``MEAN`` and ``STD`` are bound into ``normalize_numeric_with_zscore`` and
    passed as the column's ``normalizer_fn``; ``default_value`` is forwarded
    unchanged. The column is declared with dtype ``tf.float64``.
    """
    zscore_fn = functools.partial(normalize_numeric_with_zscore, mean=MEAN, std=STD)
    return tf.feature_column.numeric_column(
        key=col,
        default_value=default_value,
        normalizer_fn=zscore_fn,
        dtype=tf.float64,
    )

In [0]:
age_tf_feature = create_tf_numeric_feature('age', age_mean, age_std)

In [0]:
def demo(feature_column, example_batch):
    """Print the output of a ``DenseFeatures`` layer built from ``feature_column`` and applied to ``example_batch``."""
    dense_output = tf.keras.layers.DenseFeatures(feature_column)(example_batch)
    print(dense_output)

In [0]:
print("Example continuous field:\n{}\n".format(age_tf_feature))
demo(age_tf_feature, sample_feature_batch)

## 6. Building Categorical Features with TF Feature Column API

In [0]:
# Take an explicit copy: the LABEL column of this frame is reassigned in a
# later cell, and assigning into a plain column selection of encounter_df
# triggers pandas' SettingWithCopyWarning.
categorical_example_df = encounter_df[['ENCOUNTER_ID', 'PRINCIPAL_DIAGNOSIS_CODE', 'LABEL']].copy()

In [0]:
# for this task need to convert label from array to scalar value
categorical_example_df['LABEL'] = categorical_example_df['LABEL'].apply(lambda x: np.unique(x)[0])

In [0]:
categorical_example_df.head()

### High Cardinality for Principal Diagnosis Code

In [0]:
categorical_example_df['PRINCIPAL_DIAGNOSIS_CODE'].nunique()

### Generate Vocabulary File

In [0]:
#make vocab dir
import os
#os.mkdir("./vocab/")

In [0]:
# build vocab for categorical features
def write_vocabulary_file(vocab_list, field_name, default_value, vocab_dir='./vocab/'):
    """Write a one-term-per-line vocabulary file for ``field_name`` and return its path.

    The file is named ``<field_name>_vocab.txt`` inside ``vocab_dir`` and holds
    ``default_value`` on the first line followed by the entries of
    ``vocab_list`` in their given order.

    Args:
        vocab_list: Iterable of vocabulary terms (e.g. the result of ``Series.unique()``).
        field_name: Field the vocabulary belongs to; used to build the file name.
        default_value: Term written as the first row of the file.
        vocab_dir: Directory for the vocabulary file; created if it does not exist.

    Returns:
        The path of the written vocabulary file.
    """
    # Create the target directory on demand so the function also works on a
    # fresh kernel where ./vocab/ has not been created yet.
    if vocab_dir:
        os.makedirs(vocab_dir, exist_ok=True)
    output_file_path = os.path.join(vocab_dir, str(field_name) + "_vocab.txt")
    # Put the default value in the first row. It is prepended through a Python
    # list rather than np.insert(), which casts the inserted value to the
    # array's dtype (a fixed-width string array would truncate "00" to "0").
    vocab_terms = [default_value] + list(vocab_list)
    pd.DataFrame(vocab_terms).to_csv(output_file_path, index=False, header=False)
    return output_file_path

def build_vocab_files(df, categorical_column_list, default_value='00'):
    """Write one vocabulary file per categorical column and return the file paths.

    For each column in ``categorical_column_list`` the column's unique values
    are passed to ``write_vocabulary_file`` together with ``default_value``.
    The returned list of paths is in the same order as the columns.
    """
    return [
        write_vocabulary_file(df[column].unique(), column, default_value)
        for column in categorical_column_list
    ]

In [0]:
categorical_field_list = ["PRINCIPAL_DIAGNOSIS_CODE"]
vocab_files_list = build_vocab_files(categorical_example_df, categorical_field_list)

### Build TF Dataset from Pandas Dataframe

In [0]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'LABEL'
categorical_tf_ds = df_to_dataset(categorical_example_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

### Use TF Feature Column API to read from vocab file

In [0]:
vocab_files_list[0]

In [0]:
principal_diagnosis_vocab = tf.feature_column.categorical_column_with_vocabulary_file(
            key="PRINCIPAL_DIAGNOSIS_CODE", vocabulary_file = vocab_files_list[0], num_oov_buckets=1)

### Create one-hot encoding  from vocab column feature function

In [0]:
one_hot_principal_diagnosis_feature = tf.feature_column.indicator_column(principal_diagnosis_vocab)

In [0]:
categorical_tf_ds_batch = next(iter(categorical_tf_ds))[0]

In [0]:
demo(one_hot_principal_diagnosis_feature, categorical_tf_ds_batch)