
# Read and preprocess clinical data from S3 and store features in SageMaker FeatureStore


In [8]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io, os
from time import gmtime, strftime, sleep
import time
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup


## Set up SageMaker FeatureStore


In [9]:
# Resolve the AWS region from the default boto3 session configuration.
region = boto3.Session().region_name

# One boto3 session, pinned to that region, backs both SageMaker clients:
# the control-plane client and the FeatureStore runtime (record-level) client.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client(
    'sagemaker-featurestore-runtime',
    region_name=region,
)

# SageMaker session that routes FeatureStore calls through the clients above.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Execution role of this notebook; passed as role_arn when the feature group is created.
role = get_execution_role()
s3_client = boto3.client(service_name='s3', region_name=region)

# Bucket and key prefix used for the feature group's S3 location.
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = 'sagemaker-featurestore-demo'


## Get data from S3


In [10]:
# Source bucket for the clinical data (replace with your own bucket name if needed)
bucket_clin = 'multimodal-dataset-clinical-genomic-imaging'
#bucket_clin = <S3-bucket-name>

# Object key of the clinical CSV inside that bucket
data_key_clin = 'final_clinical_df.csv'
#data_key_clin = <file-name.csv>

# Build the s3:// URI and let pandas read the CSV straight from S3.
data_location_clin = f's3://{bucket_clin}/{data_key_clin}'
data_clinical = pd.read_csv(data_location_clin)


## Preprocess Data


In [24]:
# Replace NaN with zeros and cast every column to str.
# Work on a copy so the raw frame loaded from S3 stays untouched and this cell can be re-run.
data_clinical_1 = data_clinical.copy()
data_clinical_1 = data_clinical_1.replace(np.nan, 0)
data_clinical_1 = data_clinical_1.astype(str)

# One entry per prediction target:
# (diagnosis code, label column, condition name used in the printed summary)
#   Alzheimer's disease    -> code '26929004'
#   Coronary heart disease -> code '53741008'
#   Stroke                 -> code '230690007'
#   Hypertension           -> code '59621000'
prediction_targets = [
    ('26929004', 'alzheimers_prediction', "Alzheimer's disease"),
    ('53741008', 'coronary_heart_disease_prediction', 'Coronary Heart disease'),
    ('230690007', 'stroke_prediction', 'Stroke'),
    ('59621000', 'hypertension_prediction', 'Hypertension'),
]

# Initialise every label column to '0' (labels are kept as strings, like the rest of the frame).
data_clinical_1[['alzheimers_prediction', 'coronary_heart_disease_prediction', 'stroke_prediction', 'hypertension_prediction']] = '0'
data_clinical_pred = data_clinical_1.copy()


def parse_diagnosis_codes(raw_codes):
    """Convert one patient's stringified diagnosis codes into a set of code strings.

    Quotes, spaces and curly braces are stripped and the remainder is split on commas.
    """
    cleaned = raw_codes.replace('\'', '').replace(' ', '').replace('{', '').replace('}', '')
    return set(cleaned.split(','))


# Converting all diagnosis codes to a set, one set per patient.
# The sets are kept in a separate Series instead of being written back with
# `df.iloc[i]['col'] = ...`: that chained assignment targets a temporary row object
# and is silently dropped under pandas Copy-on-Write.
diagnosis_code_sets = data_clinical_pred['diagnosisCode'].map(parse_diagnosis_codes)

# Set each label column to '1' where the patient's code set contains the target code
# (exact set membership, not a substring match), then report the count per condition.
for target_code, label_column, condition_name in prediction_targets:
    data_clinical_pred[label_column] = diagnosis_code_sets.map(
        lambda codes: '1' if target_code in codes else '0'
    )
    print(f"Patients with {condition_name} = ", int((data_clinical_pred[label_column] == '1').sum()))

# Delete columns with leakage and features irrelevant for model training
list_delete_cols = ['diagnosisDescription', 'diagnosisCode', 'onsetdatetime', 'name', 'addressline',
       'city', 'state', 'country', 'latitude', 'longitude']
data_clinical_pred = data_clinical_pred.drop(columns=list_delete_cols)

data_clinical_pred.head(10)

Patients with Alzheimer's disease =  19
Patients with Coronary Heart disease =  26
Patients with Stroke =  72
Patients with Hypertension =  47


Unnamed: 0,patientID,diagnosticType,diagnosticCategory,diagnosticResult,gender,maritalstatus,postalcode,deceasedIndicator,deceaseddatetime,conditionId,...,medicationStatus,claim,observationType,observationText,observationDisplay,procedureType,alzheimers_prediction,coronary_heart_disease_prediction,stroke_prediction,hypertension_prediction
0,0074596f-5fd0-7965-db0f-cce71c81567d,"'History and physical note', 'Comprehensive me...","'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,0,0,0.0,0.0,0,0,...,"'stopped', 'active'",'claim',"""The ZPR1 gene exhibits a variation of 'Uncert...","None, 'Never smoker', 'Pneumonia'","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,0,0,0,0
1,0618424e-ed51-3100-ea5c-e46492bfd65b,'Complete blood count (hemogram) panel - Blood...,"'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,0,0,0.0,0.0,0,0,...,"'stopped', 'active'",'claim','Bilirubin.total [Mass/volume] in Serum or Pla...,"None, 'Never smoker'","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,0,0,0,0
2,06cc033a-f09a-0fb2-4a1a-4c4d99d88839,"'History and physical note', 'Comprehensive me...","'Laboratory', 'History and physical note'","'Glucose [Mass/volume] in Serum or Plasma', No...",female,M,1225.0,0.0,'1995-06-25T00:05:14-04:00',"'c7804c24-34e0-7321-f78e-16b84edd2912', '64613...",...,"'stopped', 'active'",'claim','Bilirubin.total [Mass/volume] in Serum or Pla...,"None, 'Edema (finding)', 'Rales (finding)', 'C...","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,1,0,1,1
3,06d43bcb-8322-3c2b-40f1-189b2852b5ab,'Complete blood count (hemogram) panel - Blood...,"'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,0,0,0.0,0.0,0,0,...,"'stopped', 'active'",'claim',"""The ZPR1 gene exhibits a variation of 'Uncert...","None, 'Never smoker'","None, 'Diastolic Blood Pressure'","'Echocardiography (procedure)', 'Catheter abla...",0,0,0,0
4,08acefb1-271e-01d2-1bb6-aae9f65dbb42,"'History and physical note', 'Comprehensive me...","'Laboratory', 'History and physical note'","'Glucose', None, 'Total Cholesterol', 'Leukocy...",male,M,1904.0,0.0,0,"'1addef80-5f3a-c94a-46c0-41609ba0e62a', '8d35d...",...,"'stopped', 'active'",'claim','Bilirubin.total [Mass/volume] in Serum or Pla...,"None, 'Never smoker'","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,0,0,1,0
5,0b73e9c9-18c0-5c0e-c5d5-95d7ca56c050,"'Urinalysis macro (dipstick) panel - Urine', '...","'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,male,M,1748.0,0.0,'2008-07-12T05:39:36-04:00',"'afcc1b8f-fcb4-d943-ed1f-760bc738c6d6', '65b21...",...,"'stopped', 'active'",'claim',"""The CDKN2B, CDKN2B-AS1 gene exhibits a variat...","None, 'Never smoker', 'Prostate enlarged on PR...","None, 'Diastolic Blood Pressure'","'Injection of adrenaline', 'Bone density scan ...",1,0,1,1
6,0bb4b1f7-4128-89ef-a023-b81379c30f4f,'Complete blood count (hemogram) panel - Blood...,"'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,0,0,0.0,0.0,0,0,...,"'stopped', 'active'",'claim',"'Body Mass Index', 'Total Cholesterol', 'Plate...","None, 'Never smoker'","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,0,0,0,0
7,0c8ad7c0-b403-98d1-d6c9-ff1dbbce7d7c,"'History and physical note', 'Genetic analysis...","'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,female,M,1906.0,0.0,0,"'3ce0871e-4ce5-ef2c-88e0-028b755169c7', 'b926c...",...,"'stopped', 'active'",'claim',"""The ZPR1 gene exhibits a variation of 'Uncert...","None, 'Former smoker'","None, 'Diastolic Blood Pressure'","'Echocardiography (procedure)', 'CT of head (p...",0,0,1,1
8,0f1c1042-f103-e85b-1a7a-03442861947d,"'History and physical note', 'Comprehensive me...","'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,0,0,0.0,0.0,0,0,...,"'stopped', 'active'",'claim','Bilirubin.total [Mass/volume] in Serum or Pla...,"None, 'Edema (finding)', 'Rales (finding)', 'D...","None, 'Diastolic Blood Pressure'",'Measurement of respiratory function (procedur...,0,0,0,0
9,14e884a7-dc93-7909-9585-2857c6eb29e2,'Complete blood count (hemogram) panel - Blood...,"'Laboratory', 'History and physical note'",'Leukocytes [#/volume] in Blood by Automated c...,male,M,1701.0,0.0,0,"'ef306a5b-23fb-7fee-dffb-352eb839b68d', '203f5...",...,"'stopped', 'active'",'claim',"'Body Mass Index', 'Total Cholesterol', 'Plate...","None, 'Never smoker'","None, 'Diastolic Blood Pressure'","'Echocardiography (procedure)', 'Percutaneous ...",0,0,1,1



## Ingest data into FeatureStore


In [20]:
# Name of the feature group that will hold the clinical features
clinical_feature_group_name = 'clinical-feature-group'
clinical_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=clinical_feature_group_name,
)

# Current wall-clock time in whole seconds since the epoch; used below as the EventTime value.
current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas "string" dtype, in place.

    Each column label is printed as it is visited. Columns whose dtype is not
    'object' are left unchanged. Nothing is returned; the frame is modified directly.
    """
    for label in list(data_frame.columns):
        print (label)
        is_object_column = data_frame.dtypes[label] == 'object'
        if not is_object_column:
            continue
        # Go through plain str first, then to the pandas "string" extension dtype.
        as_text = data_frame[label].astype("str")
        data_frame[label] = as_text.astype("string")

# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(data_clinical_pred)

# Record identifier and event time feature names
record_identifier_feature_name = "patientID"
event_time_feature_name = "EventTime"

# Append EventTime feature.
# The scalar is broadcast to every row as float64. Building a separate pd.Series here
# would be aligned on index labels and produce NaN for any row whose index label is
# not in 0..n-1, which then needed a fillna(0) that stamped those rows with epoch 0.
data_clinical_pred[event_time_feature_name] = float(current_time_sec)

# Load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
clinical_feature_group.load_feature_definitions(data_frame=data_clinical_pred); # output is suppressed


def wait_for_feature_group_creation_complete(feature_group, poll_interval_seconds=15):
    """Block until ``feature_group`` leaves the "Creating" status.

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping with a
            "FeatureGroupStatus" entry) and a ``name`` attribute.
        poll_interval_seconds: Seconds to sleep between status checks. Defaults to 15.

    Raises:
        RuntimeError: If the status reached after "Creating" is anything other
            than "Created".
    """
    while True:
        status = feature_group.describe().get("FeatureGroupStatus")
        if status != "Creating":
            break
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_seconds)

    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Create the feature group, using the default bucket/prefix as its S3 location
# and enabling the online store.
offline_store_s3_uri = f"s3://{default_s3_bucket_name}/{prefix}"
clinical_feature_group.create(
    s3_uri=offline_store_s3_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

# Creation is asynchronous: wait until the group reports "Created" before ingesting.
wait_for_feature_group_creation_complete(feature_group=clinical_feature_group)

# Ingest the preprocessed frame with 5 workers and wait for completion.
clinical_feature_group.ingest(data_frame=data_clinical_pred, max_workers=5, wait=True)

patientID
diagnosticType
diagnosticCategory
diagnosticResult
gender
maritalstatus
postalcode
deceasedIndicator
deceaseddatetime
conditionId
categoryCode
categoryDescription
vaccinations
imageCode
imagingName
imageBodyPart
imageTitle
encounterCode
encounterReason
encounterDiagnosis
hospitalizationStatus
medicationDisplay
medicationDate
medicationStatus
claim
observationType
observationText
observationDisplay
procedureType
alzheimers_prediction
coronary_heart_disease_prediction
stroke_prediction
hypertension_prediction
EventTime
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup clinical-feature-group successfully created.


IngestionManagerPandas(feature_group_name='clinical-feature-group', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7ff8ae2ffa50>, sagemaker_session=<sagemaker.session.Session object at 0x7ff8b087b2d0>, max_workers=5, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7ff8ac961210>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])