
# Read and preprocess imaging data from S3 and store features in SageMaker FeatureStore


In [33]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io, os
from time import gmtime, strftime, sleep
import time
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup


## Set up SageMaker FeatureStore


In [43]:
region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

role = get_execution_role()
s3_client = boto3.client('s3', region_name=region)

default_s3_bucket_name = feature_store_session.default_bucket()
prefix = 'sagemaker-featurestore-demo'


## Get data from S3


In [44]:
# Get data from S3 
bucket_imag = 'guidance-multimodal-hcls-healthai-machinelearning/preprocessing'
#bucket_imag = <S3-bucket-name>

# imaging data 
data_key_imag = 'final_imaging_df.csv'
#data_key_imag = <file-name.csv>

data_location_imag = 's3://{}/{}'.format(bucket_imag, data_key_imag)
data_imaging = pd.read_csv(data_location_imag)


## Preprocess Data


In [54]:
# Replacing NaN with zeros
data_imaging_1 = data_imaging.copy()
data_imaging_1 = data_imaging_1.replace(np.nan, 0)
data_imaging_1 = data_imaging_1.astype(str)

#Converting all diagnosis codes to a set
data_imaging_1[['alzheimers_prediction', 'coronary_heart_disease_prediction', 'stroke_prediction', 'hypertension_prediction']] = '0'
data_imaging_pred = data_imaging_1.copy()
for i in range(len(data_imaging_1)):
    data_imaging_pred.iloc[i]['diagnosisCode'] = set(data_imaging_pred.iloc[i]['diagnosisCode'].replace('\'', '').replace(' ', '').replace('{', '').replace('}', '').split(','))

# Adding a column for prediction of Alzheimer's disease code '26929004'
# Adding a column for prediction of Coronary heart disease '53741008'
# Adding a column for prediction of Stroke code '230690007'
# Adding a column for prediction of Hypertension code '59621000'
for i in range(len(data_imaging_pred)):
    if "26929004" in (data_imaging_pred.iloc[i]['diagnosisCode']):
        data_imaging_pred.iloc[i]['alzheimers_prediction']  =  '1'
    if "53741008" in (data_imaging_pred.iloc[i]['diagnosisCode']):
        data_imaging_pred.iloc[i]['coronary_heart_disease_prediction']  =  '1'
    if "230690007" in (data_imaging_pred.iloc[i]['diagnosisCode']):
        data_imaging_pred.iloc[i]['stroke_prediction']  =  '1'
    if "59621000" in (data_imaging_pred.iloc[i]['diagnosisCode']):
        data_imaging_pred.iloc[i]['hypertension_prediction']  =  '1'
print("Patients with Alzheimer's disease = ", len(data_imaging_pred[data_imaging_pred['alzheimers_prediction']=='1']))
print("Patients with Coronary Heart disease = ", len(data_imaging_pred[data_imaging_pred['coronary_heart_disease_prediction']=='1']))
print("Patients with Stroke = ", len(data_imaging_pred[data_imaging_pred['stroke_prediction']=='1']))
print("Patients with Hypertension = ", len(data_imaging_pred[data_imaging_pred['hypertension_prediction']=='1']))

# Delete columns with leakage and features irrelevant for model training
list_delete_cols = ['diagnosisDescription', 'diagnosisCode']
data_imaging_pred.drop(list_delete_cols, axis=1, inplace=True)

data_imaging_pred.head(10)

Patients with Alzheimer's disease =  19
Patients with Coronary Heart disease =  26
Patients with Stroke =  72
Patients with Hypertension =  47


Unnamed: 0,patientID,original_shape_elongation,original_shape_flatness,original_shape_leastaxislength,original_shape_majoraxislength,original_shape_maximum2ddiametercolumn,original_shape_maximum2ddiameterrow,original_shape_maximum2ddiameterslice,original_shape_maximum3ddiameter,original_shape_meshvolume,...,original_ngtdm_busyness,original_ngtdm_coarseness,original_ngtdm_complexity,original_ngtdm_contrast,original_ngtdm_strength,imagesetid,alzheimers_prediction,coronary_heart_disease_prediction,stroke_prediction,hypertension_prediction
0,0074596f-5fd0-7965-db0f-cce71c81567d,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.29295165489716,4.356913065871555e-07,146182262.74738657,2.3802635598883857,8.547276310306838,'d7298fe7dde4537b8343b5a702979aea',0,0,0,0
1,0618424e-ed51-3100-ea5c-e46492bfd65b,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.138414157730995,4.34244947550801e-07,159175557.71662363,2.321636166760196,9.241749116927542,'e5e2ccf0487eed5523395431675fa708',0,0,0,0
2,06cc033a-f09a-0fb2-4a1a-4c4d99d88839,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.2506976152974696,4.342879189391609e-07,153429911.60439494,2.358067515051934,9.02168678253902,'1a3c6b0044e2b58a13c106896da369ef',1,0,1,1
3,06d43bcb-8322-3c2b-40f1-189b2852b5ab,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.155856370322871,4.35602879437664e-07,162738834.5359922,2.6380655940946416,8.776968231552623,'41b14124cdd7f0ea3d459bcfe6120e7f',0,0,0,0
4,08acefb1-271e-01d2-1bb6-aae9f65dbb42,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.212123075939246,4.33677886596948e-07,149601209.25392962,2.4127886848959914,8.601098896888011,'dd9a6771bf229a91f03f608c336141de',0,0,1,0
5,0b73e9c9-18c0-5c0e-c5d5-95d7ca56c050,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.214091567167489,4.332679672921037e-07,154658175.41870308,2.3328494894814447,9.106481386718237,'1526fea901654ad5e45f7d5f22377f81',1,0,1,1
6,0bb4b1f7-4128-89ef-a023-b81379c30f4f,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.054464301296742,4.337451414359168e-07,169394497.10612768,2.4605886148613787,9.38894545484775,'b94c9e7c782da3660ea6a905e2a91b34',0,0,0,0
7,0c8ad7c0-b403-98d1-d6c9-ff1dbbce7d7c,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.20435461617733,4.322075817165929e-07,156887627.70678127,2.354916905909733,9.202929567908852,'bccc8f07e6b97098dbb92cbafef2949c',0,0,1,1
8,0f1c1042-f103-e85b-1a7a-03442861947d,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.4330931656659374,4.31944012088205e-07,146534323.52444685,1.5755890861565278,11.269159030234578,'df83b4160e70a45bee0af99f9785d1eb',0,0,0,0
9,14e884a7-dc93-7909-9585-2857c6eb29e2,1.0,1.0,295.6010825419961,295.6010825419961,361.33225707096784,361.33225707096784,361.33225707096784,442.2510599195891,16776832.666666666,...,2.2534431526907373,4.3244561806382613e-07,149554805.76866412,2.346621297103984,8.875513496551257,'20e77ca11e2b601b8ff9b9743efc1014',0,0,1,1



## Ingest data into FeatureStore


In [52]:
imaging_feature_group_name = 'imaging-feature-group' 
imaging_feature_group = FeatureGroup(name=imaging_feature_group_name, sagemaker_session=feature_store_session)

current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        print (label)
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype("str").astype("string")

# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(data_imaging_pred)

# Record identifier and event time feature names
record_identifier_feature_name = "patientID"
event_time_feature_name = "EventTime"

# Append EventTime feature
data_imaging_pred[event_time_feature_name] = pd.Series([current_time_sec]*len(data_imaging_pred), dtype="float64")

## If event time generates NaN
data_imaging_pred[event_time_feature_name] = data_imaging_pred[event_time_feature_name].fillna(0)

# Load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
imaging_feature_group.load_feature_definitions(data_frame=data_imaging_pred); # output is suppressed


def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(15)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

imaging_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True
)

wait_for_feature_group_creation_complete(feature_group=imaging_feature_group)

imaging_feature_group.ingest(
    data_frame=data_imaging_pred, max_workers=5, wait=True
)

patientID
original_shape_elongation
original_shape_flatness
original_shape_leastaxislength
original_shape_majoraxislength
original_shape_maximum2ddiametercolumn
original_shape_maximum2ddiameterrow
original_shape_maximum2ddiameterslice
original_shape_maximum3ddiameter
original_shape_meshvolume
original_shape_minoraxislength
original_shape_sphericity
original_shape_surfacearea
original_shape_surfacevolumeratio
original_shape_voxelvolume
original_firstorder_10percentile
original_firstorder_90percentile
original_firstorder_energy
original_firstorder_entropy
original_firstorder_interquartilerange
original_firstorder_kurtosis
original_firstorder_maximum
original_firstorder_meanabsolutedeviation
original_firstorder_mean
original_firstorder_median
original_firstorder_minimum
original_firstorder_range
original_firstorder_robustmeanabsolutedeviation
original_firstorder_rootmeansquared
original_firstorder_skewness
original_firstorder_totalenergy
original_firstorder_uniformity
original_firstorder_



Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup imaging-feature-group successfully created.


IngestionManagerPandas(feature_group_name='imaging-feature-group', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f72e6f92e10>, sagemaker_session=<sagemaker.session.Session object at 0x7f72e7f1f410>, max_workers=5, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f72f4643dd0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])