In [2]:
%%sh
pip install  -q --upgrade pip
pip install -q --upgrade boto3 botocore
pip install -q tqdm nibabel pydicom numpy pathlib2 pylibjpeg-openjpeg
pip install -q "itk>=5.3rc4" "monai-weekly[nibabel, matplotlib, tqdm]"
pip install -q SimpleITK
pip install -q pyradiomics

In [3]:
import pydicom
import json
import uuid
import logging
import importlib  
import boto3
from time import gmtime, strftime
from openjpeg import decode
import io
import sys
import time
import os
from src.Api import MedicalImaging 
# import itk
from botocore.exceptions import ClientError
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# Notebook-wide logging level; use level="DEBUG" for more verbose output.
logging.basicConfig(level="INFO")

# Shared clients and sessions used by the cells below.
medicalimaging = MedicalImaging()
s3 = boto3.client('s3')
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
boto_session = boto3.Session(region_name=region)

account_id = boto3.client('sts').get_caller_identity().get('Account')
# Reuse the session created above instead of constructing a second one.
bucket = sagemaker_session.default_bucket()
print(f"S3 Bucket is {bucket}")
print(f"IAM role is {role}")

In [4]:
DatastoreName = "CoherentDataStoreWorkshop"
datastoreList = medicalimaging.listDatastores()

# Statuses from which a data store can never become ACTIVE; waiting on them would hang.
DATASTORE_DEAD_STATUSES = ('CREATE_FAILED', 'DELETING', 'DELETED')

# Reuse a data store with the requested name unless it is failed or being deleted;
# otherwise create a new one.
res_createstore = next(
    (
        datastore
        for datastore in datastoreList["datastoreSummaries"]
        if datastore["datastoreName"] == DatastoreName
        and datastore.get("datastoreStatus") not in DATASTORE_DEAD_STATUSES
    ),
    None,
)
if res_createstore is None:
    res_createstore = medicalimaging.createDatastore(DatastoreName)

datastoreId = res_createstore['datastoreId']

# Poll every 30 seconds until the data store is ACTIVE, failing fast on a dead status.
res_getstore = medicalimaging.getDatastore(datastoreId)
status = res_getstore['datastoreProperties']['datastoreStatus']
while status != 'ACTIVE':
    if status in DATASTORE_DEAD_STATUSES:
        raise RuntimeError(f"Datastore {datastoreId} cannot become ACTIVE; status: {status}")
    time.sleep(30)
    res_getstore = medicalimaging.getDatastore(datastoreId)
    status = res_getstore['datastoreProperties']['datastoreStatus']
    print(status)
print(f"datastoreId: {datastoreId}; status: {status}")

In [6]:
InputBucketName = "curie-input-coherent-dataset"

# Import the DICOM files under s3://<InputBucketName>/dicom/ into the data store;
# the job writes its result files under ahi_importjob_output/ in the default bucket.
res_startimportjob = medicalimaging.startImportJob(
    res_createstore['datastoreId'],
    role,
    's3://'+InputBucketName+'/dicom/',
    's3://'+bucket+'/ahi_importjob_output/'
)

jobId = res_startimportjob['jobId']

# Poll every 30 seconds; stop on FAILED instead of waiting forever for COMPLETED.
jobstatus = medicalimaging.getImportJob(datastoreId, jobId)['jobProperties']['jobStatus']
while jobstatus != 'COMPLETED':
    if jobstatus == 'FAILED':
        raise RuntimeError(f"DICOM import job {jobId} failed for datastore {datastoreId}")
    time.sleep(30)
    jobstatus = medicalimaging.getImportJob(datastoreId, jobId)['jobProperties']['jobStatus']
print(f"jobstatus is {jobstatus}")

## Prepare SageMaker processing job input files

In [13]:
output_data_prefix = 'ahi_radiogenomics'
output_data_uri = 's3://%s/%s' % (bucket, output_data_prefix)
input_data_uri = 's3://%s/%s/inputfiles' % (bucket, output_data_prefix)

# Key prefix under which the import job above wrote its result files.
import_output_prefix = f"ahi_importjob_output/{datastoreId}-DicomImport-{jobId}"

# imageSetIds maps each image set id to the number of success records that mention it.
imageSetIds = {}
s3res = boto3.resource('s3')
try:
    # head_object raises ClientError when the manifest does not exist, so reaching the
    # next line means the job output is present.
    s3.head_object(Bucket=bucket, Key=f"{import_output_prefix}/job-output-manifest.json")
    data = s3.get_object(Bucket=bucket, Key=f"{import_output_prefix}/SUCCESS/success.ndjson")
    contents = data['Body'].read().decode("utf-8")
    for line in contents.splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the NDJSON file
        isid = json.loads(line)['importResponse']['imageSetId']
        imageSetIds[isid] = imageSetIds.get(isid, 0) + 1
        if imageSetIds[isid] > 1:
            continue  # the input file for this image set has already been written
        # One small JSON file per image set. `inputkey` keeps the key of the last file
        # written and is read again by the pixel-retrieval cell.
        inputkey = f"{output_data_prefix}/inputfiles/{datastoreId}/{isid}.json"
        jsonobj = {'datastoreid': datastoreId, 'imagesetid': isid}
        s3res.Object(bucket, inputkey).put(Body=json.dumps(jsonobj).encode('UTF-8'))
except ClientError as err:
    # Best effort: report the problem instead of silently producing zero input files.
    logging.warning(
        "Could not read import job output under s3://%s/%s: %s", bucket, import_output_prefix, err
    )

print("number of image sets: {}".format(len(imageSetIds)))

## Retrieve pixel frames in parallel

This example retrieves pixel frames from an Amazon HealthImaging data store in parallel. Running it is optional; its results (`pixels`, `jsoninput`) are only used by the optional radiomics cell below.

In [14]:
from joblib import Parallel, delayed
import time

# Load one of the input files written by the previous cell and fetch the image set metadata.
data = s3.get_object(Bucket=bucket, Key=inputkey)
jsoninput = json.loads(data['Body'].read().decode("utf-8"))
json_dicom_header = medicalimaging.getMetadata(jsoninput['datastoreid'], jsoninput['imagesetid'])

tic = time.perf_counter()

# Collect the id of every image frame of every instance of every series in the study.
frameIds = [
    frame["ID"]
    for series in json_dicom_header["Study"]["Series"].values()
    for instance in series["Instances"].values()
    for frame in instance["ImageFrames"]
]


def getPixels(frameId):
    """Return medicalimaging.getFramePixels() for one frame of the image set in `jsoninput`."""
    return medicalimaging.getFramePixels(jsoninput['datastoreid'], jsoninput['imagesetid'], frameId)


# The threading backend lets the per-frame service calls run concurrently in one process.
pixels = Parallel(n_jobs=-1, backend='threading')(delayed(getPixels)(f) for f in frameIds)
toc = time.perf_counter()
print(f"retrieval in {toc - tic:0.4f} seconds")
print(f"number of pixel frames: {len(pixels)}")


## Calculate sample radiomics features and create a SageMaker Feature Store feature group from them (optional)

The sample radiomics feature calculation takes about 1 hour. You can skip this step and run the next cell to create the SageMaker Feature Store feature group instead.

In [7]:
import numpy as np
import SimpleITK as sitk
from radiomics.featureextractor import RadiomicsFeatureExtractor
import pandas as pd

# Build an image from the retrieved frames and an all-ones mask of the same size, so the
# features are computed over the whole image.
img = sitk.GetImageFromArray(pixels)
ma_arr = np.ones(img.GetSize()[::-1])  # reverse the order as image is xyz, array is zyx
ma = sitk.GetImageFromArray(ma_arr)
ma.CopyInformation(img)  # Copy geometric info
extractor = RadiomicsFeatureExtractor()
features = extractor.execute(img, ma)

# Turn numpy array values into plain floats; keep every other value as returned.
new_dict = {
    featureName: float(value) if isinstance(value, np.ndarray) else value
    for featureName, value in features.items()
}

# One row, one column per feature. The dtype conversion uses the same flags as
# compute_features() in the processing script so both produce the same column types.
df = pd.DataFrame.from_dict(new_dict, orient='index').T
df = df.convert_dtypes(convert_integer=False, convert_floating=False)
df['ImageSetId'] = jsoninput['imagesetid']
df['EventTime'] = float(round(time.time()))
print(df)

from sagemaker.feature_store.feature_group import FeatureGroup
# Fresh hex suffix on every run, to be used in resource names.
suffix = uuid.uuid1().hex
feature_store_name = 'imaging-feature-group-%s' % suffix
offline_store_s3uri = '%s/multimodal-imaging-featurestore' % output_data_uri

# Region-bound clients for the Feature Store runtime and the SageMaker control plane.
featurestore_runtime, sagemaker_client = (
    boto3.client(service_name, region_name=region)
    for service_name in ('sagemaker-featurestore-runtime', 'sagemaker')
)

# Session that routes the FeatureGroup helper through the clients created above.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Derive the feature definitions from the columns of the sample data frame.
feature_group = FeatureGroup(
    name=feature_store_name,
    sagemaker_session=feature_store_session,
)
feature_group.load_feature_definitions(data_frame=df)

def wait_for_feature_group_creation_complete(feature_group):
    """Block until the feature group has left the "Creating" status.

    Polls ``feature_group.describe()`` every 5 seconds.

    Args:
        feature_group: object exposing ``describe()`` (returning a dict with a
            ``FeatureGroupStatus`` key) and a ``name`` attribute.

    Raises:
        RuntimeError: if the final status is anything other than "Created"; the
            message carries the status and, when present, the ``FailureReason``.
    """
    description = feature_group.describe()
    while description['FeatureGroupStatus'] == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        description = feature_group.describe()

    status = description['FeatureGroupStatus']
    if status != "Created":
        reason = description.get('FailureReason', 'no failure reason reported')
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}; reason: {reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Create the feature group (S3 location for the store, online store enabled), keyed by
# ImageSetId with EventTime as the event-time feature, then wait for creation to finish.
create_kwargs = dict(
    s3_uri=offline_store_s3uri,
    record_identifier_name='ImageSetId',
    event_time_feature_name='EventTime',
    role_arn=role,
    enable_online_store=True,
)
feature_group.create(**create_kwargs)

wait_for_feature_group_creation_complete(feature_group=feature_group)

# feature_group.ingest(
#     data_frame=df, max_workers=5, wait=True
# )

## Create SageMaker Feature Store

In [8]:
from sagemaker.feature_store.feature_group import FeatureGroup
# Fresh hex suffix on every run, to be used in resource names.
suffix = uuid.uuid1().hex
feature_store_name = 'imaging-feature-group-%s' % suffix
offline_store_s3uri = '%s/multimodal-imaging-featurestore' % output_data_uri

# Region-bound clients for the Feature Store runtime and the SageMaker control plane.
featurestore_runtime, sagemaker_client = (
    boto3.client(service_name, region_name=region)
    for service_name in ('sagemaker-featurestore-runtime', 'sagemaker')
)

# Session that routes the FeatureGroup helper through the clients created above.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)
feature_group = FeatureGroup(
    name=feature_store_name,
    sagemaker_session=feature_store_session,
)
    
def wait_for_feature_group_creation_complete(feature_group):
    """Block until the feature group has left the "Creating" status.

    Polls ``feature_group.describe()`` every 5 seconds.

    Args:
        feature_group: object exposing ``describe()`` (returning a dict with a
            ``FeatureGroupStatus`` key) and a ``name`` attribute.

    Raises:
        RuntimeError: if the final status is anything other than "Created"; the
            message carries the status and, when present, the ``FailureReason``.
    """
    description = feature_group.describe()
    while description['FeatureGroupStatus'] == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        description = feature_group.describe()

    status = description['FeatureGroupStatus']
    if status != "Created":
        reason = description.get('FailureReason', 'no failure reason reported')
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}; reason: {reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Features stored as strings, grouped by their common name prefix. The full feature name
# is prefix + suffix, e.g. 'original_glcm_' + 'Contrast'.
string_feature_names = {
    'diagnostics_Versions_': ['PyRadiomics', 'Numpy', 'SimpleITK', 'PyWavelet', 'Python'],
    'diagnostics_Configuration_': ['Settings', 'EnabledImageTypes'],
    'diagnostics_Image-original_': [
        'Hash', 'Dimensionality', 'Spacing', 'Size', 'Mean', 'Minimum', 'Maximum',
    ],
    'diagnostics_Mask-original_': [
        'Hash', 'Spacing', 'Size', 'BoundingBox', 'VoxelNum', 'VolumeNum',
        'CenterOfMassIndex', 'CenterOfMass',
    ],
    'original_shape_': [
        'Elongation', 'Flatness', 'LeastAxisLength', 'MajorAxisLength',
        'Maximum2DDiameterColumn', 'Maximum2DDiameterRow', 'Maximum2DDiameterSlice',
        'Maximum3DDiameter', 'MeshVolume', 'MinorAxisLength', 'Sphericity',
        'SurfaceArea', 'SurfaceVolumeRatio', 'VoxelVolume',
    ],
    'original_firstorder_': [
        '10Percentile', '90Percentile', 'Energy', 'Entropy', 'InterquartileRange',
        'Kurtosis', 'Maximum', 'MeanAbsoluteDeviation', 'Mean', 'Median', 'Minimum',
        'Range', 'RobustMeanAbsoluteDeviation', 'RootMeanSquared', 'Skewness',
        'TotalEnergy', 'Uniformity', 'Variance',
    ],
    'original_glcm_': [
        'Autocorrelation', 'ClusterProminence', 'ClusterShade', 'ClusterTendency',
        'Contrast', 'Correlation', 'DifferenceAverage', 'DifferenceEntropy',
        'DifferenceVariance', 'Id', 'Idm', 'Idmn', 'Idn', 'Imc1', 'Imc2',
        'InverseVariance', 'JointAverage', 'JointEnergy', 'JointEntropy', 'MCC',
        'MaximumProbability', 'SumAverage', 'SumEntropy', 'SumSquares',
    ],
    'original_gldm_': [
        'DependenceEntropy', 'DependenceNonUniformity', 'DependenceNonUniformityNormalized',
        'DependenceVariance', 'GrayLevelNonUniformity', 'GrayLevelVariance',
        'HighGrayLevelEmphasis', 'LargeDependenceEmphasis',
        'LargeDependenceHighGrayLevelEmphasis', 'LargeDependenceLowGrayLevelEmphasis',
        'LowGrayLevelEmphasis', 'SmallDependenceEmphasis',
        'SmallDependenceHighGrayLevelEmphasis', 'SmallDependenceLowGrayLevelEmphasis',
    ],
    'original_glrlm_': [
        'GrayLevelNonUniformity', 'GrayLevelNonUniformityNormalized', 'GrayLevelVariance',
        'HighGrayLevelRunEmphasis', 'LongRunEmphasis', 'LongRunHighGrayLevelEmphasis',
        'LongRunLowGrayLevelEmphasis', 'LowGrayLevelRunEmphasis', 'RunEntropy',
        'RunLengthNonUniformity', 'RunLengthNonUniformityNormalized', 'RunPercentage',
        'RunVariance', 'ShortRunEmphasis', 'ShortRunHighGrayLevelEmphasis',
        'ShortRunLowGrayLevelEmphasis',
    ],
    'original_glszm_': [
        'GrayLevelNonUniformity', 'GrayLevelNonUniformityNormalized', 'GrayLevelVariance',
        'HighGrayLevelZoneEmphasis', 'LargeAreaEmphasis', 'LargeAreaHighGrayLevelEmphasis',
        'LargeAreaLowGrayLevelEmphasis', 'LowGrayLevelZoneEmphasis', 'SizeZoneNonUniformity',
        'SizeZoneNonUniformityNormalized', 'SmallAreaEmphasis',
        'SmallAreaHighGrayLevelEmphasis', 'SmallAreaLowGrayLevelEmphasis', 'ZoneEntropy',
        'ZonePercentage', 'ZoneVariance',
    ],
    'original_ngtdm_': ['Busyness', 'Coarseness', 'Complexity', 'Contrast', 'Strength'],
}

# Every listed feature is a String; the record identifier and the event time come last.
feature_definitions = [
    {'FeatureName': prefix + feature_suffix, 'FeatureType': 'String'}
    for prefix, feature_suffixes in string_feature_names.items()
    for feature_suffix in feature_suffixes
]
feature_definitions += [
    {'FeatureName': 'ImageSetId', 'FeatureType': 'String'},
    {'FeatureName': 'EventTime', 'FeatureType': 'Fractional'},
]

# Walk every result page (a single list_feature_groups call returns only the first one),
# narrowed server-side to names containing the one we are about to create.
existing_feature_groups = {
    summary['FeatureGroupName']
    for page in sagemaker_client.get_paginator('list_feature_groups').paginate(
        NameContains=feature_store_name
    )
    for summary in page['FeatureGroupSummaries']
}

if feature_store_name not in existing_feature_groups:
    response = sagemaker_client.create_feature_group(
        FeatureGroupName=feature_store_name,
        OfflineStoreConfig={
            'S3StorageConfig': {
                'S3Uri': offline_store_s3uri
            }
        },
        RecordIdentifierFeatureName='ImageSetId',
        EventTimeFeatureName='EventTime',
        RoleArn=role,
        FeatureDefinitions=feature_definitions,
    )

wait_for_feature_group_creation_complete(feature_group)

## Build SageMaker processing job container

In [81]:
# Name shared by the ECR repository, the CodeCommit repository and the build project.
proj_name = 'medical-image-processing'
image_tag= '1.0'

# Dockerfile of the processing image. AWS_DEFAULT_REGION is filled in from the notebook's
# `region`, so the clients created inside the container talk to the same region as the
# data store and feature group created here (it used to be fixed to us-east-1).
dockerfile_content=f"""FROM python:3.9-slim-buster

COPY requirements.txt /opt/
COPY ahiradiomics.py /opt/
RUN pip3 install --no-cache-dir -r /opt/requirements.txt
ENV PYTHONUNBUFFERED=TRUE
ENV AWS_DEFAULT_REGION={region}

COPY Api.py /opt/

ENTRYPOINT ["python3", "/opt/ahiradiomics.py"]
"""

# Build specification: log in to ECR, build and tag the image, then push it.
buildspec_content ="""version: 0.2
phases:
  pre_build:
    commands:
      - echo Logging in to Amazon ECR...
      - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
  build:
    commands:
      - echo Build started on `date`
      - echo Building the Docker image...          
      - docker build -t $IMAGE_REPO_NAME:$IMAGE_TAG -f Dockerfile . 
      - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG      
  post_build:
    commands:
      - echo Build completed on `date`
      - echo Pushing the Docker image...
      - docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
"""

# Python packages installed into the processing image.
requirements_content="""pydicom == 2.3.1
numpy == 1.24.2
nibabel == 5.1.0
matplotlib == 3.4.3
pandas == 1.3.4
pyradiomics == 3.1.0
sagemaker == 2.145.0
boto3 >= 1.28.12
botocore >= 1.31.12
pylibjpeg-openjpeg == 1.3.2
SimpleITK == 2.2.1
"""


# Source of ahiradiomics.py, the entry point of the processing container. For every
# *.json file in the processing input directory it computes the radiomics features of
# the referenced image set and ingests them into the feature group named on the command
# line. Compared with the earlier version: the final message goes through the module
# logger, the unused execution-role lookup at start-up is gone, and each input file is
# closed before the long-running feature computation starts.
ahiradiomics_content = """#!/usr/bin/env python
import argparse
from glob import glob
import pydicom
import nibabel as nib
import numpy as np
import pandas as pd
import sys
import os
import json
import time
import logging
from radiomics import featureextractor
import SimpleITK as sitk
from Api import MedicalImaging
from functools import reduce
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
boto_session = boto3.Session(region_name=region)

s3 = boto3.client('s3', region_name=region)
s3res = boto3.resource('s3', region_name=region)
sagemaker_client = boto3.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto3.client('sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

medicalimaging = MedicalImaging()


def split_s3_path(s3_path):
    path_parts=s3_path.replace("s3://","").split("/")
    bucket=path_parts.pop(0)
    key="/".join(path_parts)
    return bucket, key


def compute_features(datastoreId, imagesetId):
    json_dicom_header = medicalimaging.getMetadata(datastoreId, imagesetId)
    pixels = []
    for series in json_dicom_header["Study"]["Series"]:
        for instances in json_dicom_header["Study"]["Series"][series]["Instances"]:
            for frame in json_dicom_header["Study"]["Series"][series]["Instances"][instances]["ImageFrames"]:
                frameId = frame["ID"]
                pixel = medicalimaging.getFramePixels(datastoreId,  imagesetId, frameId)
                pixels.append(pixel)

    extractor = featureextractor.RadiomicsFeatureExtractor()
    img = sitk.GetImageFromArray(pixels)
    ma_arr = np.ones(img.GetSize()[::-1])  # reverse the order as image is xyz, array is zyx
    ma = sitk.GetImageFromArray(ma_arr)
    ma.CopyInformation(img)  # Copy geometric info
    featureVector = extractor.execute(img, ma)
    new_dict={}
    for featureName in featureVector.keys():
        if isinstance(featureVector[featureName], np.ndarray):
            new_dict[featureName]=float(featureVector[featureName])
        else:
            new_dict[featureName]=featureVector[featureName]

    df=pd.DataFrame.from_dict(new_dict, orient='index').T
    df=df.convert_dtypes(convert_integer=False, convert_floating=False)
    df['ImageSetId']=imagesetId
    df['EventTime']=float(round(time.time()))
    return df


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--datastore_id', type=str, default='f3b5edf42a114498bec2509a3c7d42fd',
                        help='AHI DataStore ID')
    parser.add_argument('--feature_store_name', type=str, default='imaging-radiogenomics-feature-group',
                        help='SageMaker Feature Store Group Name')
    parser.add_argument('--offline_store_s3uri', type=str,
                        help='SageMaker Feature Offline Store S3 URI')

    args = parser.parse_args()

    data_dir = '/opt/ml/processing/input/'
    print(os.listdir(data_dir))
    output_dir = '/opt/ml/processing/output/'

    # assume one subject comes in,
    # we need to find out where the CT dicom files are
    # and segmentation file
    logger.info('Reading images from Amazon HealthLake Imaging')

    jsons = glob(os.path.join(data_dir, '*.json'))
    logger.info(f"##Number of input json files to process: {len(jsons)}")
    feature_group = FeatureGroup(name=args.feature_store_name, sagemaker_session=feature_store_session)
    for j in jsons:
        # read the job description, then close the file before the long-running work
        with open(j, 'r') as jf:
            jsonobj = json.load(jf)
        df = compute_features(jsonobj['datastoreid'], jsonobj['imagesetid'])
        # ingest features into a FeatureStore
        feature_group.ingest(data_frame=df, max_workers=1, wait=True)

    logger.info('Processing done')
"""

# Source of Api.py, the thin wrapper around the boto3 'medical-imaging' client that is
# copied into the processing image. Compared with the earlier version: every import job
# gets its own idempotency token (the constant "demoClient" was sent for every job) and
# the unused boto3.Session() in the constructor is gone.
api_content="""import array
import pydicom
from pydicom.sequence import Sequence
from pydicom import Dataset , DataElement
from pydicom.dataset import FileMetaDataset
from pydicom.uid import UID
import json
import logging
import importlib
import boto3
from openjpeg import decode
import io
import sys
import time
import os
import gzip
import uuid

logging.basicConfig( level="INFO" )

class MedicalImaging:
    def __init__(self):
        self.client = boto3.client('medical-imaging')

    def stopwatch(self, start_time, end_time):
        time_lapsed = end_time - start_time
        return time_lapsed*1000


    def getMetadata(self, datastoreId, imageSetId):
        start_time = time.time()
        dicom_study_metadata = self.client.get_image_set_metadata(datastoreId=datastoreId , imageSetId=imageSetId )
        json_study_metadata = json.loads( gzip.decompress(dicom_study_metadata["imageSetMetadataBlob"].read()) )
        end_time = time.time()
        logging.info(f"Metadata fetch  : {self.stopwatch(start_time,end_time)} ms")
        return json_study_metadata


    def listDatastores(self):
        start_time = time.time()
        response = self.client.list_datastores()
        end_time = time.time()
        logging.info(f"List Datastores  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def createDatastore(self, datastoreName):
        start_time = time.time()
        response = self.client.create_datastore(datastoreName=datastoreName)
        end_time = time.time()
        logging.info(f"Create Datastore  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def getDatastore(self, datastoreId):
        start_time = time.time()
        response = self.client.get_datastore(datastoreId=datastoreId)
        end_time = time.time()
        logging.info(f"Get Datastore  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def deleteDatastore(self, datastoreId):
        start_time = time.time()
        response = self.client.delete_datastore(datastoreId=datastoreId)
        end_time = time.time()
        logging.info(f"Delete Datastore  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def startImportJob(self, datastoreId, IamRoleArn, inputS3, outputS3):
        start_time = time.time()
        response = self.client.start_dicom_import_job(
            datastoreId=datastoreId,
            dataAccessRoleArn = IamRoleArn,
            inputS3Uri = inputS3,
            outputS3Uri = outputS3,
            # a fresh idempotency token per job, so separate jobs are never confused
            clientToken = str(uuid.uuid4())
        )
        end_time = time.time()
        logging.info(f"Start Import Job  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def getImportJob(self, datastoreId, jobId):
        start_time = time.time()
        response = self.client.get_dicom_import_job(datastoreId=datastoreId, jobId=jobId)
        end_time = time.time()
        logging.info(f"Get Import Job  : {self.stopwatch(start_time,end_time)} ms")
        return response


    def getFramePixels(self, datastoreId, imageSetId, imageFrameId):
        start_time = time.time()
        res = self.client.get_image_frame(
            datastoreId=datastoreId,
            imageSetId=imageSetId,
            imageFrameInformation={
                'imageFrameId': imageFrameId
            })
        end_time = time.time()
        logging.debug(f"Frame fetch     : {self.stopwatch(start_time,end_time)} ms")
        start_time = time.time()
        b = io.BytesIO()
        b.write(res['imageFrameBlob'].read())
        b.seek(0)
        d = decode(b)
        end_time = time.time()
        logging.debug(f"Frame decode    : {self.stopwatch(start_time,end_time)} ms")
        return d
"""

# Files to check in: one {'filePath', 'fileContent'} entry per generated file.
put_files = [
    {'filePath': file_path, 'fileContent': file_content}
    for file_path, file_content in (
        ('Dockerfile', dockerfile_content),
        ('requirements.txt', requirements_content),
        ('ahiradiomics.py', ahiradiomics_content),
        ('Api.py', api_content),
        ('buildspec.yml', buildspec_content),
    )
]

def commit_files(proj_name, branch_name, put_files, parent_commit_id):
    """Commit `put_files` to a branch of a CodeCommit repository.

    Args:
        proj_name: Name of the CodeCommit repository.
        branch_name: Branch that receives the commit.
        put_files: List of dicts with 'filePath' and 'fileContent' keys,
            passed as the `putFiles` argument of `create_commit`.
        parent_commit_id: Commit id the new commit is based on. A falsy value
            (e.g. None) omits `parentCommitId`, which is how the first commit
            of a new repository is made.

    Returns:
        The `create_commit` response.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged from
            `create_commit`.
    """
    codecommit_client = boto3.client('codecommit')
    commit_args = {
        'repositoryName': proj_name,
        'branchName': branch_name,
        'putFiles': put_files,
    }
    # parentCommitId is only sent when there is a parent to point at.
    if parent_commit_id:
        commit_args['parentCommitId'] = parent_commit_id
    resp = codecommit_client.create_commit(**commit_args)

    print("Finished commit")
    # Hand the response back so callers that assign the result get the commit
    # details instead of None.
    return resp

In [82]:
from botocore.exceptions import ClientError

# Create the ECR repository that will receive the built container image.
# An already existing repository is reused; any other failure is re-raised so
# it is not silently ignored.
ecr_client = boto3.client('ecr')
try:
    resp = ecr_client.create_repository(repositoryName=proj_name)
except ClientError as e:
    if e.response['Error']['Code'] != 'RepositoryAlreadyExistsException':
        raise
    print(f"ECR Repo {proj_name} already exists, skip")

# Create the CodeCommit repository that holds the container build sources,
# with the same "reuse if present, fail on anything else" handling.
codecommit_client = boto3.client('codecommit')
try:
    resp = codecommit_client.create_repository(repositoryName=proj_name)
except ClientError as e:
    if e.response['Error']['Code'] != 'RepositoryNameExistsException':
        raise
    print(f"Repo {proj_name} exists, use that one")

In [83]:
# Commit the build sources to the `main` branch. If the branch exists, the new
# commit is based on its current head; if it does not exist yet, the first
# commit is created without a parent.
try:
    resp = codecommit_client.get_branch(repositoryName=proj_name, branchName='main')
    parent_commit_id = resp['branch']['commitId']
except ClientError as e:
    if e.response['Error']['Code'] != 'BranchDoesNotExistException':
        # Anything other than a missing branch is a real failure.
        raise
    # The repository is new: create the branch with an initial commit.
    commit_files(proj_name, "main", put_files, None)
else:
    try:
        resp = commit_files(proj_name, "main", put_files, parent_commit_id)
    except ClientError as ee:
        if ee.response['Error']['Code'] != 'NoChangeException':
            raise
        # The files are identical to the branch head, so there is nothing to commit.
        print('No change detected. skip commit')

Finished commit


In [84]:
# Create the CodeBuild project that builds the container image inside the
# pipeline. An existing project with the same name is reused.
codebuild_client = boto3.client('codebuild')
codebuild_name = f"Build-{proj_name}"
codecommit_name = f"Source-{proj_name}"

# Plain-text environment variables made available to the build.
build_env_vars = [
    {"name": var_name, "value": var_value, "type": "PLAINTEXT"}
    for var_name, var_value in (
        ("AWS_DEFAULT_REGION", region),  # was misspelled "AWS_DEFULT_REGION"
        ("AWS_ACCOUNT_ID", account_id),
        ("IMAGE_REPO_NAME", proj_name),
        ("IMAGE_TAG", image_tag),
    )
]

try:
    resp = codebuild_client.create_project(
        name=codebuild_name,
        description="JSL NLP build demo",
        # Source and artifacts are both supplied by CodePipeline.
        source={
            'type': "CODEPIPELINE"
        },
        artifacts={
            "type": "CODEPIPELINE",
            "name": proj_name
        },
        environment={
            "type": "LINUX_CONTAINER",
            "image": "aws/codebuild/amazonlinux2-x86_64-standard:3.0",
            "computeType": "BUILD_GENERAL1_SMALL",
            "environmentVariables": build_env_vars,
            "privilegedMode": True,
            "imagePullCredentialsType": "CODEBUILD"
        },
        logsConfig={
            "cloudWatchLogs": {
                "status": "ENABLED",
                "groupName": proj_name
            },
            "s3Logs": {
                "status": "DISABLED"
            }
        },
        serviceRole=role
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceAlreadyExistsException':
        # Report the CodeBuild project name, not the repository name.
        print(f"CodeBuild project {codebuild_name} exists, skip...")
    else:
        raise


print(f"CodeBuild project name {codebuild_name}")

CodeBuild project name Build-medical-image-processing


In [9]:
# Create a two-stage CodePipeline: a CodeCommit source stage feeding a
# CodeBuild build stage. An existing pipeline with the same name is reused.
codepipeline_client = boto3.client('codepipeline')

# Stage 1: take the `main` branch of the CodeCommit repository as a zip
# artifact named SourceArtifact.
stage1 = {
    "name": codecommit_name,
    "actions": [
        {
            "name": "Source",
            "actionTypeId": {
                "category": "Source",
                "owner": "AWS",
                "provider": "CodeCommit",
                "version": "1"
            },
            "runOrder": 1,
            "configuration": {
                "BranchName": "main",
                "OutputArtifactFormat": "CODE_ZIP",
                "PollForSourceChanges": "true",
                "RepositoryName": proj_name
            },
            "outputArtifacts": [
                {
                    "name": "SourceArtifact"
                }
            ],
            "inputArtifacts": [],
            "region": region,
            "namespace": "SourceVariables"
        }
    ]
}

# Stage 2: run the CodeBuild project on SourceArtifact.
stage2 = {
    "name": codebuild_name,
    "actions": [
        {
            "name": "Build",
            "actionTypeId": {
                "category": "Build",
                "owner": "AWS",
                "provider": "CodeBuild",
                "version": "1"
            },
            "runOrder": 1,
            "configuration": {
                "ProjectName": codebuild_name
            },
            "outputArtifacts": [
                {
                    "name": "BuildArtifact"
                }
            ],
            "inputArtifacts": [
                {
                    "name": "SourceArtifact"
                }
            ],
            "region": region,
            "namespace": "BuildVariables"
        }
    ]
}


stages = [stage1, stage2]


pipeline = {
    'name': proj_name,
    'roleArn': role,
    'artifactStore': {
        'type': 'S3',
        # Use the session's region instead of a hard-coded us-east-1 so the
        # artifact bucket is in the same region as the other resources.
        'location': f"sagemaker-{region}-{account_id}"
    },
    'stages': stages
}

try:
    resp = codepipeline_client.create_pipeline(pipeline=pipeline)
    print("Created pipeline", resp)
except ClientError as e:
    if e.response['Error']['Code'] == 'PipelineNameInUseException':
        print(f"Codepipeline {proj_name} already exists " )
    else:
        # Report and re-raise anything unexpected so that a failed pipeline
        # creation is not mistaken for success by the following cells.
        print(f"Exception: {e}")
        raise
    


In [10]:
# Wait until the pipeline has pushed a container image tagged `image_tag`
# (the IMAGE_TAG environment variable given to CodeBuild) to the ECR repository.
from IPython.display import display, clear_output

MAX_WAIT_SECONDS = 30 * 60  # give up after this long instead of polling forever
POLL_SECONDS = 20

wait_deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    resp = ecr_client.describe_images(repositoryName=proj_name)
    # Only the image carrying the expected tag counts; images left over from
    # earlier runs or pushed under another tag must not end the wait.
    tagged_images = [
        image for image in resp['imageDetails']
        if image_tag in image.get('imageTags', [])
    ]
    if tagged_images:
        for image in tagged_images:
            print("image pushed at: " + str(image['imagePushedAt']))
        break
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"No image tagged {image_tag!r} appeared in ECR repository {proj_name!r} "
            f"within {MAX_WAIT_SECONDS} seconds; check the CodePipeline/CodeBuild run."
        )
    clear_output(wait=True)
    display("Build not done yet, please wait and retry this step. Please do not proceed until you see the 'image pushed' message")
    time.sleep(POLL_SECONDS)
# this is used later in job_definition for AWS Batch
image_uri= f"{account_id}.dkr.ecr.{region}.amazonaws.com/{proj_name}:{image_tag}"
print(image_uri)

In [11]:
# Fill in the ##...## placeholders of the workflow definition with this
# notebook's values. `sed -i` edits src/radiogenomics-imaging-workflow.json in
# place, so after this cell the file on disk holds the substituted values.
# NOTE(review): a second run presumably finds no placeholders left and keeps the
# values of the first run - restore the template before re-running with
# different values.
!sed -i "s|##INPUT_DATA_S3URI##|{input_data_uri}|g" src/radiogenomics-imaging-workflow.json
!sed -i "s|##OUTPUT_DATA_S3URI##|{output_data_uri}|g" src/radiogenomics-imaging-workflow.json
!sed -i "s|##ECR_IMAGE_URI##|{image_uri}|g" src/radiogenomics-imaging-workflow.json
!sed -i "s|##IAM_ROLE_ARN##|{role}|g" src/radiogenomics-imaging-workflow.json

# Load the substituted definition; it is serialized again with json.dumps when
# the state machine is created below.
with open('src/radiogenomics-imaging-workflow.json') as f:
    state_machine_json = json.load(f)
    
# Create a STANDARD state machine from the definition, using `role` as its role.
sfn = boto3.client('stepfunctions')
state_machine_name = 'imaging-radiogenomics-workflow-%s' % suffix
sfn_response = sfn.create_state_machine(name = state_machine_name,
                                        definition = json.dumps(state_machine_json),
                                        roleArn = role,
                                        type = 'STANDARD')

stateMachineArn=sfn_response['stateMachineArn']
processing_job_name = 'image-radiomics-process-%s' % suffix

# Input document handed to the execution (serialized to JSON below).
payload = {
    "PreprocessingJobName": processing_job_name,
    "FeatureStoreName": feature_store_name,
    "OfflineStoreS3Uri": offline_store_s3uri,
    "Subject": [datastoreId]
}
# Start one execution of the state machine; `suffix` doubles as the execution name.
exeution_response = sfn.start_execution(stateMachineArn=stateMachineArn,
                                        name=suffix,
                                        input=json.dumps(payload))


In [11]:
## Clean-up: delete the container build CI/CD resources and the code base -
## the CodePipeline pipeline, the CodeBuild project and the CodeCommit repository.
codepipeline_client.delete_pipeline( name= proj_name)
codebuild_client.delete_project(name=codebuild_name)
codecommit_client.delete_repository(repositoryName=proj_name)

In [12]:
## Clean-up: delete the ECR repository.
## NOTE(review): force=True presumably lets the delete proceed even when the
## repository still contains images - confirm against the ECR API reference.
ecr_client.delete_repository(repositoryName=proj_name, force=True)