[![Licence](https://img.shields.io/badge/license-MIT-blue)](https://opensource.org/license/mit/)

# MONAI Core on AWS Workshop

Set up the notebook environment using the "PyTorch 1.12 Python 3.8 CPU optimized" kernel with the "t3.medium" instance type.

<img src="../Figures/studio_setup_cpu.png" width="600">

## Download and install libraries

In [None]:
# Configure pip through environment variables: skip its self-version check and
# ignore its warning about running as the root user.
%env PIP_DISABLE_PIP_VERSION_CHECK True
%env PIP_ROOT_USER_ACTION ignore

# Upgrade pip and the AWS SDK/CLI, then install the imaging, Athena and
# visualization packages imported further down in this notebook.
!pip install -q --upgrade pip
!pip install -q --upgrade boto3 botocore awscli
!pip install -q tqdm nibabel pydicom numpy pathlib2 pylibjpeg-openjpeg pyathena
!pip install -q "itk>=5.3rc4" "itkwidgets[all]>=1.0a23"
!pip install --upgrade -q AHItoDICOMInterface

# autoreload mode 2: re-import modules before each cell execution so that
# edits to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import Libraries and Setup Environments

In [None]:
import json
import logging 
import boto3
import io
import sys
import time
import os
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from botocore.exceptions import ClientError
from src.Api import MedicalImaging 
import warnings
# Hide library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
logging.basicConfig(level="INFO")
# For verbose output, use this call instead of the one above:
# logging.basicConfig(level="DEBUG")

# Clients reused by the following cells: S3 and the MedicalImaging helper
# imported from src.Api.
s3 = boto3.client('s3')
medicalimaging = MedicalImaging()

# Account id, SageMaker session, region and default bucket for the current
# credentials.
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity["Account"]
session = sagemaker.session.Session()
region = boto3.Session().region_name
bucket = sagemaker.Session().default_bucket()

# Role handed to the image import job. This ARN assumes a role named
# HealthImagingImportJobRole exists in the account (e.g. created by the
# workshop's CloudFormation template).
role = f"arn:aws:iam::{account_id}:role/HealthImagingImportJobRole"
# Alternative: import images with the SageMaker execution role instead.
# role = get_execution_role()
print(f"S3 Bucket is {bucket}")
print(f"IAM role for image import job is {role}")

# A timestamp suffix gives every run its own set of S3 prefixes.
suffix = int(time.time())
dicom_header_prefix = f'coherent_dicom_headers_{suffix}/'
ahi_output_prefix = f'monaicore_tutorial_importjobs_{suffix}/'
ahi_input_prefix = f'monaicore_tutorial_images_{suffix}/'

print(f"S3 prefix for input images is {ahi_input_prefix}")
print(f"S3 prefix for import job outputs is {ahi_output_prefix}")

## Data Preparation

We will use synthetic data, [Synthea Coherent Data](https://registry.opendata.aws/synthea-coherent-data/), in this workshop. To prepare the dataset, follow [this guidance notebook](https://github.com/aws-solutions-library-samples/guidance-for-multi-modal-data-analysis-with-aws-health-and-ml-services/blob/main/preprocess-multimodal-data/medical-imaging/imaging-radiomics.ipynb) to download the Coherent DICOM images and import them into an AWS HealthImaging datastore.

In [None]:
# Copy the sample images from the region-specific guidance bucket into this
# run's input prefix in the SageMaker bucket ({region}, {bucket} and
# {ahi_input_prefix} are filled in from the notebook variables).
!aws s3 sync --quiet s3://guidance-multimodal-hcls-healthai-machinelearning-{region}/imaging s3://{bucket}/{ahi_input_prefix} 2>&1

### Create AWS HealthImaging Datastore if It Does Not Exist

In [None]:
DatastoreName = "CoherentDataStore"
datastoreList = medicalimaging.listDatastores()

# Reuse the first datastore that already carries the workshop name; create one
# only when no such datastore is listed.
res_createstore = next(
    (
        datastore
        for datastore in datastoreList["datastoreSummaries"]
        if datastore["datastoreName"] == DatastoreName
    ),
    None,
)
if res_createstore is None:
    res_createstore = medicalimaging.createDatastore(DatastoreName)

datastoreId = res_createstore['datastoreId']

# Poll every 10 seconds until the datastore is ACTIVE. Statuses from which a
# datastore cannot become ACTIVE abort the wait instead of looping forever.
res_getstore = medicalimaging.getDatastore(datastoreId)
status = res_getstore['datastoreProperties']['datastoreStatus']
while status != 'ACTIVE':
    if status in ('CREATE_FAILED', 'DELETING', 'DELETED'):
        raise RuntimeError(
            f"Datastore {datastoreId} is in status {status} and will not become ACTIVE"
        )
    time.sleep(10)
    res_getstore = medicalimaging.getDatastore(datastoreId)
    status = res_getstore['datastoreProperties']['datastoreStatus']
    print(status)
print(f"datastoreId: {datastoreId}; status: {status}")

### Import data into AWS HealthImaging

Loading 300 DICOM files takes a long time. The workshop environment may already have these DICOM images preloaded.

In [None]:
# Start an import job that reads this run's input prefix and writes its
# results under this run's output prefix.
import_source = f"s3://{bucket}/{ahi_input_prefix}"
import_destination = f"s3://{bucket}/{ahi_output_prefix}"
res_startimportjob = medicalimaging.startImportJob(
    res_createstore['datastoreId'], role, import_source, import_destination
)
jobId = res_startimportjob['jobId']

# Check the job status every 30 seconds until it is COMPLETED or FAILED.
while True:
    jobstatus = medicalimaging.getImportJob(datastoreId, jobId)['jobProperties']['jobStatus']
    if jobstatus in ('COMPLETED', 'FAILED'):
        break
    time.sleep(30)
print(f"jobstatus is {jobstatus}")

### Retrieve ImageSet IDs in Output Bucket

In [None]:
# Map each imported imageSetId to the number of success records that mention it.
imageSetIds = {}
job_output_prefix = f"{ahi_output_prefix}{datastoreId}-DicomImport-{jobId}"
try:
    # head_object raises ClientError when the manifest is absent, so reaching
    # the next line means the job wrote its output manifest.
    s3.head_object(Bucket=bucket, Key=f"{job_output_prefix}/job-output-manifest.json")
    data = s3.get_object(Bucket=bucket, Key=f"{job_output_prefix}/SUCCESS/success.ndjson")
except ClientError as err:
    # Leave imageSetIds empty, but say why instead of failing silently.
    logging.warning(
        "Could not read import job output under s3://%s/%s. Here's why: %s: %s",
        bucket, job_output_prefix,
        err.response['Error']['Code'], err.response['Error']['Message'])
else:
    contents = data['Body'].read().decode("utf-8")
    for line in contents.splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the NDJSON file
        isid = json.loads(line)['importResponse']['imageSetId']
        imageSetIds[isid] = imageSetIds.get(isid, 0) + 1

imageSetIds

## Save DICOM Header JSON to S3

The DICOM header includes metadata for a given ImageSetId, which is equivalent to a DICOM series. The DICOM header can be retrieved through the native AHI API as a nested JSON object. We need to parse the JSON object and save each level of information (patient, study, series, instance) into a separate S3 folder.

In [None]:
# For every imported image set, fetch its metadata and write one JSON document
# per level (patient, study, series, instance) under dicom_header_prefix.
# Each document is tagged with the ids of its parent levels.
for s in imageSetIds:
    json_dicom_header = medicalimaging.getMetadata(datastoreId, s)

    # Patient level.
    patient = json_dicom_header['Patient']['DICOM']
    patient.update(datastoreid=datastoreId, imagesetid=s)
    PatientID = patient["PatientID"]
    s3.put_object(
        Bucket=bucket,
        Key=f'{dicom_header_prefix}json/patient/{s}.json',
        Body=json.dumps(patient),
    )

    # Study level.
    study_node = json_dicom_header['Study']
    study = study_node['DICOM']
    study.update(datastoreid=datastoreId, imagesetid=s, PatientID=PatientID)
    StudyInstanceUID = study['StudyInstanceUID']
    s3.put_object(
        Bucket=bucket,
        Key=f'{dicom_header_prefix}json/study/{s}.json',
        Body=json.dumps(study),
    )

    # Series level.
    for series_uid, series_node in study_node['Series'].items():
        series = series_node['DICOM']
        series.update(
            datastoreid=datastoreId,
            imagesetid=s,
            PatientID=PatientID,
            StudyInstanceUID=StudyInstanceUID,
        )
        # NOTE(review): the key uses the image set id, so every series of one
        # image set is written to the same object; confirm an image set never
        # holds more than one series.
        s3.put_object(
            Bucket=bucket,
            Key=f'{dicom_header_prefix}json/series/{s}.json',
            Body=json.dumps(series),
        )

        # Instance level: one object per entry of the series' Instances.
        for instance_uid, instance_node in series_node['Instances'].items():
            instance = instance_node['DICOM']
            instance.update(
                datastoreid=datastoreId,
                imagesetid=s,
                PatientID=PatientID,
                StudyInstanceUID=StudyInstanceUID,
                SeriesInstanceUID=series_uid,
            )
            instance['DICOMVRs'] = instance_node['DICOMVRs']
            instance['ImageFrames'] = instance_node['ImageFrames']
            s3.put_object(
                Bucket=bucket,
                Key=f'{dicom_header_prefix}json/instances/{instance_uid}.json',
                Body=json.dumps(instance),
            )

## Create Glue Data Catalogs using Crawler

[An AWS Glue crawler can be used to produce a data catalog](https://docs.aws.amazon.com/glue/latest/dg/catalog-and-crawler.html), which is used for interactive SQL queries through Amazon Athena.

If you did not create the SageMaker Domain and execution role using the CloudFormation template in the Pre-requisites, please make sure you have followed [the instructions](https://docs.aws.amazon.com/directoryservice/latest/admin-guide/edit_trust.html) to edit the trust relationship for the SageMaker execution role.



In [None]:
glue_client = boto3.client('glue')
glue_crawler_name = 'DICOMHeaderJSONCrawler'
glue_database_name = 'idc_dicom_headers'

# Create the crawler only when it does not exist yet. Any other error from
# get_crawler (permissions, throttling, ...) is re-raised rather than being
# mistaken for "crawler missing".
# NOTE(review): a crawler left over from an earlier run keeps the S3 target of
# that run's dicom_header_prefix; confirm whether it should be updated here.
try:
    glue_client.get_crawler(Name=glue_crawler_name)
except ClientError as err:
    if err.response['Error']['Code'] != 'EntityNotFoundException':
        raise
    logging.info(
        "Could not get crawler Here's why: %s: %s",
        err.response['Error']['Code'], err.response['Error']['Message'])
    logging.info("Creating Crawler")
    glue_client.create_crawler(
        Name=glue_crawler_name,
        Role=get_execution_role(),
        DatabaseName=glue_database_name,
        Targets={'S3Targets': [{'Path': f"s3://{bucket}/{dicom_header_prefix}json/"}]})

try:
    logging.info("Starting Crawler")
    glue_client.start_crawler(Name=glue_crawler_name)
except ClientError as err:
    # Log with the crawler's actual name; the previous code referenced an
    # undefined variable here, which hid the real error behind a NameError.
    logging.error(
        "Couldn't start crawler %s. Here's why: %s: %s", glue_crawler_name,
        err.response['Error']['Code'], err.response['Error']['Message'])
    raise

# Poll every 30 seconds until the crawler reports the READY state again.
jobstatus = glue_client.get_crawler(Name=glue_crawler_name)['Crawler']['State']
while jobstatus != 'READY':
    time.sleep(30)
    jobstatus = glue_client.get_crawler(Name=glue_crawler_name)['Crawler']['State']
print(f"crawler jobstatus is {jobstatus}")

## Query the DICOM header using Amazon Athena

In [None]:
import requests
from pyathena import connect

# Open an Athena connection with pyathena's connect():
#   s3_staging_dir: S3 URL of the staging directory for query results.
#   region_name:    AWS region where the Athena resources are located.
#   work_group:     name of the workgroup to use for the connection.
#   schema_name:    schema (here the Glue database) the queries run against.
conn = connect(
    s3_staging_dir=f's3://{bucket}/athena-results/',
    region_name=region,
    work_group='primary', #REPLACE WORKGROUP NAME IF ANY ERROR
    schema_name=glue_database_name,
)

## Count image sets per study and modality in the crawled series table
sql = "SELECT distinct series.studyinstanceuid, series.modality, count(series.imagesetid) as ImageSetCount from series GROUP BY 1,2 ORDER BY 3 DESC"

# Bind the result up front so the display line below still works (showing an
# empty frame) when the query fails, instead of raising NameError.
athen_result_df = pd.DataFrame()
try:
    # Run the query over the connection and load the result as a DataFrame.
    athen_result_df = pd.read_sql(sql, conn)
except pd.io.sql.DatabaseError as e:
    # Report the failing statement together with the database error.
    logging.error("SQL query failed: %s Database error %s", sql, e)

athen_result_df

## Image Visualization 

In [None]:
import itk
from itkwidgets import view
# Let only CRITICAL records through from the 'AHItoDICOMInterface' logger.
logging.getLogger('AHItoDICOMInterface').setLevel(logging.CRITICAL)
import warnings
warnings.filterwarnings('ignore')

from AHItoDICOMInterface.AHItoDICOM import AHItoDICOM
helper = AHItoDICOM()
# Use the first key of imageSetIds; next(iter(...)) raises StopIteration when
# imageSetIds is empty (e.g. the import job produced no image sets).
instances = helper.DICOMizeImageSet(datastore_id=datastoreId , image_set_id=next(iter(imageSetIds)))

In [None]:
# Collect the pixel array of every instance, wrap the stack as an ITK image
# view, and open it in the itkwidgets viewer.
pixel_arrays = [dicom_instance.pixel_array for dicom_instance in instances]
img = itk.image_view_from_array(pixel_arrays)
viewer = view(img)
viewer.set_image_gradient_opacity(0.5)

In [None]:
# Persist both variables with IPython's %store so other notebooks can restore
# them with `%store -r`.
%store datastoreId
%store imageSetIds

### Clean Up

In [None]:
## S3 bucket
s3 = boto3.client('s3')


def _delete_s3_prefix(s3_client, bucket_name, prefix):
    """Delete every object under ``prefix`` and then the prefix key itself.

    Listing is paginated, so prefixes holding more objects than a single
    listing call returns are emptied completely, and a prefix without any
    objects is handled without error.

    Args:
        s3_client: boto3 S3 client used for listing and deleting.
        bucket_name: name of the bucket that holds the prefix.
        prefix: key prefix ("folder") to remove.

    Raises:
        botocore.exceptions.ClientError: if a list or delete call fails.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is absent from a page when nothing matches the prefix.
        for entry in page.get('Contents', []):
            s3_client.delete_object(Bucket=bucket_name, Key=entry['Key'])
    s3_client.delete_object(Bucket=bucket_name, Key=prefix)


# Remove the three prefixes created by this run; stop at the first failure.
for run_prefix in (dicom_header_prefix, ahi_input_prefix, ahi_output_prefix):
    try:
        _delete_s3_prefix(s3, bucket, run_prefix)
    except ClientError as err:
        logging.error(
            "Couldn't delete S3 folder %s. Here's why: %s: %s", run_prefix,
            err.response['Error']['Code'], err.response['Error']['Message'])
        raise


## Glue Crawler and Database
try:
    glue_client.delete_database(Name=glue_database_name)
    glue_client.delete_crawler(Name=glue_crawler_name)
except ClientError as err:
    logging.error(
        "Couldn't delete database %s and crawler %s. Here's why: %s: %s",
        glue_database_name, glue_crawler_name,
        err.response['Error']['Code'], err.response['Error']['Message'])
    raise

# ## AHI Datastore images
# for s in imageSetIds.keys():
#     medicalimaging.deleteImageSet(datastoreId, s)
# # medicalimaging.deleteDatastore(datastoreId)
