### Data Processing under Conda 

* install the necessary packages in the conda env tensorflow_p36
* to save time, use validation dataset as training set 
 

In [None]:
%%bash
# Python dependencies used by the data-loading and training scripts (keras is pinned).
pip install nibabel opencv-python matplotlib keras==2.3.1
# -p creates the parent h5/ directory together with h5/detection in a single call.
mkdir -p h5/detection


In [None]:
# Run the data-loading script as a subprocess of the notebook.
# Original note: changed to the previous 5 sites to reduce execution time.
# NOTE(review): this presumably means data_load.py was edited to process only
# five sites -- confirm inside the script.
!python data_load.py

In [None]:
# Run Stroke_segment.py with the positional arguments "local" and "train".
# NOTE(review): presumably this trains directly on this instance as a sanity
# check before containerizing -- confirm against the script's argument handling.
!python Stroke_segment.py local train 

### Build D-Unet container 

* Dockerfile - install necessary packages and setup entrypoints 
* build_and_push.sh - communicate with ECR (Elastic Container Registry)


In [None]:
%%bash 
# Run the build script from inside the container/ directory.
# NOTE(review): presumably build_and_push.sh builds the Docker image and pushes
# it to ECR -- confirm in the script.
cd container 
./build_and_push.sh 

In [None]:
import boto3

# Ask STS which AWS account the current credentials belong to.
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]

# ECR URI of the "dunet" repository. The region is fixed to us-west-2 here.
# NOTE(review): it must match the region build_and_push.sh pushed to -- confirm.
image_uri = f"{account_id}.dkr.ecr.us-west-2.amazonaws.com/dunet"
image_uri

### Training locally based on the docker image 

In [None]:
%%bash
# Build the nested input/data/atlas directory in one call (-p creates parents).
mkdir -p input/data/atlas
# Relocate the generated h5 directory under input/data/atlas.
mv h5 input/data/atlas

In [None]:
!nvidia-docker run -it -v $PWD:/opt/ml 230755935769.dkr.ecr.us-west-2.amazonaws.com/dunet

### Use SageMaker Training Jobs 

In [None]:
import sagemaker
from sagemaker import get_execution_role

# Session wrapper for SageMaker API calls, and the session's default S3 bucket.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# IAM role that the training job will run under.
role = get_execution_role()

# Key prefix under which this notebook stores its data and outputs in the bucket.
prefix = "dunet"

#### upload data to s3 

In [None]:
!cd ./input/data/ && aws s3 cp --recursive atlas s3://{bucket}/{prefix}/atlas

#### define s3 input and output paths 

In [None]:
# S3 prefix holding the uploaded atlas data, and the prefix intended for job output.
atlas_h5_data = f"s3://{bucket}/{prefix}/atlas/"
outpath = f"s3://{bucket}/{prefix}/output/"

# Alias for the training image URI.
repositoryUri = image_uri

#### define job_name and hyperparameters

In [None]:
from datetime import datetime

# Build a job name from the current POSIX timestamp truncated to whole seconds.
now = datetime.now()
timestamp = now.timestamp()
job_name = f"dunet-{int(timestamp)}"
job_name

#### submit training job 

In [None]:
# Training input channel definition pointing at the atlas data prefix in S3.
atlas_input = sagemaker.inputs.TrainingInput(s3_data=atlas_h5_data)

In [None]:


# Estimator for the custom D-Unet training image.
# `output_path=outpath` is now passed explicitly: `outpath` was defined alongside the
# S3 input path but never handed to the Estimator, so artifacts were written to the
# SDK's default location instead of s3://<bucket>/<prefix>/output/.
estimator = sagemaker.estimator.Estimator(
    image_uri=repositoryUri,
    role=role,
    instance_count=1,
    instance_type="ml.p3.8xlarge",
    volume_size=100,  # size of the attached training volume, in GB
    output_path=outpath,
    sagemaker_session=sagemaker_session,
    debugger_hook_config=False,  # do not configure the SageMaker Debugger hook
)


In [None]:
# Launch the training job; the "atlas" key names the input channel for the S3 data.
channels = {"atlas": atlas_input}
estimator.fit(channels, job_name=job_name)

...........................
2021-05-03 12:39:21 Training - Downloading the training image...............
2021-05-03 12:41:44 Training - Training image download completed. Training in progress.[34mUsing TensorFlow backend.[0m
[0m
[34m['Stroke_segment.py', 'sagemaker', 'train', 'train'][0m
[34mInstructions for updating:[0m
[34mIf using Keras pass *_constraint arguments to layers.[0m
[34m2021-05-03 12:41:45.993748: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1[0m
[34m2021-05-03 12:41:46.415892: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero[0m
[34m2021-05-03 12:41:46.417247: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: [0m
[34mname: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53[0m
[34mpciBusID: 0000:

[0m
[34mEpoch 1/150[0m
[34m2021-05-03 12:44:05.394927: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0[0m
[34m2021-05-03 12:44:08.438524: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7[0m
[34m - 153s - loss: 4.6615 - dice_coef: 0.0147 - val_loss: 4.2354 - val_dice_coef: 0.0139[0m
[34mEpoch 2/150[0m
[34m - 123s - loss: 4.6225 - dice_coef: 0.0150 - val_loss: 3.7540 - val_dice_coef: 0.0168[0m
[34mEpoch 3/150[0m
[34m - 121s - loss: 4.4331 - dice_coef: 0.0173 - val_loss: 4.4842 - val_dice_coef: 0.0194[0m
[34mEpoch 4/150[0m
[34m - 120s - loss: 4.3249 - dice_coef: 0.0195 - val_loss: 4.5053 - val_dice_coef: 0.0225[0m
[34mEpoch 5/150[0m
[34m - 120s - loss: 4.1401 - dice_coef: 0.0231 - val_loss: 3.3777 - val_dice_coef: 0.0268[0m
[34mEpoch 6/150[0m
[34m - 121s - loss: 3.9954 - dice_coef: 0.0260 - val_loss: 3.9846 - val_dice_coef: 0.03

[34m - 120s - loss: 0.6668 - dice_coef: 0.5659 - val_loss: 1.0092 - val_dice_coef: 0.4623[0m
[34mEpoch 59/150[0m
[34m - 120s - loss: 0.7214 - dice_coef: 0.5609 - val_loss: 0.3077 - val_dice_coef: 0.5677[0m
[34mEpoch 60/150[0m
[34m - 120s - loss: 0.6103 - dice_coef: 0.5890 - val_loss: 0.7484 - val_dice_coef: 0.3719
[0m
[34mEpoch 00060: saving model to /opt/ml//model/DUnet/DUnet-60-0.37.hdf5[0m
[34mEpoch 61/150[0m
[34m - 120s - loss: 0.6258 - dice_coef: 0.5774 - val_loss: 0.5647 - val_dice_coef: 0.4658[0m
[34mEpoch 62/150[0m
[34m - 121s - loss: 0.6339 - dice_coef: 0.5898 - val_loss: 3.3210 - val_dice_coef: 0.1509[0m
[34mEpoch 63/150[0m
[34m - 120s - loss: 0.6418 - dice_coef: 0.5788 - val_loss: 1.2752 - val_dice_coef: 0.2324[0m
[34mEpoch 64/150[0m
[34m - 121s - loss: 0.6083 - dice_coef: 0.5929 - val_loss: 1.4551 - val_dice_coef: 0.5147[0m
[34mEpoch 65/150[0m
[34m - 121s - loss: 0.6111 - dice_coef: 0.5916 - val_loss: 0.3051 - val_dice_coef: 0.5048[0m
[34mEp