In [None]:
%%sh

# Build the training image from ./container and push it to ECR as
# <account>.dkr.ecr.<region>.amazonaws.com/<algorithm_name>:latest

# The name of our algorithm
algorithm_name=sagemaker-hdbscan

# Stop immediately if the build context is missing; otherwise every command
# below would run against the wrong directory.
cd container || exit 1

chmod +x hdbscan/train
chmod +x hdbscan/train.py
#chmod +x decision_trees/serve

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to eu-west-1 if none defined)
region=$(aws configure get region)
region=${region:-eu-west-1}

# Registry host (used for docker login) and the full image reference (used for tag/push)
registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
# --region is passed explicitly so the repository lives in the same region
# that ${fullname} points at, even when the fallback region is in use.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Authenticate docker against the ECR registry host (not the image reference)
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name. The steps are chained so that a failed build never
# tags and pushes a stale image left over from a previous run.
docker build -t "${algorithm_name}" . \
    && docker tag "${algorithm_name}" "${fullname}" \
    && docker push "${fullname}"

In [68]:
# List local images for the ECR repository; the account ID and region are hard-coded in this URI.
!docker image ls 647453829825.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-hdbscan

REPOSITORY                                                       TAG                 IMAGE ID            CREATED             SIZE
647453829825.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-hdbscan   latest              a2658bc50803        40 minutes ago      608MB


In [None]:
# Prune images with docker's default filter; -f skips the confirmation prompt.
!docker image prune -f

In [None]:
#!docker image rm 0cc0f931da19
# WARNING: `docker images -q` prints the ID of every image on this host, so this
# attempts to remove ALL local images, not only the sagemaker-hdbscan ones.
!docker rmi $(docker images -q)

In [None]:
%%sh
# Run the local training script, passing the image name sagemaker-hdbscan.
# Stop if the directory is missing so chmod and the script are never resolved
# against whatever the current working directory happens to be.
cd container/local_test/ || exit 1
chmod +x train_local.sh
./train_local.sh sagemaker-hdbscan

In [None]:
#!pip uninstall -y enum34
#!pip install hdbscan
import pickle
import hdbscan  # imported up front so a missing library fails here rather than inside pickle.load

# Model file under the local test directory
# (presumably written by the local training run — TODO confirm).
local_model_path = "container/local_test/test_dir/model/hdbscan-model.pkl"

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(local_model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Show the labels_ attribute of the unpickled object
model.labels_

In [None]:
#!pip install mxnet
# import mxnet as mx
# model_params = mx.nd.load('container/local_test/test_dir/model/hdbscan-model.pkl')
# print(model_params)

In [None]:
# Standard library
import os
import re

# Third-party
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# S3 key prefix used when uploading the training data
prefix = 'hdbscan-clust'

# IAM role returned by the SageMaker SDK for the current execution environment
role = get_execution_role()

In [None]:
from time import gmtime, strftime

import sagemaker as sage

# SageMaker session; later cells use it for uploads, the boto session
# (account / region lookup) and the default bucket.
sess = sage.Session()

In [None]:
# Local directory passed to upload_data as the training input
WORK_DIRECTORY = 'container/local_test/test_dir/input/data/training/'

# Upload the directory through the SageMaker session under `prefix`
# and keep the value returned by upload_data for the training call.
data_location = sess.upload_data(
    path=WORK_DIRECTORY,
    key_prefix=prefix,
)
data_location

In [None]:
# Ask STS (Security Token Service) which account the session's credentials belong to
sts_client = sess.boto_session.client('sts')
account = sts_client.get_caller_identity()['Account']

# Region of the session's underlying boto session
region = sess.boto_session.region_name

# Image reference assembled from account and region (tag: latest)
image_template = '{}.dkr.ecr.{}.amazonaws.com/sagemaker-hdbscan:latest'
image = image_template.format(account, region)
print(image)

In [None]:
%%time
hyperparams = {
   "min_cluster_size":"50",
   "core_dist_n_jobs":"2"
}
clust = sage.estimator.Estimator(image_name=image,
                                role=role,
                                hyperparameters=hyperparams,
                                train_instance_count=1,
                                train_instance_type='ml.c4.2xlarge',
                                output_path="s3://{}/output".format(sess.default_bucket()),
                                sagemaker_session=sess)

clust.fit(data_location)

In [None]:
import tarfile

# Fetch the artifact of the most recent training job. The S3 object is
# model.tar.gz (a gzipped tarball), so it is saved under that name and
# unpacked; saving it as *.pkl produced a file that cannot be unpickled.
job_name = clust.latest_training_job.name
model_key = 'output/{}/output/{}'.format(job_name, 'model.tar.gz')

s3 = boto3.client('s3')
s3.download_file(sess.default_bucket(), model_key, 'model.tar.gz')

# Unpack into a dedicated directory. The archive comes from our own training
# job; do not reuse this on archives from untrusted sources.
with tarfile.open('model.tar.gz') as archive:
    archive.extractall('sagemaker_model')
    extracted_files = archive.getnames()

# Show which training job was used and what the archive contained
job_name, extracted_files

In [None]:
import pickle 
import hdbscan
# NOTE(review): this opens the model under container/local_test/, the same path
# loaded earlier in the notebook — not the artifact the preceding cell downloads
# from S3. Confirm whether the SageMaker-trained model was meant to be loaded here.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open("container/local_test/test_dir/model/hdbscan-model.pkl", 'rb') as f:
    model = pickle.load(f)
# Display the labels_ attribute of the unpickled object
model.labels_