In [8]:
# Optional: add the adapter project root to Python path if not installed.
import sys
import os

# Resolve the home directory portably: os.environ["HOME"] raises KeyError on
# platforms where HOME is not set (e.g. Windows), whereas expanduser("~")
# falls back to the platform's own lookup and still honours HOME on POSIX.
adapter_path = os.path.join(os.path.expanduser("~"), "valohai-sagemaker-adapter")
# Guarded so that re-running this cell does not append duplicate entries.
if adapter_path not in sys.path:
    sys.path.append(adapter_path)

In [9]:
# Imports
from valohai_sagemaker.code_container import CodeContainer
import valohai_sagemaker.docker as docker

In [10]:
# Describe the code container "test-container": it adds the training file
# "train.py" and installs the pip packages "numpy", "scipy" and "scikit-learn".
training_container = CodeContainer(
    "test-container",
    files_to_copy=["train.py"],
    pip_packages=["numpy", "scipy", "scikit-learn"],
)

# Wrap the container in a Docker image that inherits from
# "ufoym/deepo:pytorch" (emitted as a FROM command).
image = docker.Image(training_container, froms=["ufoym/deepo:pytorch"])

In [4]:
# Build the Docker image locally.
# This step is optional: `image.build()` is also called inside `image.push()`,
# which in turn is called by `adapter.create_estimator()`.
image.build()

Login Succeeded
Sending build context to Docker daemon  28.67kB
Step 1/10 : FROM ubuntu:16.04
 ---> 0b1edfbffd27
Step 2/10 : FROM ufoym/deepo:pytorch
 ---> e3bf38883164
Step 3/10 : RUN apt-get -y update && apt-get install -y --no-install-recommends          wget          nginx          ca-certificates     && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> fa89ec2231aa
Step 4/10 : RUN pip3.6 install flask gevent gunicorn && rm -rf /root/.cache
 ---> Using cache
 ---> b959e2a43c66
Step 5/10 : RUN pip3.6 install numpy scipy scikit-learn && rm -rf /root/.cache
 ---> Using cache
 ---> 37aee3517ce0
Step 6/10 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 3660375254e7
Step 7/10 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 2ac9a0a537aa
Step 8/10 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> 40de214a95e8
Step 9/10 : COPY model /opt/program
 ---> Using cache
 ---> 8279d17d756b
Step 10/10 : WORKDIR /opt/program
 ---> Using cache
 ---> 02b2b4097ce8
Successfully

In [5]:
# Run the training locally. Optional: skip this step if you only want to
# train on a dedicated instance.
image.train()

hello from docker sagemaker
argv: ['/opt/program/train.py']
device:  cpu
in the middle
0 741.7901000976562
1 683.873046875
2 635.3046264648438
3 594.1837768554688
4 557.6737060546875
5 524.572265625
6 494.839599609375
7 467.61199951171875
8 442.3117980957031
9 418.59722900390625
10 396.4313659667969
11 375.593994140625
12 356.0154724121094
13 337.5177307128906
14 320.03326416015625
15 303.393798828125
16 287.4770812988281
17 272.2982482910156
18 257.8002624511719
19 243.94674682617188
20 230.71742248535156
21 218.09413146972656
22 206.04310607910156
23 194.51222229003906
24 183.49380493164062
25 172.9560546875
26 162.90655517578125
27 153.4054412841797
28 144.3697052001953
29 135.77975463867188
30 127.62995147705078
31 119.95418548583984
32 112.69477081298828
33 105.84770965576172
34 99.38553619384766
35 93.29308319091797
36 87.53601837158203
37 82.12042236328125
38 77.02863311767578
39 72.25255584716797
40 67.7778091430664
41 63.588523864746094
42 59.65644073486328
43 55.9641227722168

In [6]:
# Imports
from valohai_sagemaker.sagemaker import SageMakerAdapter

In [7]:
# Wrap the Docker image in a SageMaker adapter; it is used to create the
# estimator that trains on a remote instance.
adapter = SageMakerAdapter(image)

In [9]:
# Build a SageMaker SDK estimator from the image.
# `output_path` is optional; it tells SageMaker where to upload the trained
# model, which the training process must save in "/opt/ml/model".
instance_type = "ml.c5.2xlarge"
model_output_path = "s3://some/output/path/on/s3"
estimator = adapter.create_estimator(
    train_instance_type=instance_type,
    output_path=model_output_path,
)

Login Succeeded
Sending build context to Docker daemon  28.67kB
Step 1/10 : FROM ubuntu:16.04
 ---> 0b1edfbffd27
Step 2/10 : FROM ufoym/deepo:pytorch
 ---> e3bf38883164
Step 3/10 : RUN apt-get -y update && apt-get install -y --no-install-recommends          wget          nginx          ca-certificates     && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> fa89ec2231aa
Step 4/10 : RUN pip3.6 install flask gevent gunicorn && rm -rf /root/.cache
 ---> Using cache
 ---> b959e2a43c66
Step 5/10 : RUN pip3.6 install numpy scipy scikit-learn && rm -rf /root/.cache
 ---> Using cache
 ---> 37aee3517ce0
Step 6/10 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 3660375254e7
Step 7/10 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 2ac9a0a537aa
Step 8/10 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> 40de214a95e8
Step 9/10 : COPY model /opt/program
 ---> Using cache
 ---> 8279d17d756b
Step 10/10 : WORKDIR /opt/program
 ---> Using cache
 ---> 02b2b4097ce8
Successfully b

In [10]:
# Everything from here on is plain SageMaker SDK usage: launch a training job
# without waiting for it in the notebook (wait=False).
#
# Run the image remotely using the SageMaker SDK. If the model is saved under
# "/opt/ml/model/", it is uploaded automatically to S3 in .tar.gz format.
# The `inputs` argument of fit() is usually a str or a dict specifying the
# channels of data to copy into the container under "/opt/ml/input/data/".
# For example, the S3 path below is copied to "/opt/ml/input/data/training".
input_channels = {"training": "s3://some/path/in/s3/to/copy"}
estimator.fit(input_channels, wait=False)

INFO:sagemaker:Creating training-job with name: test-image-2018-05-10-22-18-38-809


................................................
[31mhello from docker sagemaker[0m
[31margv: ['/opt/program/train.py'][0m
[31mdevice:  cpu[0m
[31min the middle[0m
[31m0 686.819580078125[0m
[31m1 639.234375[0m
[31m2 597.58251953125[0m
[31m3 560.6435546875[0m
[31m4 527.6019897460938[0m
[31m5 497.6590270996094[0m
[31m6 470.3472595214844[0m
[31m7 445.128662109375[0m
[31m8 421.7899169921875[0m
[31m9 400.19378662109375[0m
[31m10 379.884033203125[0m
[31m11 360.6611328125[0m
[31m12 342.5826721191406[0m
[31m13 325.43267822265625[0m
[31m14 309.1417541503906[0m
[31m15 293.6123046875[0m
[31m16 278.83001708984375[0m
[31m17 264.6995849609375[0m
[31m18 251.1197509765625[0m
[31m19 238.10903930664062[0m
[31m20 225.61582946777344[0m
[31m21 213.64715576171875[0m
[31m22 202.2110137939453[0m
[31m23 191.2910919189453[0m
[31m24 180.86209106445312[0m
[31m25 170.97122192382812[0m
[31m26 161.53521728515625[0m
[31m27 152.54762268066406[0m
[31m28 1

===== Job Complete =====
Billable seconds: 141
