#### Imports

In [1]:
import time

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

#### Initialize session, role, region, and container

In [5]:
# SageMaker session plus the role returned by get_execution_role(); `role` is
# passed as the training job's RoleArn further down.
sess = sagemaker.Session()
role = get_execution_role()

# Region of the default boto3 session; reused below so the lookup is not
# repeated with a second throwaway boto3.Session().
region = boto3.Session().region_name

# Image URI for the 'xgboost' algorithm in this region; used later as the
# training job's TrainingImage.
container = get_image_uri(region, 'xgboost')

#### Load Data

In [16]:
# S3 location of the dataset: the bucket name and the key prefix that is
# prepended to every object name (note that the prefix ends with "/").
bucket, prefix = 'bryan-predictive-maintenance', 'sagemaker/'

def load_from_s3(fname):
    """Read ``<prefix><fname>.csv`` from S3 into a DataFrame.

    Uses the module-level ``bucket`` and ``prefix`` variables to locate the
    object, and creates a new boto3 S3 client on every call.

    Parameters
    ----------
    fname : str
        Object name without the ``.csv`` extension; it is appended directly
        to ``prefix`` (no separator is inserted).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, read with ``low_memory=False``.
    """
    object_key = "{p}{f}.csv".format(p=prefix, f=fname)
    s3_object = boto3.client('s3').get_object(Bucket=bucket, Key=object_key)
    # The response body is a file-like stream that pandas can read directly.
    return pd.read_csv(s3_object["Body"], low_memory=False)

# Fetch the "train001" CSV and preview its first rows.
df = load_from_s3(fname="train001")
df.head()

Unnamed: 0,ENGINE_NUMBER,TIME_IN_CYCLES,OPERATIONAL_SETTING_1,OPERATIONAL_SETTING_2,OPERATIONAL_SETTING_3,SENSOR_MEASUREMENT_2,SENSOR_MEASUREMENT_3,SENSOR_MEASUREMENT_4,SENSOR_MEASUREMENT_6,SENSOR_MEASUREMENT_7,SENSOR_MEASUREMENT_8,SENSOR_MEASUREMENT_9,SENSOR_MEASUREMENT_11,SENSOR_MEASUREMENT_12,SENSOR_MEASUREMENT_13,SENSOR_MEASUREMENT_14,SENSOR_MEASUREMENT_15,SENSOR_MEASUREMENT_17,SENSOR_MEASUREMENT_20,SENSOR_MEASUREMENT_21
0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,21.61,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044


#### Split training data (TODO: the split is not implemented yet; the cell below only reloads the data)

In [12]:
# Reload the training file through the shared `load_from_s3` helper instead of
# repeating the bucket / prefix / S3-client boilerplate from the "Load Data"
# cell. `bucket` and `prefix` are already defined there with the same values.
# TODO: no train/validation split is actually performed here yet.
df = load_from_s3("train001")
df.head()

Unnamed: 0,ENGINE_NUMBER,TIME_IN_CYCLES,OPERATIONAL_SETTING_1,OPERATIONAL_SETTING_2,OPERATIONAL_SETTING_3,SENSOR_MEASUREMENT_2,SENSOR_MEASUREMENT_3,SENSOR_MEASUREMENT_4,SENSOR_MEASUREMENT_6,SENSOR_MEASUREMENT_7,SENSOR_MEASUREMENT_8,SENSOR_MEASUREMENT_9,SENSOR_MEASUREMENT_11,SENSOR_MEASUREMENT_12,SENSOR_MEASUREMENT_13,SENSOR_MEASUREMENT_14,SENSOR_MEASUREMENT_15,SENSOR_MEASUREMENT_17,SENSOR_MEASUREMENT_20,SENSOR_MEASUREMENT_21
0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,21.61,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044


#### Train an XGBoost model (code adapted from an external example)

In [None]:
%%time
import boto3
from time import gmtime, strftime

job_name = 'bryan-was-here-xgboost-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

#Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/" + prefix + "/single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/train',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/validation',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker')
client.create_training_job(**create_training_params)

import time

status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status !='Completed' and status!='Failed':
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)