In [1]:
!pip install pyarrow
!pip install s3fs



In [None]:
import os
import boto3
import re
import sagemaker

# S3 location used for the training files and the model artifacts.
# Replace both placeholders with your own bucket name and key prefix.
bucket = '<YOUR BUCKET>'
path = '<YOUR_PREFIX>'  # prefix inside the bucket under which training files are uploaded

# Region of the default boto3 session and the execution role that is later
# passed to the training job as its RoleArn.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

Now we'll import the Python libraries we'll need.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac
import s3fs
import pyarrow.parquet as pq


In [None]:
# Read the Parquet dataset stored under s3://<bucket>/<path> into a pandas DataFrame.
fs = s3fs.S3FileSystem()
bucket_uri = f's3://{bucket}/{path}'
dataset = pq.ParquetDataset(bucket_uri, filesystem=fs)
table = dataset.read()
df = table.to_pandas()

# Move the label column ('Shape') to the first position; the CSV files written
# later have no header, so the label is identified by position only.
first_col = df.pop('Shape')
df.insert(0, 'Shape', first_col)

# Encode the label as integer category codes. The result has to be assigned
# back: astype()/cat.codes return a new Series and leave df untouched otherwise,
# which would send the raw (non-encoded) labels to the training job.
df['Shape'] = df['Shape'].astype('category').cat.codes

# print the shape of the data file
print(df.shape)

# show the top few rows
display(df.head())

# describe the data object
display(df.describe())



## Create Features and Labels
#### Split the data into 70% training and 30% validation.

In [None]:
# Randomly assign roughly 70% of the rows to training and the remaining ~30% to
# validation, then save both sets locally.
# A locally seeded generator keeps the split reproducible across kernel
# restarts without touching NumPy's global random state.
mtrain_list = np.random.RandomState(42).rand(len(df)) < 0.7
mdata_train = df[mtrain_list]
mdata_val = df[~mtrain_list]

# Written without header and without index column.
mdata_train.to_csv("mformatted_train.csv", sep=',', header=False, index=False)         # save training data
mdata_val.to_csv("mformatted_val.csv", sep=',', header=False, index=False)             # save validation data


In [None]:
mtrain_file = 'mformatted_train.csv'
mval_file = 'mformatted_val.csv'

# Build the object keys with plain '/' separators, following the same
# "<path>/train/" and "<path>/val/" pattern that the training job's S3Uri values
# use. os.path.join is meant for local file-system paths: its separator depends
# on the OS and it normalises slashes differently from the S3Uri strings, so the
# uploaded keys could end up outside the prefixes the training job reads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object(f'{path}/train/{mtrain_file}').upload_file(mtrain_file)
s3_bucket.Object(f'{path}/val/{mval_file}').upload_file(mval_file)

In [None]:
# Region -> XGBoost training image URI. Each URI has the form
# <account>.dkr.ecr.<region>.amazonaws.com/xgboost:latest; only the regions
# listed here can be looked up.
Mxgboost_containers = {
    region_name: f'{account_id}.dkr.ecr.{region_name}.amazonaws.com/xgboost:latest'
    for region_name, account_id in (
        ('us-west-2', '433757028032'),
        ('us-east-1', '811284229777'),
        ('us-east-2', '825641698319'),
        ('eu-west-1', '685385470294'),
    )
}

In [None]:
import boto3
from time import gmtime, strftime

# Timestamped (UTC) job name so that every run gets a distinct training job.
mjob_name = 'Mxgboost-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", mjob_name)

# Request body for the SageMaker create_training_job call.
mcreate_training_params = {
    # Training image for the current region, read in "File" input mode.
    "AlgorithmSpecification": {
        "TrainingImage": Mxgboost_containers[boto3.Session().region_name],
        "TrainingInputMode": "File",
    },
    "RoleArn": role,
    # Model artifacts are written below <path>/single-xgboost/ in the bucket.
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/single-xgboost/".format(bucket, path),
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 1000,
    },
    "TrainingJobName": mjob_name,
    # All hyperparameter values are passed as strings.
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.1",
        "gamma": "1",
        "min_child_weight": "1",
        "silent": "0",
        "objective": "multi:softmax",  # for multiclass
        "num_round": "20",
        "num_class": "6",
    },
    # Stop the job after at most one hour.
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60,
    },
    # One uncompressed CSV channel per data set; both channels share the same
    # layout and differ only in name and S3 prefix.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                    "S3DataDistributionType": "FullyReplicated",
                },
            },
            "ContentType": "csv",
            "CompressionType": "None",
        }
        for channel_name, channel_uri in (
            ("train", "s3://{}/{}/train/".format(bucket, path)),
            ("validation", "s3://{}/{}/val/".format(bucket, path)),
        )
    ],
}

%%time

region = boto3.Session().region_name sm = boto3.client('sagemaker')

sm.create_training_job(**mcreate_training_params)

status = sm.describe_training_job(TrainingJobName=mjob_name)['TrainingJobStatu s']
print(status) sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=mjob_name)
if status == 'Failed':
message = sm.describe_training_job(TrainingJobName=mjob_name)['FailureReas on']
print('Training failed with the following error: {}'.format(message)) raise Exception('Training job failure')

In [None]:
# Once the model is trained, it can be deployed and tested.
# Because this is only an example notebook, we point to other, complete notebooks for prediction examples:

# https://aws.amazon.com/blogs/machine-learning/create-a-model-for-predicting-orthopedic-pathology-using-amazon-sagemaker/
# https://www.proud2becloud.com/iot-ingestion-and-ml-analytics-pipeline-with-aws-iot-kinesis-and-sagemaker/
# https://www.proud2becloud.com/a-clustering-process-with-sagemaker-experiments-a-real-world-use-case/
