In [4]:
import os
import pandas as pd
import numpy as np

from sklearn import model_selection

import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

import clean_data


raw = pd.read_csv('05_pretrain_wID.csv')
raw.drop(columns = {'Unnamed: 0'}, inplace = True)

In [5]:
raw.head()

Unnamed: 0,ID,score,type,high_credit,balance,active,status,overdue,status_payment
0,44890d77-94da-49a5-8eef-0bdcf7f65979,373,EDU,18326,0,False,PAID,0,ON TIME
1,44890d77-94da-49a5-8eef-0bdcf7f65979,373,EDU,18326,0,False,PAID,0,ON TIME
2,44890d77-94da-49a5-8eef-0bdcf7f65979,373,EDU,18326,0,False,PAID,0,ON TIME
3,44890d77-94da-49a5-8eef-0bdcf7f65979,373,EDU,18326,0,False,PAID,0,ON TIME
4,44890d77-94da-49a5-8eef-0bdcf7f65979,373,EDU,18326,0,False,PAID,0,ON TIME


In [6]:
# call Preprocess
prepro_in = clean_data.PreprocessBaseline(raw)
df, X = prepro_in()

In [27]:
train = df.iloc[:1900060]
val = df.iloc[1900060:]
train.drop(columns = {'ID'}, inplace = True)
val.drop(columns = {'ID'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [30]:
# model
data_dir = '../data-prod'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

train.to_csv(os.path.join(data_dir, 'train.csv'), header = None, index = None)
val.to_csv(os.path.join(data_dir, 'validation.csv'), header = None, index = None)



session = sagemaker.Session()

prefix = '02-model'
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), 
                                     key_prefix = prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), 
                                   key_prefix = prefix)

container = get_image_uri(session.boto_region_name, 'xgboost')
role = get_execution_role()

xgb = sagemaker.estimator.Estimator(container, role, train_instance_count=1,
                                   train_instance_type = 'ml.m4.10xlarge',
                                   output_path = 's3://{}/{}/output'.format(session.default_bucket(), prefix),
                                   sagemaker_session = session)


xgb.set_hyperparameters(max_depth=5, eta=0.2, gamma=4, min_child_weight=6,
                       subsample=0.6, objective='reg:linear', num_round=200)


xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, 
                                               objective_metric_name = 'validation:rmse', 
                                               objective_type = 'Minimize', 
                                               max_jobs = 20, 
                                               max_parallel_jobs = 1, 
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

# wrapper
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type = 'csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

xgb_hyperparameter_tuner.fit({'train' : s3_input_train, 'validation' : s3_input_validation})


xgb_hyperparameter_tuner.wait()

	get_image_uri(region, 'xgboost', '0.90-1').


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [31]:
xgb_hyperparameter_tuner.best_training_job()

'xgboost-200702-2126-013-67dc7500'

In [32]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())


2020-07-02 22:30:23 Starting - Preparing the instances for training
2020-07-02 22:30:23 Downloading - Downloading input data
2020-07-02 22:30:23 Training - Training image download completed. Training in progress.
2020-07-02 22:30:23 Uploading - Uploading generated training model
2020-07-02 22:30:23 Completed - Training job completed[34mArguments: train[0m
[34m[2020-07-02:22:29:36:INFO] Running standalone xgboost training.[0m
[34m[2020-07-02:22:29:36:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2020-07-02:22:29:36:INFO] File size need to be processed in the node: 209.9mb. Available memory size in the node: 152511.04mb[0m
[34m[2020-07-02:22:29:36:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:29:36] S3DistributionType set as FullyReplicated[0m
[34m[22:29:36] 1900060x14 matrix with 26600840 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-02:22:29:36:INFO] Determined delimiter of CSV input is ','[0m

In [33]:
# get model back
session = sagemaker.Session()
role = get_execution_role()
container = get_image_uri(session.boto_region_name, 'xgboost')
xgb_attached = sagemaker.model.Model(model_data = 's3://sagemaker-us-east-1-922059106485/00-data/output/xgboost-200630-0734-018-5c272122/output/model.tar.gz',
                                    image= container,
                                    role = role)
# deploy
xgb_predictor = xgb_attached.deploy(initial_instance_count=1, instance_type = 'ml.t2.medium')

	get_image_uri(region, 'xgboost', '0.90-1').


---------------------!

In [None]:
xgb_predictor.endpoint