In [41]:
import boto3
import joblib
import os
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter, IntegerParameter, ContinuousParameter
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
import sagemaker

In [42]:
role = get_execution_role()
region = boto3.Session().region_name
bucket = Session().default_bucket()

In [63]:
container = image_uris.retrieve('xgboost', Session().boto_region_name, '1.2-1' )

In [44]:
s3_input_train = TrainingInput(s3_data='s3://{}/data/train-tf-44898-5000-1'.format(bucket), content_type='csv')
s3_input_test = TrainingInput(s3_data='s3://{}/data/test-tf-44898-5000-1'.format(bucket), content_type='csv')

In [33]:
xgb = Estimator(container,
                role,
                instance_count=1,
                instance_type='ml.m4.xlarge')

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='binary:logistic',
                        num_round=100)

xgb.fit({'train': s3_input_train, 'validation': s3_input_test})

2021-01-01 19:07:52 Starting - Starting the training job...
2021-01-01 19:08:15 Starting - Launching requested ML instancesProfilerReport-1609528072: InProgress
......
2021-01-01 19:09:22 Starting - Preparing the instances for training............
2021-01-01 19:11:17 Downloading - Downloading input data...
2021-01-01 19:11:49 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:roo

In [34]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

-----------------!

In [38]:
xgb_predictor.serializer = CSVSerializer()

In [36]:
validation_data = pd.read_csv('s3://{}/data/validation-tf-44898-5000-1/validation.csv'.format(bucket))
validation_y = validation_data.iloc[:, 0]
validation_X = validation_data.iloc[:, 1:]

In [20]:
xx = np.array_split(validation_X, 100)

In [41]:
def predict(data, rows=100):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [42]:
predictions = predict(validation_X.to_numpy())

In [43]:
pred_y = predictions.round()

In [44]:
accuracy_score(pred_y, validation_y)

0.9835750032080072

In [31]:
accuracy_score(pred_y, validation_y)

0.9708712947517002

In [45]:
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.c5.2xlarge',
                                    sagemaker_session=Session())

In [46]:
xgb.set_hyperparameters(objective='binary:logistic',
                        num_round=100,
                        rate_drop=0.3,
                        tweedie_variance_power=1.4)

In [47]:
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(1, 10)}

In [48]:
tuner = HyperparameterTuner(xgb,
                            objective_metric_name = 'validation:rmse', # The metric used to compare trained models.
                            objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=8,
                            max_parallel_jobs=8)

In [49]:
tuner.fit({'train': s3_input_train, 'validation': s3_input_test})

...............................................................................!


In [68]:
def perform_tuning(session, role, train_data, test_data, base_dict, sweep_dict, n_jobs=10, parallel_jobs=3):
    
    print(session)
    
    container = image_uris.retrieve('xgboost', session.boto_region_name, '1.2-1')
    
    print('here')
    
    xgb = sagemaker.estimator.Estimator(
        container,
        role, 
        instance_count=1, 
        instance_type='ml.c5.2xlarge',
        sagemaker_session=session
    ) 
    
    xgb.set_hyperparameters(**base_dict)
    
    tuner = HyperparameterTuner(
        xgb,
        objective_metric_name='validation:rmse',
        objective_type='Minimize',
        hyperparameter_ranges=sweep_dict,
        max_jobs=n_jobs,
        max_parallel_jobs=parallel_jobs
    )
    
    tuner.fit({'train': train_data, 'validation': test_data})
    
    return tuner

In [89]:
def deploy_best_model(tuner):
    
    predictor = tuner.deploy(initial_instance_count=1, instance_type='ml.t2.xlarge')
    
    return predictor

In [90]:
def perform_prediction(predictor, validation_data, rows=100):
    
    predictor.serializer = CSVSerializer()
    
    validation_y = validation_data.iloc[:, 0]
    validation_X = validation_data.iloc[:, 1:]
    
    split_array = np.array_split(validation_X, int(validation_X.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [91]:
base_dict = {
    'objective': 'binary:logistic',
    'num_round': 100,
    'rate_drop':0.3,
    'tweedie_variance_power': 1.4
}

In [92]:
sweep_dict = {
    'eta': ContinuousParameter(0, 1),
    'min_child_weight': ContinuousParameter(1, 10),
    'alpha': ContinuousParameter(0, 2),
    'max_depth': IntegerParameter(1, 10)
}

In [93]:
sagemaker_session = Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()

In [94]:
s3_input_train = TrainingInput(s3_data='s3://{}/data/train-tf-44898-5000-1'.format(bucket), content_type='csv')
s3_input_test = TrainingInput(s3_data='s3://{}/data/test-tf-44898-5000-1'.format(bucket), content_type='csv')
validation_data = pd.read_csv('s3://{}/data/validation-tf-44898-5000-1/validation.csv'.format(bucket))

In [95]:
tuner = perform_tuning(sagemaker_session, role, s3_input_train, s3_input_test, base_dict, sweep_dict, n_jobs=3, parallel_jobs=3)

<sagemaker.session.Session object at 0x7ff0e1f23710>
here
...........................................................................!


In [None]:
predictor = deploy_best_model(tuner)


2021-01-01 21:40:47 Starting - Preparing the instances for training
2021-01-01 21:40:47 Downloading - Downloading input data
2021-01-01 21:40:47 Training - Training image download completed. Training in progress.
2021-01-01 21:40:47 Uploading - Uploading generated training model
2021-01-01 21:40:47 Completed - Training job completed
----

In [88]:
result = perform_prediction(predictor, validation_data, rows=100)

KeyError: 0