In [None]:
!pip install --upgrade pandas

In [None]:
import re

import boto3
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which this notebook stores its objects in the bucket.
prefix = 'AIMLwithAWS/Chapter5'

# Default bucket of a freshly created SageMaker session.
bucket = sagemaker.Session().default_bucket()

# Define IAM role
role = get_execution_role()

In [None]:
import numpy as np 
import pandas as pd   
from IPython.display import display 
from time import gmtime, strftime 
import sys 
import math 
import json 
import os 
import sagemaker 
import zipfile 

In [None]:
# Show the version string of the pandas module that is currently imported as `pd`.
pd.__version__

In [None]:
s3 = boto3.client("s3")
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

data = pd.read_csv("./winequality-white.csv", delimiter=";")
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 50)

In [None]:
# Cap the number of columns / rows pandas renders for a displayed frame.
for option_name, option_value in (('display.max_columns', 20), ('display.max_rows', 50)):
    pd.set_option(option_name, option_value)

# Read the ';'-separated file from the working directory into a fresh frame.
data = pd.read_csv("./winequality-white.csv", sep=";")

# Last expression of the cell: the frame is rendered as the cell output.
data

In [None]:
# Relabel the columns positionally with these twelve names (assigned in place on `data`).
wine_column_names = (
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "Chlorides", "free sulfur dioxide", "total sulfur dioxide", "Density",
    "pH", "Sulphates", "Alcohol", "quality",
)
data.columns = list(wine_column_names)

In [None]:
# Quick profile of the frame: its dimensions as plain text, then a preview of the first
# rows, summary statistics, and the count of each distinct value in the quality column.
print(data.shape)
display(data.head(), data.describe(), data['quality'].value_counts())

In [None]:
# Look up the XGBoost image URI for the region of the default boto3 session.
# NOTE(review): version='latest' may resolve to a legacy XGBoost image; confirm against
# the SageMaker XGBoost documentation whether an explicit version should be pinned.
aws_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(framework='xgboost', region=aws_region, version='latest')

In [None]:
# Shuffle all rows reproducibly, then cut the shuffled frame at 70% and 90% of its length:
# first 70% -> train, next 20% -> validation, remaining 10% -> test.
# Plain .iloc slices replace np.split here: np.split on a DataFrame goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases. The resulting
# partitions are the same rows in the same order.
shuffled_data = data.sample(frac=1, random_state=5621)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [None]:
# Write the train and validation splits as header-less, index-less CSV files with the
# 'quality' column moved to the first position.
for split_frame, csv_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    target_column = split_frame['quality']
    feature_columns = split_frame.drop(columns=['quality'])
    label_first = pd.concat([target_column, feature_columns], axis=1)
    label_first.to_csv(csv_name, index=False, header=False)

In [None]:
# Upload both CSV files to <prefix>/<channel>/<file> in the bucket.
# Object keys are built with explicit '/' separators: os.path.join uses the separator of
# the local operating system, which is not guaranteed to be '/', while S3 keys (and the
# s3:// URIs this notebook builds with str.format) always use '/'.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel, local_csv in (('train', 'train.csv'), ('validation', 'validation.csv')):
    object_key = '{}/{}/{}'.format(prefix, channel, local_csv)
    s3_bucket.Object(object_key).upload_file(local_csv)

In [None]:
# CSV input channels for training jobs, one per uploaded folder.
# Both URIs end with '/' so that the two channels are specified consistently and each
# prefix only matches keys inside its own folder (a bare '.../train' prefix would also
# match any sibling key that merely starts with 'train').
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [None]:
# Session handed to the estimator below.
session = sagemaker.Session()

# Estimator for the image in `container`: one ml.m5.xlarge instance, with output
# written under <prefix>/output in the bucket.
model_output_uri = 's3://{}/{}/output'.format(bucket, prefix)
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=model_output_uri,
    sagemaker_session=session,
)

# Fixed hyperparameters for the stand-alone training run.
# NOTE(review): 'reg:linear' is reported as deprecated in newer XGBoost releases;
# confirm it is accepted by the image resolved into `container` before changing it.
xgb_hyperparameters = {
    'max_depth': 4,
    'eta': 0.3,
    'gamma': 3,
    'min_child_weight': 7,
    'subsample': 0.6,
    'verbosity': 1,
    'objective': 'reg:linear',
    'num_round': 50,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Train using the 'train' and 'validation' channels.
training_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb_estimator.fit(training_channels)

In [None]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

# Search space handed to the tuner: four continuous ranges and one integer range.
hyperparameter_ranges = dict(
    alpha=ContinuousParameter(0, 2),
    min_child_weight=ContinuousParameter(1, 10),
    subsample=ContinuousParameter(0.5, 1),
    eta=ContinuousParameter(0, 1),
    num_round=IntegerParameter(1, 4000),
)

In [None]:
# Metric the tuner optimises and the direction in which it is optimised.
objective_metric_name, objective_type = 'validation:rmse', 'Minimize'

In [None]:
# Tuner built on the XGBoost estimator: five jobs in total, all five allowed to run
# at the same time, optimising the objective metric over the declared ranges.
tuner_settings = {
    'objective_metric_name': objective_metric_name,
    'objective_type': objective_type,
    'hyperparameter_ranges': hyperparameter_ranges,
    'max_jobs': 5,
    'max_parallel_jobs': 5,
}
xgb_hp_tuner = HyperparameterTuner(estimator=xgb_estimator, **tuner_settings)


In [None]:
# Start the tuning job using the 'train' and 'validation' channels.
tuning_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb_hp_tuner.fit(tuning_channels)

In [None]:
# Look up the tuner's latest tuning job by name and show its status field as the
# cell output.
sagemaker_client = boto3.client('sagemaker')
tuning_job_description = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=xgb_hp_tuner.latest_tuning_job.job_name
)
tuning_job_description['HyperParameterTuningJobStatus']

In [None]:
# Ask the tuner for its best training job; the returned value is rendered as the cell
# output. NOTE(review): presumably this is the name of the job with the best objective
# value — confirm against the SageMaker SDK documentation.
xgb_hp_tuner.best_training_job()