In [None]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sagemaker.amazon.amazon_estimator import get_image_uri

# Notebook-wide configuration: execution role, S3 locations and region.
role = get_execution_role()
bucket_name = 'dgx-ds-use1-dev-landing-s3'
prefix = 'kamal/OrderTAT-xgboost'
my_region = boto3.session.Session().region_name # set the region of the instance

# Look up the image URI of the built-in XGBoost algorithm (version 1.2-2) for this
# region. This only resolves a URI string; no container is built here.
# image_uris.retrieve is the SDK v2 replacement for the deprecated get_image_uri.
xgboost_container = sagemaker.image_uris.retrieve(framework="xgboost", region=my_region, version="1.2-2")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

In [None]:
# Read the prepared order data from S3 into a DataFrame (first CSV column is the index).
data_key = 'kamal/input/order_data_prep_job1.csv'
data_location = 's3://{}/{}'.format(bucket_name, data_key)

try:
    df = pd.read_csv(data_location, index_col=0)
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `df`, so continuing
    # would only surface as an unrelated NameError further down.
    print('Data load error: ',e)
    raise
else:
    print('Success: Data loaded into dataframe.')

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# Target encoding of ORDER_CODE_N: blend each code's mean TAT_HOUR (weight 0.6) with
# the overall mean TAT_HOUR (weight 0.4), rounded to 2 decimals, then drop the raw code.
# NOTE(review): the means are computed over the whole frame, before the train/test
# split made further down, so test-row targets feed into this feature (target
# leakage). Consider fitting the encoding on the training rows only.
if 'ORDER_CODE_N' in df.columns:  # guard so the cell can be re-run after the column is dropped
    overall_mean = df['TAT_HOUR'].mean()
    per_code_mean = df.groupby('ORDER_CODE_N')['TAT_HOUR'].transform('mean')
    df['ORDER_ENC'] = ((0.6 * per_code_mean) + (0.4 * overall_mean)).round(2)
    df = df.drop(columns=['ORDER_CODE_N'])

In [None]:
# Preview the frame again after the target-encoding step.
df.head(3)

In [None]:
%%time 

model_data = pd.get_dummies(df, drop_first = True)
model_data.head(5)

In [None]:
# Rebind `df` to an empty list so the name no longer references the DataFrame
# (presumably to release memory before the heavier steps below), then show the
# shape of the encoded frame.
df = list()
model_data.shape

In [None]:
# The full ~2.4M-row set was too large to work with, so cap the working set at
# 2,000,000 rows. The cap is clamped to the frame length so the cell does not
# raise when fewer rows are available.
MAX_ROWS = 2000000
model_data = model_data.sample(n=min(MAX_ROWS, len(model_data)), random_state=11)

In [None]:
# Shuffle once with a fixed seed, then take the first 80% as train and the rest as test.
# Plain iloc slicing is used instead of np.split, which on a DataFrame goes through
# the deprecated DataFrame.swapaxes.
shuffled = model_data.sample(frac=1, random_state=42)
split_at = int(0.8 * len(model_data))
train_data = shuffled.iloc[:split_at]
test_data = shuffled.iloc[split_at:]
print(train_data.shape, test_data.shape)

In [None]:
%%time 

data_key = prefix + '/train/train.csv' 
data_location = 's3://{}/{}'.format(bucket_name, data_key) 

train_data.to_csv(data_location, index=False, header=False)
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [None]:
from datetime import datetime

# Current local time rendered as YYYY-MM-DD-HH-MM-SS.
timestamp_format = "%Y-%m-%d-%H-%M-%S"
current_time = datetime.now()
dt_string = current_time.strftime(timestamp_format)
print("date and time =", dt_string)

In [None]:
# SageMaker session handed to the estimator below.
sess = sagemaker.Session()

# Estimator for the XGBoost container: one ml.m5.2xlarge training instance,
# model artifacts written under <prefix>/output in the bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    base_job_name='kamal-sagemaker-orderTAT-xgb-2xl-',
    instance_type='ml.m5.2xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

# Hyperparameters for a squared-error regression, 100 boosting rounds.
xgb_hyperparameters = {
    'max_depth': 6,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'reg:squarederror',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Launch the training job using the uploaded CSV as the 'train' channel.
training_channels = {'train': s3_input_train}
xgb.fit(inputs=training_channels)

In [None]:
# Print the estimator object as a quick sanity check.
print(xgb)

In [None]:
# Model store: save what is needed to re-attach to the trained model later, and
# put it at model_loc in S3.
data_key = prefix + '/model/modelXGB'
model_loc = 's3://{}/{}'.format(bucket_name, data_key)
print(model_loc)

import pickle

# Only plain strings are pickled. The Estimator itself holds live boto3/SageMaker
# sessions, which are not reliably picklable; the trained artifact already lives
# in S3 at xgb.model_data.
model_info = {
    'training_job_name': xgb.latest_training_job.name,
    'model_data': xgb.model_data,
    'image_uri': xgboost_container,
}
with open('modelXGB','wb') as file:
    pickle.dump(model_info,file)

# upload_data stores the file under <key_prefix>/<file name>, i.e. exactly model_loc.
xgb.sagemaker_session.upload_data(path='modelXGB', bucket=bucket_name, key_prefix=prefix + '/model')

In [None]:
# Deploy the trained model behind a real-time endpoint on a single ml.m5.large instance.
xgb_predictor = xgb.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

In [None]:
%%time 

from sagemaker.serializers import CSVSerializer
test_data_p = test_data.sample(n=3000)
test_data_array = test_data_p.drop(['TAT_HOUR'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error

# Regression metrics for the endpoint predictions on the sampled test rows.
y_true = test_data_p['TAT_HOUR']

model_score = explained_variance_score(y_true, predictions_array)
print("Score = ",model_score)

# k is the number of predictors, so the TAT_HOUR target column is excluded.
k = test_data_p.shape[1] - 1
n = len(test_data_p)
MSE = mean_squared_error(y_true, predictions_array)
MAE = mean_absolute_error(y_true, predictions_array)
r2 = r2_score(y_true, predictions_array)
# Adjusted R2 penalises R2 for the number of predictors relative to sample size.
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
print('RMSE =',(MSE**(1/2.0)), '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

In [None]:
# Actual vs. predicted TAT_HOUR for the sampled test rows. Both series are drawn
# against the sample position so they share one x-axis.
y_actual = test_data_p['TAT_HOUR'].values
x_ax = range(len(y_actual))
plt.scatter(x_ax, y_actual, s=5, color="blue", label="original")
plt.plot(x_ax, predictions_array, lw=0.8, color="red", label="predicted")
plt.xlabel("sample index")
plt.ylabel("TAT_HOUR")
plt.legend()
plt.show()

In [None]:
# Endpoint cleanup is done manually in the dashboard; uncomment the line below to delete the endpoint from code instead.
#xgb_predictor.delete_endpoint(delete_endpoint_config=True)

In [None]:
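# WARNING: uncommenting these lines deletes EVERY object in bucket_name, not only the files written by this notebook.
#bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
#bucket_to_delete.objects.all().delete()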

In [None]:
# Batch transform, step 1: work out the S3 input/output prefixes and stage the
# test features (target column removed, no header, no index) as the job input.
data_key = prefix + '/test'
batch_input = 's3://' + bucket_name + '/' + data_key

data_key2 = prefix + '/batch-predict'
batch_output = 's3://' + bucket_name + '/' + data_key2

for s3_location in (batch_input, batch_output):
    print(s3_location)

test_features = test_data.drop(columns=['TAT_HOUR'])
test_features.to_csv(batch_input + '/test.csv', index=False, header=False)

In [None]:
%%time

transformer = xgb.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=batch_output
)

transformer.transform(
    data=batch_input, 
    data_type='S3Prefix',
    content_type='text/csv', 
    split_type='Line'
)
transformer.wait()

In [None]:
data_key2 = prefix + '/batch-predict/test.csv.out' 
batch_output = 's3://{}/{}'.format(bucket_name, data_key2) 
pred_y = pd.read_csv(batch_output)

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

model_score = explained_variance_score(test_data['TAT_HOUR'][1:], pred_y)
print("Score = ",model_score)

k = test_data.shape[1]
n = len(test_data)
MSE = mean_squared_error(test_data['TAT_HOUR'][1:], pred_y)
MAE = mean_absolute_error(test_data['TAT_HOUR'][1:], pred_y)
r2 = r2_score(test_data['TAT_HOUR'][1:], pred_y)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
#print("MSE: %.2f" % MSE)
#print("RMSE: %.2f" % (mse**(1/2.0)))
print('RMSE = ',(MSE**(1/2.0)), '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
# Build a Model object that points at a previously trained artifact in S3,
# using the same container image and role as the training run.
trained_artifact_uri = 's3://dgx-ds-use1-dev-landing-s3/kamal/OrderTAT-xgboost/output/sagemaker-xgboost-2021-10-26-03-08-13-506/output/model.tar.gz'
modelx = sagemaker.model.Model(
    image_uri=xgboost_container,
    model_data=trained_artifact_uri,
    role=role,
)

print(modelx)