In [None]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sagemaker.amazon.amazon_estimator import get_image_uri
from scipy.stats import zscore

# Notebook-wide configuration: S3 locations first, then the identity and region
# resolved from the running environment.
bucket_name = 'dgx-ds-use1-dev-landing-s3'  # bucket used for the input/output CSVs below
prefix = 'kamal/OrderTAT-tf'                # key prefix used by the batch-transform cells
role = get_execution_role()                 # IAM role handed to the SageMaker jobs
my_region = boto3.session.Session().region_name  # region of the default boto3 session



In [None]:
# Load the prepared order data set from S3; the first CSV column becomes the index.
data_key = 'kamal/input/order_data_prep_job1.csv'
data_location = 's3://{}/{}'.format(bucket_name, data_key)

try:
    df = pd.read_csv(data_location, index_col=0)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `df`, so carrying
    # on would only defer the failure to a confusing NameError further down.
    print('Data load error: ', e)
    raise
else:
    print('Success: Data loaded into dataframe.')

In [None]:
# Preview the first three rows of the loaded frame.
# (disabled) inspect rows whose TAT_HOUR lies strictly between 2000 and 3000:
#df[((df['TAT_HOUR']>2000) & (df['TAT_HOUR']<3000))]
df.head(3)

In [None]:
# Target-encode the order code: each ORDER_CODE_N is replaced by the mean
# TAT_HOUR of its group, blended with the global mean (0.6 group / 0.4 global)
# and rounded to two decimals.
# NOTE(review): the encoding is derived from TAT_HOUR on the full frame, before
# the outlier filter and before the train/test split further down — confirm the
# resulting target leakage is acceptable.
group_mean_tat = df.groupby('ORDER_CODE_N')['TAT_HOUR'].transform('mean')
overall_mean_tat = df['TAT_HOUR'].mean()
smoothed_enc = ((0.6 * group_mean_tat) + (0.4 * overall_mean_tat)).round(2)

# Standardise both numeric features with scipy's zscore.
df['Distance'] = zscore(df['Distance'])
df['ORDER_ENC'] = zscore(smoothed_enc)

# The raw code column is dropped now that its encoding exists.
df.drop('ORDER_CODE_N', axis=1, inplace=True)

In [None]:
# Drop outliers: rows whose TAT_HOUR is negative or above the 99th percentile.
# (A .998 cut-off was tried as well; author's note: loss reduced from 7000 to 900.)
tat_hours = df['TAT_HOUR']
upper_cutoff = tat_hours.quantile(.99)
is_outlier = (tat_hours > upper_cutoff) | (tat_hours < 0)
df = df[~is_outlier]

In [None]:
#label encoding order code and perf lab


In [None]:
# Preview the frame again after encoding and outlier removal.
df.head(3)

In [None]:
%%time 

# One-hot encode the remaining categorical columns; drop_first removes the
# first level of each encoded feature.
model_data = pd.get_dummies(data=df, drop_first=True)
model_data.head(n=5)

In [None]:
# (rows, columns) of the encoded frame.
model_data.shape

In [None]:
# Summary statistics of the encoded frame.
model_data.describe()

In [None]:
# Persist the encoded frame to S3; this location is what the processing job
# below receives as its input.
data_key = 'kamal/input/order_data_tf.csv'
data_location = f's3://{bucket_name}/{data_key}'
print(data_location)

model_data.to_csv(data_location, index=False)

In [None]:
# Sanity check: read back the first ten rows of the file just written.
x = pd.read_csv(data_location, nrows=10)
x.shape

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Resolve region and role, then configure the processor that runs the
# file-prep script on a single ml.m5.4xlarge instance.
region = boto3.session.Session().region_name
role = get_execution_role()

sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=role,
    instance_type="ml.m5.4xlarge",
    instance_count=1,
)

# The processing job reads the CSV written in the previous step.
input_data = data_location

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from datetime import datetime

# Timestamp suffix that makes the job name unique per run.
dt_string = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Run the file-prep script: it receives the CSV under /opt/ml/processing/input
# and its train/test outputs are uploaded to the S3 destinations below.
sklearn_processor.run(
    job_name="kamal-tf-fileprep-" + dt_string,
    code="OrderTAT-tf-fileprep.py",
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train", destination="s3://dgx-ds-use1-dev-landing-s3/kamal/tf-keras-orderTAT/data/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test", destination="s3://dgx-ds-use1-dev-landing-s3/kamal/tf-keras-orderTAT/data/test"),
    ],
    arguments=["--train-test-split-ratio", "0.2"],
)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# Pick the S3 URI of each named output out of the job description.
output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "train_data":
        preprocessed_training_data = output["S3Output"]["S3Uri"]
    elif output["OutputName"] == "test_data":
        # Previously this branch overwrote preprocessed_training_data, so the
        # training URI was lost and no test URI was ever recorded.
        preprocessed_test_data = output["S3Output"]["S3Uri"]
        

In [None]:
#large not working for 2.4M
# Down-sample to at most one million rows with a fixed seed. The cap avoids the
# ValueError pandas raises when asked (without replacement) for more rows than
# the frame holds, so the cell also works on smaller extracts.
sample_rows = min(1000000, len(model_data))
model_data = model_data.sample(n=sample_rows, random_state=11)

In [None]:
# Column labels of the encoded frame; the next cell slices them into target and features.
x_columns = model_data.columns
print(x_columns)

In [None]:
# Convert to numpy: the first column becomes the target y (kept 2-D), every
# other column a feature in x.
# NOTE(review): assumes the first column of model_data is the TAT_HOUR target — confirm.
target_cols, feature_cols = x_columns[:1], x_columns[1:]
x = model_data[feature_cols].values
y = model_data[target_cols].values
print("x,y shape",x.shape, y.shape)

In [None]:
#DO NOT run for fileprep processing
from sklearn.model_selection import train_test_split
# Hold out 20% for testing. The seed is fixed (same value as the sampling cell)
# so that a re-run reproduces the same split and the same saved arrays.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)

In [None]:
#DO NOT run for fileprep processing
# Write the split arrays under ./data: .npy files for the train/test channels
# plus a one-row CSV sample used by the batch-prediction cells.
data_dir = os.path.join(os.getcwd(), 'data')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
csv_test_dir = os.path.join(data_dir, 'csv-test')
for directory in (data_dir, train_dir, test_dir, csv_test_dir):
    os.makedirs(directory, exist_ok=True)

np.save(os.path.join(train_dir, 'x_train.npy'), x_train)
np.save(os.path.join(train_dir, 'y_train.npy'), y_train)
np.save(os.path.join(test_dir, 'x_test.npy'), x_test)
np.save(os.path.join(test_dir, 'y_test.npy'), y_test)

# Keep the sample as floats: the features include z-scored columns, and the
# previous int32 cast with '%d' truncated them (e.g. -0.7 -> 0), so the CSV row
# no longer matched what the model was trained on.
csv_sample = np.asarray(x_test[:1], dtype=np.float64)
np.savetxt(os.path.join(csv_test_dir, 'csv-test.csv'), csv_sample, fmt='%.8g', delimiter=",")

In [None]:
import sagemaker

# S3 layout of the training job's input channels.
s3_prefix = 'kamal/tf-keras-orderTAT'
bucket = 'dgx-ds-use1-dev-landing-s3'

# Both prefixes carry a leading '/', so they can be appended directly to the bucket.
traindata_s3_prefix = f'/{s3_prefix}/data/train'
testdata_s3_prefix = f'/{s3_prefix}/data/test'

train_s3 = f's3://{bucket}{traindata_s3_prefix}'
test_s3 = f's3://{bucket}{testdata_s3_prefix}'

# Channel name -> S3 location, as passed to estimator.fit().
inputs = dict(train=train_s3, test=test_s3)
print(inputs)

In [None]:
#DO NOT run
# Upload the locally saved arrays. The *_s3_prefix values start with '/', which
# as a key_prefix would yield keys with a leading slash and 's3://bucket//...'
# URIs that disagree with train_s3/test_s3 above, so the slash is stripped.
sm_session = sagemaker.Session()
train_s3 = sm_session.upload_data(path='./data/train/', bucket=bucket, key_prefix=traindata_s3_prefix.lstrip('/'))
test_s3 = sm_session.upload_data(path='./data/test/', bucket=bucket, key_prefix=testdata_s3_prefix.lstrip('/'))

inputs = {'train':train_s3, 'test': test_s3}
print(inputs)

In [None]:
from sagemaker.tensorflow import TensorFlow

# Training configuration: model output directory, instance type and the
# hyperparameters handed to the entry-point script.
model_dir = '/opt/ml/model'
train_instance_type = 'ml.m5.4xlarge'
hyperparameters = dict(epochs=10, batch_size=16, learning_rate=0.1)

# Script-mode TensorFlow 2.1 estimator on a single training instance.
estimator = TensorFlow(
    entry_point='OrderTAT-tf-model1.py',
    base_job_name='kamal-tf-orderTAT',
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type=train_instance_type,
    framework_version='2.1',
    py_version='py3',
    script_mode=True,
    model_dir=model_dir,
    hyperparameters=hyperparameters,
)

In [None]:
# Launch the training job with the 'train' and 'test' input channels.
estimator.fit(inputs)

In [None]:
import json 

# Copy the trained model archive from S3 and unpack it into ./model.
!aws s3 cp {estimator.model_data} ./model/model.tar.gz
!tar -xzf ./model/model.tar.gz -C ./model

# history.p is parsed as JSON despite its extension.
# NOTE(review): presumably written by the training script (not visible here) — confirm its format.
with open('./model/history.p', "r") as f:
    history = json.load(f)

In [None]:
import matplotlib.pyplot as plt

def plot_training_curves(history):
    """Plot the training accuracy and loss curves side by side.

    Parameters
    ----------
    history : mapping
        Must provide 'accuracy' and 'loss' entries, each a plottable sequence
        (presumably one value per epoch — the x axis is labelled 'epoch').
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)
    # One panel per metric, left to right.
    for panel, metric in zip(axes, ('accuracy', 'loss')):
        panel.plot(history[metric], label='train')
        panel.set(title='model ' + metric, ylabel=metric, xlabel='epoch')
        panel.legend()
    fig.tight_layout()


plot_training_curves(history)

In [None]:
#batch prediction
# Upload the CSV sample directory so the transform job can read it from S3.
csvtestdata_s3_prefix = f'{s3_prefix}/data/csv-test'
csvtest_s3 = sagemaker.Session().upload_data(
    path='./data/csv-test/',
    bucket=bucket,
    key_prefix=csvtestdata_s3_prefix,
)
print(csvtest_s3)

In [None]:
# Display the estimator object's repr.
estimator

In [None]:
# Batch-transform the uploaded CSV sample on one ml.m5.4xlarge instance and
# block until the job finishes.
transformer = estimator.transformer(
    instance_type='ml.m5.4xlarge',
    instance_count=1,
)

transformer.transform(csvtest_s3, content_type='text/csv')
transform_job_name = transformer.latest_transform_job.job_name
print('Waiting for transform job: ' + transform_job_name)
transformer.wait()

In [None]:
#batch transform
# S3 prefixes for the batch-transform input and output.
data_key = prefix + '/test'
batch_input = f's3://{bucket_name}/{data_key}'

data_key2 = prefix + '/batch-predict'
batch_output = f's3://{bucket_name}/{data_key2}'

print(batch_input)
print(batch_output)

print(x_train.shape, x_test.shape)

# Write the held-out features as a header-less, index-less CSV for the transform job.
xinp = pd.DataFrame(x_test)
xinp.to_csv(batch_input + '/test.csv', header=False, index=False)

In [None]:
%%time

# Run a batch transform over everything under batch_input, sending the input
# one line at a time, and wait for it to complete.
transformer = estimator.transformer(
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    output_path=batch_output,
)

transformer.transform(
    data=batch_input,
    content_type='text/csv',
    data_type='S3Prefix',
    split_type='Line',
)
transformer.wait()

In [None]:
# Prefix the batch transform wrote to.
data_key2 = prefix + '/batch-predict'
batch_output = 's3://{}/{}'.format(bucket_name, data_key2)
print(batch_output)

# Result object for the test.csv input. The URI now uses bucket_name — the same
# variable the transform's output_path was built from — rather than the
# separately defined `bucket`.
data_key2 = prefix + '/batch-predict/test.csv.out'
batch_output = 's3://{}/{}'.format(bucket_name, data_key2)
print(batch_output)

# NOTE(review): read_csv's default header inference turns the first output line
# into the column label, so that prediction is not among the 10 rows — confirm
# this is intended and that the .out file really is CSV.
pred_y = pd.read_csv(batch_output,nrows=10)

In [None]:
# Copy the batch-transform result file into the current working directory.
!aws s3 cp 's3://dgx-ds-use1-dev-landing-s3/kamal/OrderTAT-tf/batch-predict/test.csv.out' .

In [None]:
# Download the batch-transform result(s) into the working directory with boto3.
batch_output = 's3://{}'.format(bucket_name)
print(batch_output)
s3_client = boto3.client('s3')
s3 = boto3.resource('s3')
locations=['kamal/OrderTAT-tf/batch-predict/test.csv.out']
for file in locations:
    # download_file takes the bare bucket name (no "s3://" scheme) and the
    # object key. The previous call passed the URI as bucket and a hard-coded,
    # non-matching key, ignoring the loop variable. Each object is saved under
    # its own base name so several locations cannot overwrite one another.
    s3_client.download_file(bucket_name, file, os.path.basename(file))

In [None]:
# (rows, columns) of the predictions frame.
pred_y.shape

In [None]:
# Column dtypes and non-null counts of the predictions frame.
pred_y.info()

In [None]:
# First ten training targets.
y_train[:10]

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Flatten the predictions frame into a 1-D float array (float() on a DataFrame
# raises a TypeError).
# NOTE(review): assumes pred_y holds a single numeric column — confirm.
y_pred = np.asarray(pred_y, dtype=float).ravel()

# The batch input was x_test, so the ground truth is y_test (not y_train, and
# not `test_data`, which this notebook never defines). pred_y was read with
# pandas' default header inference, which consumes the first output line as the
# column label, so the first truth row is skipped to keep the rows aligned.
y_true = np.asarray(y_test, dtype=float).ravel()[1:1 + len(y_pred)]

model_score = explained_variance_score(y_true, y_pred)
print("Score = ", model_score)

k = x_test.shape[1]  # number of predictors
n = len(y_true)      # number of scored rows
MSE = mean_squared_error(y_true, y_pred)
MAE = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
# Adjusted R2 is undefined unless there are more rows than predictors + 1.
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1) if n-k-1 > 0 else float('nan')
print('RMSE = ',(MSE**(1/2.0)), '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
#runs for small volume 100K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam



# Build the neural network
model = Sequential()
model.add(Dense(64, input_dim=x.shape[1], activation='relu')) # Hidden 1
model.add(Dense(32, activation='relu')) # Hidden 2
#model.add(Dense(10, activation='sigmoid')) # Hidden 2
model.add(Dense(1)) # Output

learning_rate=0.1
optimizer = Adam(learning_rate)

model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)


In [None]:
# Train on the arrays produced by the train/test split cell. They are named
# x_train/x_test (lower case); X_train/X_test do not exist in this notebook.
model.fit(x_train,y_train,validation_data=(x_test,y_test),
          callbacks=[monitor],verbose=2,epochs=100)

In [None]:
# Print the model's summary.
model.summary()

In [None]:
# Evaluate on the held-out split; the array is named x_test (lower case) —
# X_test is not defined anywhere in this notebook.
model.evaluate(x_test, y_test)

In [None]:
# Deploy `xgb` to a real-time endpoint on one ml.m5.large instance.
# NOTE(review): `xgb` is not defined anywhere in the visible notebook — this cell
# looks carried over from an XGBoost notebook and fails on a fresh kernel; confirm.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m5.large')

In [None]:
%%time 

from sagemaker.serializers import CSVSerializer
# NOTE(review): `test_data` is not defined anywhere in the visible notebook — confirm its origin.
# Unseeded sample of 3000 rows; results differ between runs.
test_data_p = test_data.sample(n=3000)
test_data_array = test_data_p.drop(['TAT_HOUR'], axis=1).values # feature matrix without the target column
xgb_predictor.serializer = CSVSerializer() # send the request payload as CSV
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # raw response bytes -> text
# Parse the comma-separated response, skipping its first character.
# NOTE(review): presumably the response starts with a delimiter/bracket — confirm against the endpoint's output.
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Score the endpoint predictions against the sampled ground truth.
y_true = test_data_p['TAT_HOUR']

model_score = explained_variance_score(y_true, predictions_array)
print("Score = ",model_score)

# k is the number of predictors: test_data_p still contains the TAT_HOUR target
# column (it is dropped only to build the request payload), so it must not be counted.
k = test_data_p.shape[1] - 1
n = len(test_data_p)
MSE = mean_squared_error(y_true, predictions_array)
MAE = mean_absolute_error(y_true, predictions_array)
r2 = r2_score(y_true, predictions_array)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
print('RMSE =',(MSE**(1/2.0)), '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
import matplotlib.pyplot as plt

# Scatter of actual TAT_HOUR against predictions, overlaid with the predictions
# drawn as a line over the row position.
actual_tat = test_data_p['TAT_HOUR']
x_ax = range(len(actual_tat))

plt.scatter(actual_tat, predictions_array, s=5, color="blue", label="original")
plt.plot(x_ax, predictions_array, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()

In [None]:
#delete manually in dashboard
#xgb_predictor.delete_endpoint(delete_endpoint_config=True)

In [None]:
#bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
#bucket_to_delete.objects.all().delete()

In [None]:
#batch transform
# S3 prefixes for the batch-transform input and output.
data_key = prefix + '/test'
batch_input = f's3://{bucket_name}/{data_key}'

data_key2 = prefix + '/batch-predict'
batch_output = f's3://{bucket_name}/{data_key2}'

print(batch_input)
print(batch_output)

# Write the features (target column removed) as a header-less, index-less CSV.
# NOTE(review): `test_data` is not defined in the visible notebook — confirm its origin.
batch_features = test_data.drop(['TAT_HOUR'], axis=1)
batch_features.to_csv(batch_input + '/test.csv', header=False, index=False)

In [None]:
%%time

# Batch-transform everything under batch_input one line at a time and wait for
# completion.
# NOTE(review): `xgb` is not defined in the visible notebook — confirm its origin.
transformer = xgb.transformer(
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    output_path=batch_output,
)

transformer.transform(
    data=batch_input,
    content_type='text/csv',
    data_type='S3Prefix',
    split_type='Line',
)
transformer.wait()

In [None]:
# Read the batch-transform result file for test.csv into a frame.
# read_csv's default header inference treats the first line as the column label.
data_key2 = prefix + '/batch-predict/test.csv.out'
batch_output = f's3://{bucket_name}/{data_key2}'
pred_y = pd.read_csv(batch_output)

In [None]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Ground truth for the batch predictions. The first row is skipped, presumably
# because pred_y was read with header inference, which consumes the first
# prediction as the column label — confirm.
y_true = test_data['TAT_HOUR'][1:]

model_score = explained_variance_score(y_true, pred_y)
print("Score = ",model_score)

# k is the number of predictors: test_data still contains the TAT_HOUR target
# column, so it is excluded. n counts only the rows that are actually scored.
k = test_data.shape[1] - 1
n = len(y_true)
MSE = mean_squared_error(y_true, pred_y)
MAE = mean_absolute_error(y_true, pred_y)
r2 = r2_score(y_true, pred_y)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
print('RMSE = ',(MSE**(1/2.0)), '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
# Build a SageMaker Model object from a previously trained artefact stored in S3.
# NOTE(review): `xgboost_container` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel — confirm where the image URI
# should come from.
modelx = sagemaker.model.Model(
    image_uri=xgboost_container,
    model_data ='s3://dgx-ds-use1-dev-landing-s3/kamal/OrderTAT-xgboost/output/sagemaker-xgboost-2021-10-26-03-08-13-506/output/model.tar.gz',
    role=role)

print(modelx)