In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read in feature sets and corresponding outputs.
# `y` holds the runtimes that a later cell buckets into class labels
# (thresholds used there: 30, 60 and 900).
# NOTE(review): rows of the two files are assumed to be in the same order,
# since they are later joined positionally by index — confirm.
X = pd.read_csv("all_features_QF_SLIA.csv")
y = pd.read_csv("times_QF_SLIA_ALL.csv")

In [6]:
from scipy.stats import zscore

# Normalize features to zero mean and unit variance, one column at a time.
# `X` is overwritten with its normalized version.
# NOTE(review): the statistics are computed over the full dataset, i.e. before
# the train/validation/test split performed later in this notebook.
X = X.apply(zscore)

In [7]:
# Bucket each runtime into a class label:
#   0 for 30-60s, 1 for 1-12m, 2 for timeout (a value of exactly 900).
# Conditions are tested in order and the first match wins; anything that
# matches none of them is marked -1.
runtime_values = y.values
bucket_conditions = [
    runtime_values == 900,
    runtime_values > 60,
    runtime_values >= 30,
]
bucket_labels = [2, 1, 0]
y = pd.DataFrame(np.select(bucket_conditions, bucket_labels, default=-1))

# Verifies that there were no values in the data that were outside the expected range
assert -1 not in y.values

In [8]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed:
#   1) hold out 20% of the samples for testing,
#   2) take 25% of the remaining 80% (= 20% overall) for validation,
# which leaves 60% for training.
X_tv, X_test, y_tv, y_test = train_test_split(
    X, y,
    random_state=1,
    test_size=0.2,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tv, y_tv,
    random_state=1,
    test_size=0.25,
)

In [9]:
# Build one frame per split: label column first, feature columns after it
train = pd.concat((y_train, X_train), axis="columns")
val = pd.concat((y_val, X_val), axis="columns")
test = pd.concat((y_test, X_test), axis="columns")

In [10]:
# Write each split to a local CSV without index or header row
for csv_name, split_frame in (
    ('train.csv', train),
    ('validation.csv', val),
    ('test.csv', test),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [11]:
import sagemaker, boto3, os
bucket = sagemaker.Session().default_bucket()
prefix = "smt-qfslia-cvc4-runtime"

# Upload datasets to S3 under <prefix>/data/.
# S3 object keys always use "/" as the separator, so the key is built with
# "/" explicitly rather than os.path.join (which follows the local OS
# separator). The bucket handle is created once and reused for every upload.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for dataset_file in ('train.csv', 'validation.csv', 'test.csv'):
    object_key = '{}/data/{}'.format(prefix, dataset_file)
    s3_bucket.Object(object_key).upload_file(dataset_file)

In [12]:
# AWS region of the current SageMaker session; used below to look up the XGBoost image
region = sagemaker.Session().boto_region_name
# Execution role that is later passed to the Estimator
role = sagemaker.get_execution_role()

In [13]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

# Configure model

# Training output goes under s3://<bucket>/<prefix>/xgboost_model
s3_output_location = "s3://" + "/".join([bucket, prefix, "xgboost_model"])

# Look up the XGBoost 1.2-1 container image for this region and show its URI
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# Estimator settings: a single ml.m4.xlarge instance, plus the rule that
# produces the XGBoost training report
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)
xgb_model = sagemaker.estimator.Estimator(**estimator_settings)

# Three-class objective (num_class matches the labels 0, 1, 2 built earlier)
xgb_hyperparameters = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'max_depth': 6,
    'eta': 0.3,
    'num_round': 100,
    'alpha': 0.8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [14]:
from sagemaker.session import TrainingInput

def _csv_training_input(file_name):
    """Return a CSV TrainingInput for <bucket>/<prefix>/data/<file_name>."""
    s3_uri = "s3://" + "/".join([bucket, prefix, "data", file_name])
    return TrainingInput(s3_uri, content_type="csv")


train_input = _csv_training_input("train.csv")
validation_input = _csv_training_input("validation.csv")

In [15]:
# Launch the training job and block until it has finished
training_channels = dict(train=train_input, validation=validation_input)
xgb_model.fit(training_channels, wait=True)

2021-08-24 17:47:04 Starting - Starting the training job...
2021-08-24 17:47:29 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1629827224: InProgress
......
2021-08-24 17:48:29 Starting - Preparing the instances for training......
2021-08-24 17:49:30 Downloading - Downloading input data...
2021-08-24 17:50:03 Training - Downloading the training image.....[34m[2021-08-24 17:50:39.176 ip-10-0-170-68.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0

In [16]:
# List the auto-generated analytics reports of the latest training job in S3
rule_output_path = "/".join(
    [xgb_model.output_path, xgb_model.latest_training_job.name, "rule-output"]
)
! aws s3 ls {rule_output_path} --recursive

2021-08-24 17:54:46     324237 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/CreateXgboostReport/xgboost_report.html
2021-08-24 17:54:46     120020 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/CreateXgboostReport/xgboost_report.ipynb
2021-08-24 17:51:48     322345 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/ProfilerReport-1629827224/profiler-output/profiler-report.html
2021-08-24 17:51:47     168688 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/ProfilerReport-1629827224/profiler-output/profiler-report.ipynb
2021-08-24 17:51:43        190 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/ProfilerReport-1629827224/profiler-output/profiler-reports/BatchSize.json
2021-08-24 17:51:43        198 smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47

In [17]:
# Download the auto-generated analytics (XGBoost report and profiler report) into the current directory
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/CreateXgboostReport/xgboost_report.ipynb to CreateXgboostReport/xgboost_report.ipynb
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/CreateXgboostReport/xgboost_report.html to CreateXgboostReport/xgboost_report.html
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/ProfilerReport-1629827224/profiler-output/profiler-report.ipynb to ProfilerReport-1629827224/profiler-output/profiler-report.ipynb
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-runtime/xgboost_model/sagemaker-xgboost-2021-08-24-17-47-04-737/rule-output/ProfilerReport-1629827224/profiler-output/profiler-reports/MaxInitializationTime.json to ProfilerReport-1629827224/profiler-output/profiler-r

In [17]:
# When done training/tuning the model, deploy an endpoint to SageMaker.
# The endpoint runs on a single ml.t2.medium instance; the returned predictor
# is what `predict_multi_class` calls later in this notebook.
# NOTE(review): presumably CSVSerializer makes the predictor send its input
# as CSV — confirm against the SDK docs.
# NOTE(review): no endpoint teardown (e.g. delete_endpoint) is visible in this
# notebook — confirm it is cleaned up elsewhere.
import sagemaker
from sagemaker.serializers import CSVSerializer
xgb_predictor=xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer()
)

---------------------!

In [1]:
import numpy as np

def predict_multi_class(data, num_class, rows=1000, predictor=None):
    """Call the endpoint in batches and return the parsed predictions.

    Args:
        data: array of feature rows (no label column); it is split along its
            first axis and each piece is sent in its own request.
        num_class: number of classes the model predicts; must be >= 2.
        rows: batch-size divisor; the data is split into
            ``int(num_examples / rows + 1)`` pieces.
        predictor: object whose ``predict(array)`` returns UTF-8 encoded
            bytes. Defaults to the notebook-level ``xgb_predictor``.

    Returns:
        For ``num_class == 2``: a 1D float array with one value per example
        (the probability of a positive outcome).
        Otherwise: a 2D float array of shape ``(num_examples, num_class)``.

    Raises:
        AssertionError: if ``num_class`` is smaller than 2.
        ValueError: if the number of values returned by the endpoint is not
            ``num_examples * num_class``.
    """
    assert num_class >= 2, "num_class must be at least 2"

    # Resolve the notebook-level predictor lazily so callers (and tests) can
    # inject their own without relying on hidden kernel state.
    if predictor is None:
        predictor = xgb_predictor

    num_examples = data.shape[0]

    # Nothing to predict: return an empty result without calling the endpoint
    if num_examples == 0:
        return np.empty(0) if num_class == 2 else np.empty((0, num_class))

    batches = np.array_split(data, int(num_examples / float(rows) + 1))
    responses = [predictor.predict(batch).decode('utf-8') for batch in batches]
    predictions = ','.join(responses)

    # For binary classifiers, predict() returns a single float per example:
    # the probability of a positive outcome
    if num_class == 2:
        return np.fromstring(predictions, sep=',')

    # Convert string version of 2D array to Python list of strings
    pred_list = predictions.replace('[', '').replace(']', '').strip().split(',')

    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # session down rather than reporting the problem to the caller.
    expected_count = num_examples * num_class
    if len(pred_list) != expected_count:
        raise ValueError(
            "Something went wrong. Verify that the value of num_class is correct. "
            "(expected {} values, got {})".format(expected_count, len(pred_list))
        )

    # Convert Python list to Numpy array of floats, and reshape to 2D
    return np.array(pred_list, dtype=float).reshape([num_examples, num_class])

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

# Output the accuracy of the model on the test set.
# Column 0 of `test` is the label; the remaining columns are the features.
# `log_predictions` holds one row of per-class values per example (the model
# objective is multi:softprob); the predicted class is the index of the
# largest value in each row.
log_predictions = predict_multi_class(test.to_numpy()[:,1:], 3)
predictions = np.argmax(log_predictions, axis=1)
sklearn.metrics.accuracy_score(test.iloc[:,0], predictions)

In [20]:
# Confusion matrix for the test set: true labels (column 0 of `test`)
# against the predicted classes
cm = sklearn.metrics.confusion_matrix(y_true=test.iloc[:, 0], y_pred=predictions)
cm

array([[16,  8,  1],
       [ 9, 14,  1],
       [ 1,  5, 12]])

In [27]:
# Computing feature means and stdevs for Inference Script.
# The raw features are read again because `X` was overwritten with its
# normalized version earlier in the notebook.
X_orig = pd.read_csv("all_features_QF_SLIA.csv")

In [45]:
import sys

np.set_printoptions(suppress=True) # Suppresses scientific notation

# str(ndarray) abbreviates arrays with more than 1000 elements using "...",
# which would silently drop values from the file. Lift the threshold only
# while formatting so the global print options are otherwise unchanged.
with np.printoptions(threshold=sys.maxsize):
    feature_means_text = str(X_orig.apply(np.mean).to_numpy())

# One mean per feature column, in column order
with open("feature_means.dat", 'w') as fp:
    fp.write(feature_means_text)

In [46]:
import sys

np.set_printoptions(suppress=True) # Suppresses scientific notation

# ddof=0 (population standard deviation) is requested explicitly so the
# written values match the z-score normalization applied to the training
# features, instead of depending on how DataFrame.apply dispatches np.std.
feature_stdevs = X_orig.std(ddof=0).to_numpy()

# str(ndarray) abbreviates arrays with more than 1000 elements using "...",
# which would silently drop values from the file. Lift the threshold only
# while formatting so the global print options are otherwise unchanged.
with np.printoptions(threshold=sys.maxsize):
    feature_stdevs_text = str(feature_stdevs)

# One standard deviation per feature column, in column order
with open("feature_stdevs.dat", 'w') as fp:
    fp.write(feature_stdevs_text)