In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load one feature table per "best solver" class.
# Only best_sat.csv is given an explicit dtype: some of its a_max values
# were too large for a 64-bit number, and without the 128-bit float
# override Pandas fell back to parsing that column as strings.
X1 = pd.read_csv("best_cvc4.csv")
X2 = pd.read_csv("best_z3.csv")
X3 = pd.read_csv("best_sat.csv", dtype={'a_max': np.float128})
X4 = pd.read_csv("best_none.csv")

# One single-column label frame per table, with as many rows as the table.
# The float class ids follow the file order: cvc4 -> 0, z3 -> 1, sat -> 2, none -> 3.
y1 = pd.DataFrame(np.full((len(X1), 1), 0.0), columns=['best_solver'])
y2 = pd.DataFrame(np.full((len(X2), 1), 1.0), columns=['best_solver'])
y3 = pd.DataFrame(np.full((len(X3), 1), 2.0), columns=['best_solver'])
y4 = pd.DataFrame(np.full((len(X4), 1), 3.0), columns=['best_solver'])

In [9]:
# Stack the four per-class tables and their label frames in the same order,
# so row i of X still corresponds to row i of y.
# ignore_index=True replaces the repeated 0..len-1 index labels of the
# individual frames with a single unique RangeIndex; later label-based
# alignment of features and labels (pd.concat(..., axis=1)) then cannot be
# confused by duplicate index values.
X = pd.concat([X1, X2, X3, X4], ignore_index=True)
y = pd.concat([y1, y2, y3, y4], ignore_index=True)

In [24]:
from sklearn.model_selection import train_test_split

# 60% training / 20% validation / 20% testing.
# First hold out 20% of all rows for testing, then take 25% of the
# remaining 80% as validation. Both splits are stratified on the label
# and share the same fixed random seed.
SPLIT_SEED = 1

outer_split = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y
)
X_tv, X_test, y_tv, y_test = outer_split

inner_split = train_test_split(
    X_tv, y_tv, test_size=0.25, random_state=SPLIT_SEED, stratify=y_tv
)
X_train, X_val, y_train, y_val = inner_split

In [25]:
# Combine output and features, with the label column placed first
def _label_first(labels, features):
    """Return one frame holding the label column(s) followed by the feature columns."""
    return pd.concat([labels, features], axis=1)


train = _label_first(y_train, X_train)
val = _label_first(y_val, X_val)
test = _label_first(y_test, X_test)

In [26]:
# Write every split to the working directory as a CSV without index or header row
for split_frame, csv_name in (
    (train, 'train.csv'),
    (val, 'validation.csv'),
    (test, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [27]:
import sagemaker, boto3, os
bucket = sagemaker.Session().default_bucket()
prefix = "smt-eager-vs-lazy-expanded"

# Upload datasets to S3.
# S3 keys always use "/" as the separator, so build them explicitly instead
# of with os.path.join (which uses the local OS separator). This keeps the
# keys identical to the "s3://<bucket>/<prefix>/data/<file>" URIs that the
# training inputs are later built from.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for dataset_file in ('train.csv', 'validation.csv', 'test.csv'):
    s3_bucket.Object('{}/data/{}'.format(prefix, dataset_file)).upload_file(dataset_file)

In [28]:
# AWS region of the current SageMaker session, and the execution role
# returned by the SageMaker SDK for this environment
sm_session = sagemaker.Session()
region = sm_session.boto_region_name
role = sagemaker.get_execution_role()

In [39]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

# Configure model

# S3 location handed to the estimator as its output_path
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')

# Look up the XGBoost container image for this region (version "1.2-1")
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# Estimator settings: a single ml.m4.xlarge instance with volume_size=5,
# plus the built-in rule that generates an XGBoost report
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)
xgb_model = sagemaker.estimator.Estimator(**estimator_settings)

# 4-class problem with the 'multi:softprob' objective
hyperparameters = {
    'objective': 'multi:softprob',
    'num_class': 4,
    'num_round': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
xgb_model.set_hyperparameters(**hyperparameters)

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [40]:
from sagemaker.session import TrainingInput

def _csv_channel(key):
    """Build a CSV TrainingInput for the object `key` under the bucket/prefix."""
    return TrainingInput(
        "s3://{}/{}/{}".format(bucket, prefix, key), content_type="csv"
    )


# Input channels pointing at the uploaded training and validation CSVs
train_input = _csv_channel("data/train.csv")
validation_input = _csv_channel("data/validation.csv")

In [41]:
# Run the training job to fit the model; wait=True blocks until the call returns
training_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(training_channels, wait=True)

2021-09-02 16:58:32 Starting - Starting the training job...
2021-09-02 16:58:55 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1630601912: InProgress
......
2021-09-02 16:59:55 Starting - Preparing the instances for training.........
2021-09-02 17:01:32 Downloading - Downloading input data
2021-09-02 17:01:32 Training - Downloading the training image......
2021-09-02 17:02:29 Uploading - Uploading generated training model[34m[2021-09-02 17:02:25.778 ip-10-0-125-205.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorith

In [42]:
# Get the auto-generated analytics
# The rule output path is built as <estimator output path>/<latest training job name>/rule-output
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
# IPython shell escape: {rule_output_path} is interpolated from the Python variable above,
# and every object under that S3 path is listed recursively
! aws s3 ls {rule_output_path} --recursive

2021-09-02 17:06:52       1608 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/FeatureImportance.json
2021-09-02 17:06:52        315 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/LabelsDistribution.json
2021-09-02 17:06:52       3651 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/LossData.json
2021-09-02 17:06:51     322547 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost_report.html
2021-09-02 17:06:50     118103 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost_report.ipynb
2021-09-02 17:02:54     329708 smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-o

In [43]:
# Download the auto-generated analytics
# IPython shell escape: recursively copies everything under {rule_output_path}
# (interpolated from the Python variable of that name) into the current directory
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://sagemaker-us-east-2-736959812641/smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/LabelsDistribution.json to CreateXgboostReport/xgboost-reports/LabelsDistribution.json
download: s3://sagemaker-us-east-2-736959812641/smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/LossData.json to CreateXgboostReport/xgboost-reports/LossData.json
download: s3://sagemaker-us-east-2-736959812641/smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/CreateXgboostReport/xgboost-reports/FeatureImportance.json to CreateXgboostReport/xgboost-reports/FeatureImportance.json
download: s3://sagemaker-us-east-2-736959812641/smt-eager-vs-lazy-expanded/xgboost_model/sagemaker-xgboost-2021-09-02-16-58-32-261/rule-output/ProfilerReport-1630601912/profiler-output/profiler-reports/GPUMemoryIncrease.json

In [44]:
# When done training/tuning the model, deploy an endpoint to SageMaker
import sagemaker
from sagemaker.serializers import CSVSerializer
# Deploy the trained model behind a single ml.t2.medium endpoint instance;
# the returned predictor serializes request payloads as CSV
endpoint_serializer = CSVSerializer()
xgb_predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=endpoint_serializer,
)

-----------------!

In [45]:
import numpy as np

def predict_multi_class(data, num_class, rows=1000, predictor=None):
    """Query the prediction endpoint in batches and parse the scores.

    ``data`` is split into batches of at most ``rows`` examples. Each batch
    is sent to ``predictor.predict`` and the UTF-8 text responses are
    parsed into a NumPy array of floats.

    Parameters
    ----------
    data : numpy.ndarray
        Feature matrix with one example per row (``data.shape[0]`` examples).
    num_class : int
        Number of classes the model predicts; must be at least 2.
    rows : int, optional
        Maximum number of examples sent per request (default 1000).
    predictor : object, optional
        Any object whose ``predict(batch)`` returns UTF-8 encoded bytes.
        When omitted, the notebook-level ``xgb_predictor`` is used.

    Returns
    -------
    numpy.ndarray
        For ``num_class == 2``: a 1-D float array of every value returned
        by the endpoint (no length check is applied).
        Otherwise: a float array of shape ``(num_examples, num_class)``.

    Raises
    ------
    AssertionError
        If ``num_class`` is smaller than 2.
    ValueError
        If ``rows`` is smaller than 1, or if the number of values returned
        for a multi-class model is not ``num_examples * num_class``.
    """
    assert num_class >= 2, "num_class must be at least 2"
    if rows < 1:
        raise ValueError("rows must be at least 1, got {}".format(rows))

    if predictor is None:
        # Fall back to the endpoint predictor defined at notebook level
        predictor = xgb_predictor

    num_examples = data.shape[0]

    # Nothing to predict: return an empty result without calling the endpoint
    if num_examples == 0:
        empty_shape = (0,) if num_class == 2 else (0, num_class)
        return np.empty(empty_shape, dtype=float)

    # Ceiling division, so an exact multiple of `rows` does not create an
    # extra (smaller-than-needed) batch
    num_batches = -(-num_examples // rows)
    responses = [
        predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, num_batches)
    ]

    # Multi-class responses look like a printed 2-D list ("[a, b],[c, d]").
    # Drop the brackets and treat commas and any whitespace (including
    # newlines) as separators, which also covers the flat binary format.
    flat_text = ','.join(responses)
    for separator in ('[', ']', ','):
        flat_text = flat_text.replace(separator, ' ')
    values = np.array(flat_text.split(), dtype=float)

    # Binary classifiers return one float per example, so the parsed
    # values are already the final 1-D result
    if num_class == 2:
        return values

    expected = num_examples * num_class
    if values.size != expected:
        # Raise instead of print + exit(): exit() would terminate the
        # kernel / interpreter rather than report a recoverable error
        raise ValueError(
            "Something went wrong. Verify that the value of num_class is correct. "
            "Expected {} values, got {}.".format(expected, values.size)
        )

    return values.reshape([num_examples, num_class])

In [46]:
# Import the metrics submodule explicitly: a bare `import sklearn` is not
# guaranteed to make `sklearn.metrics` available as an attribute, so the
# calls below would otherwise depend on another import having loaded it.
import sklearn.metrics

# Output the accuracy of the model on the test set.
# Column 0 of `test` is the label; the remaining columns are the features.
# NOTE: despite its name, `log_predictions` holds one score per class for
# each row (the model objective is 'multi:softprob'); argmax over axis 1
# picks the highest-scoring class index.
log_predictions = predict_multi_class(test.to_numpy()[:,1:], 4)
predictions = np.argmax(log_predictions, axis=1)
sklearn.metrics.accuracy_score(test.iloc[:,0], predictions)

0.4482758620689655

In [47]:
# Confusion matrix of the test-set labels (column 0 of `test`) against the predicted classes
actual_labels = test.iloc[:, 0]
cm = sklearn.metrics.confusion_matrix(actual_labels, predictions)
cm

array([[4, 0, 1, 0],
       [2, 6, 0, 2],
       [0, 3, 0, 1],
       [4, 3, 0, 3]])