In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two feature tables (one CSV per outcome) and build a matching
# label frame for each: 1.0 for every row of the "faster" file, 0.0 for
# every row of the "slower" file.
X1 = pd.read_csv("on_features_fmf_faster.csv")
X2 = pd.read_csv("on_features_fmf_slower.csv")
y1 = pd.DataFrame({'fmf_faster': np.ones(len(X1))})
y2 = pd.DataFrame({'fmf_faster': np.zeros(len(X2))})

In [3]:
# The features are not normalized because they are all the same "kind" of quantity:
# the number of lemmas which have x words.

In [4]:
# Stack both outcome groups into one feature table and one label table.
# ignore_index=True renumbers the rows 0..n-1 so X and y stay aligned.
X = pd.concat(objs=(X1, X2), ignore_index=True)
y = pd.concat(objs=(y1, y2), ignore_index=True)

In [5]:
from sklearn.model_selection import train_test_split

# Three-way split: 60% training / 20% validation / 20% testing.
# Step 1 holds out 20% of all rows for testing.
X_tv, X_test, y_tv, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
# Step 2 takes 25% of the remaining 80% (= 20% of the total) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_tv, y_tv, random_state=1, test_size=0.25
)

In [6]:
# Put the label in the first column, followed by the feature columns,
# for each of the three splits.
train = pd.concat((y_train, X_train), axis="columns")
val = pd.concat((y_val, X_val), axis="columns")
test = pd.concat((y_test, X_test), axis="columns")

In [7]:
# Write each split to a local CSV with no header row and no index column,
# so every line is just: label, feature_1, ..., feature_n.
train.to_csv("train.csv", header=False, index=False)
val.to_csv("validation.csv", header=False, index=False)
test.to_csv("test.csv", header=False, index=False)

In [8]:
import sagemaker, boto3, os
bucket = sagemaker.Session().default_bucket()
prefix = "smt-qfslia-cvc4-fmf-faster"

# Upload the three local CSVs to s3://<bucket>/<prefix>/data/<file>.
# S3 object keys always use "/" as the separator, so the key is built with
# string formatting rather than os.path.join (which is platform-dependent
# and would not match the "/"-joined S3 URIs used for the training inputs).
data_bucket = boto3.Session().resource('s3').Bucket(bucket)
for csv_name in ('train.csv', 'validation.csv', 'test.csv'):
    data_bucket.Object('{}/data/{}'.format(prefix, csv_name)).upload_file(csv_name)

In [9]:
# IAM role used by the training job and the AWS region of the
# current SageMaker session (used to look up the container image).
role = sagemaker.get_execution_role()
region = sagemaker.Session().boto_region_name

In [10]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

# Configure model

# Model artifacts and rule outputs are written under this S3 location.
s3_output_location = f's3://{bucket}/{prefix}/xgboost_model'

# Look up the XGBoost container image for this region.
container = sagemaker.image_uris.retrieve("xgboost", region, version="1.2-1")
print(container)

# Estimator: one ml.m4.xlarge instance with a 5 GB volume, plus the
# rule that generates the XGBoost training report.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    volume_size=5,
    output_path=s3_output_location,
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
    sagemaker_session=sagemaker.Session(),
)

# Binary classifier hyperparameters.
xgb_model.set_hyperparameters(**{
    'objective': 'binary:logistic',
    'max_depth': 5,
    'eta': 0.3,
    'num_round': 125,
    'alpha': 0.8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
})

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [11]:
from sagemaker.session import TrainingInput

# Point the training and validation channels at the CSVs uploaded to S3.
train_input = TrainingInput(
    f"s3://{bucket}/{prefix}/data/train.csv",
    content_type="csv",
)
validation_input = TrainingInput(
    f"s3://{bucket}/{prefix}/data/validation.csv",
    content_type="csv",
)

In [12]:
# Launch the training job on the two channels and block until it finishes
# (wait=True), streaming the job log into the cell output.
xgb_model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
)

2021-08-20 19:46:08 Starting - Starting the training job...
2021-08-20 19:46:09 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1629488767: InProgress
...
2021-08-20 19:47:04 Starting - Preparing the instances for training.........
2021-08-20 19:48:34 Downloading - Downloading input data...
2021-08-20 19:49:04 Training - Downloading the training image...
2021-08-20 19:49:36 Training - Training image download completed. Training in progress..[34m[2021-08-20 19:49:38.471 ip-10-0-141-191.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost S

In [13]:
# When done training/tuning the model, deploy an endpoint to SageMaker
import sagemaker
from sagemaker.serializers import CSVSerializer
# Host the trained model on a single ml.t2.medium instance; request
# payloads are serialized as CSV.
xgb_predictor = xgb_model.deploy(
    serializer=CSVSerializer(),
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)

--------!

In [16]:
import numpy as np

# This function calls the endpoint to get predictions
# from the model and processes the returned data
def predict_multi_class(data, num_class, rows=1000):
    """Get predictions for ``data`` from the deployed endpoint.

    The rows of ``data`` are sent to the notebook-level ``xgb_predictor`` in
    chunks of at most ``rows`` examples, and the UTF-8 text responses are
    parsed into floats.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        One example per row; ``data.shape[0]`` is the number of examples.
    num_class : int
        Number of classes the model predicts; must be at least 2.
    rows : int, optional
        Maximum number of examples sent per endpoint call (default 1000).

    Returns
    -------
    numpy.ndarray
        For ``num_class == 2``: a 1-D float array holding every value the
        endpoint returned (the probability of the positive class; here,
        that CVC4 is expected to be faster with --strings-fmf).
        Otherwise: a float array of shape ``(num_examples, num_class)``.
        Empty input yields an empty array of the matching shape without
        calling the endpoint.

    Raises
    ------
    AssertionError
        If ``num_class`` is smaller than 2.
    ValueError
        In the multi-class case, if the number of values returned by the
        endpoint is not ``num_examples * num_class``.
    """
    assert num_class >= 2, "num_class must be at least 2"

    num_examples = data.shape[0]

    # Nothing to predict: do not send an empty payload to the endpoint.
    if num_examples == 0:
        if num_class == 2:
            return np.empty(0, dtype=float)
        return np.empty((0, num_class), dtype=float)

    # Ceiling division, so there are never more chunks than examples
    # (an extra chunk would be empty and sent as an empty request).
    num_chunks = int(np.ceil(num_examples / float(rows)))
    responses = [
        xgb_predictor.predict(chunk).decode('utf-8')
        for chunk in np.array_split(data, num_chunks)
    ]

    # Responses may be a flat list ("0.1,0.9"), a bracketed 2-D list
    # ("[0.1, 0.9],[0.3, 0.7]") or newline-separated values. Treat brackets,
    # commas and whitespace uniformly as separators.
    text = ','.join(responses)
    for separator in '[],':
        text = text.replace(separator, ' ')
    values = np.array(text.split(), dtype=float)

    # For binary classifiers the endpoint returns a single float per example:
    # the probability of the positive outcome.
    if num_class == 2:
        return values

    expected = num_examples * num_class
    if values.size != expected:
        # Raise instead of calling exit(), which would shut down the kernel.
        raise ValueError(
            "Something went wrong. Verify that the value of num_class is correct. "
            "Expected {} values, got {}.".format(expected, values.size)
        )

    # Reshape the flat list of floats to one row per example.
    return values.reshape([num_examples, num_class])

In [17]:
import sklearn
# `import sklearn` alone does not guarantee that the `metrics` submodule is
# loaded, so import it explicitly instead of relying on an earlier cell.
import sklearn.metrics

# Output the accuracy of the model on the test set.
# Column 0 of `test` is the label; the remaining columns are the features.
log_predictions = predict_multi_class(test.to_numpy()[:,1:], 2) # Probability that CVC4 will be faster with --strings-fmf
# Threshold the probabilities at 0.5 to get hard 0/1 predictions.
predictions = np.where(log_predictions > 0.5, 1, 0)
sklearn.metrics.accuracy_score(test.iloc[:,0], predictions)

0.8955223880597015

In [18]:
# Confusion matrix on the test set: rows are the true labels,
# columns are the predicted labels.
sklearn.metrics.confusion_matrix(y_true=test.iloc[:, 0], y_pred=predictions)

array([[36,  3],
       [ 4, 24]])

In [19]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

2021-08-20 19:53:49     373401 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/CreateXgboostReport/xgboost_report.html
2021-08-20 19:53:48     173632 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/CreateXgboostReport/xgboost_report.ipynb
2021-08-20 19:51:48     322349 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-report.html
2021-08-20 19:51:47     168696 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-report.ipynb
2021-08-20 19:51:43        190 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-reports/BatchSize.json
2021-08-20 19:51:43        198 smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboos

In [20]:
# Recursively copy everything under rule_output_path from S3 into the
# current working directory.
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-reports/BatchSize.json to ProfilerReport-1629488767/profiler-output/profiler-reports/BatchSize.json
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-reports/CPUBottleneck.json to ProfilerReport-1629488767/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-fmf-faster/xgboost_model/sagemaker-xgboost-2021-08-20-19-46-07-831/rule-output/ProfilerReport-1629488767/profiler-output/profiler-reports/OverallSystemUsage.json to ProfilerReport-1629488767/profiler-output/profiler-reports/OverallSystemUsage.json
download: s3://sagemaker-us-east-2-736959812641/smt-qfslia-cvc4-fmf-faster/xgboost_model/sag