In [1]:
from IPython.display import HTML
HTML('''<h2>IMT 575: Sagemaker trial model</h2>
<b><pre>
    Authors: 
    Aftab Alam
    </pre>
</b> 
<p>Date/Time: <span id="datetime"></span></p><script>var dt = new Date();
document.getElementById("datetime").innerHTML=dt.toLocaleString();</script> </p>''')

In [2]:
# enable flag to how all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sys
! conda install -y -c conda-forge ipywidgets

!{sys.executable} -m pip install sagemaker-experiments
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install torchvision

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [4]:

import time

import boto3
import numpy as np
import pandas as pd
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

Let's start by specifying:

The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting.
The IAM role arn used to give training and hosting access to your data. See the documentation for how to create these. Note, if more than one role is required for notebook instances, training, and/or hosting, please replace the boto regexp with a the appropriate full IAM role arn string(s).

In [5]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/DEMO-xgboost-abalone-default'
# customize to your bucket where you have stored the data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)


Fetching the dataset
Following methods split the data into train/test/validation datasets and upload files to S3.

In [6]:
%%time

import io
import boto3
import random

def data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST, PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST):
    data = [l for l in open(FILE_DATA, 'r')]
    train_file = open(FILE_TRAIN, 'w')
    valid_file = open(FILE_VALIDATION, 'w')
    tests_file = open(FILE_TEST, 'w')

    num_of_data = len(data)
    num_train = int((PERCENT_TRAIN/100.0)*num_of_data)
    num_valid = int((PERCENT_VALIDATION/100.0)*num_of_data)
    num_tests = int((PERCENT_TEST/100.0)*num_of_data)

    data_fractions = [num_train, num_valid, num_tests]
    split_data = [[],[],[]]

    rand_data_ind = 0

    for split_ind, fraction in enumerate(data_fractions):
        for i in range(fraction):
            rand_data_ind = random.randint(0, len(data)-1)
            split_data[split_ind].append(data[rand_data_ind])
            data.pop(rand_data_ind)

    for l in split_data[0]:
        train_file.write(l)

    for l in split_data[1]:
        valid_file.write(l)

    for l in split_data[2]:
        tests_file.write(l)

    train_file.close()
    valid_file.close()
    tests_file.close()

def write_to_s3(fobj, bucket, key):
    return boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(key).upload_fileobj(fobj)

def upload_to_s3(bucket, channel, filename):
    fobj=open(filename, 'rb')
    key = prefix+'/'+channel
    url = 's3://{}/{}/{}'.format(bucket, key, filename)
    print('Writing to {}'.format(url))
    write_to_s3(fobj, bucket, key)

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 11.9 µs



## Data ingestion
Next, we read the dataset from the existing repository into memory, for preprocessing prior to training. This processing could be done in situ by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. Then, the next step would be to transfer the data to S3 for use in training. For small datasets, such as this one, reading into memory isn't onerous, though it would be for larger datasets.

In [7]:
FILE_DATA = 'domainsDataSet'
s3 = boto3.client('s3')
s3.download_file(bucket, 'domainsDataSet.csv', FILE_DATA)




In [8]:
# Load the dataset
df_domains = pd.read_csv(FILE_DATA)

## append data set
dummy_domain = {'abcdefghijklmnopqrstuvwxyz1234567890-':'dga'}

dummy_record = pd.DataFrame(dummy_domain.items(), columns=['domain', 'domain_type'])
df_domains = pd.concat((df_domains,dummy_record), axis=0)
df_domains.reset_index(drop=True,inplace=True)
df_domains.tail()

Unnamed: 0,domain,domain_type
1275048,amiami.com,benign
1275049,freedirectorywebsites.com,benign
1275050,ghaninia.ir,benign
1275051,gndoqarrd.dj,dga
1275052,abcdefghijklmnopqrstuvwxyz1234567890-,dga


In [9]:
! pip install tldextract



In [10]:
import tldextract

def extract_domain_subdomain(record):
    domain = record.domain
    ret=''
    try:
        ext = tldextract.extract(domain)
        ret = ext.domain
    except :
        print(record)
    return ret
def get_y(row):
    if row.domain_type.lower()=='dga':
        return 1
    elif row.domain_type.lower()=='benign':
        return 0
    else :
        return 1
    
df_domains.loc[:,'domain_subdomain'] = df_domains.apply(lambda row : extract_domain_subdomain(row), axis=1 )
## 1 for dga and 0 for benign
df_domains.loc[:,'Y'] = df_domains.apply(lambda row : get_y(row), axis=1 )

domain         NaN
domain_type    dga
Name: 308868, dtype: object
domain            NaN
domain_type    benign
Name: 940873, dtype: object


In [11]:
df_domains.head()

Unnamed: 0,domain,domain_type,domain_subdomain,Y
0,sugaprof.ru,benign,sugaprof,0
1,zastudents.com,benign,zastudents,0
2,www.youvezmcohdi.ml,dga,youvezmcohdi,1
3,izmirgundemi.net,benign,izmirgundemi,0
4,benchlife.info,benign,benchlife,0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = None
X1=None
# adding paramter in TfidfVectorizer to using tf_idf tranformation and limiting max feature to 10000
vectorizer = TfidfVectorizer(strip_accents='ascii',use_idf=True,
                             sublinear_tf=True,smooth_idf=True,
                             binary=False,analyzer='char_wb',
                            max_features = 100,)
X1 = vectorizer.fit_transform(df_domains.domain_subdomain).toarray()
X1.shape

(1275053, 40)

In [24]:
# Join X1 and Y and save is as FILE DATA 

from sklearn.datasets import dump_svmlight_file
dump_svmlight_file(X1,df_domains['Y'],'domainsDataSet')

In [68]:
! head -2 domainsDataSet

0 0:0.2687097608786886 14:0.2813527225280327 19:0.4108709947786495 20:0.3783251901532605 28:0.2933002167525229 29:0.3636606441633448 31:0.3045221187699864 32:0.3047333906188266 34:0.3644455304172354
0 0:0.233468838662212 14:0.2444536929669867 17:0.3025686127643 18:0.2479858552418345 27:0.272886119416648 32:0.4482912232060587 33:0.4509989577176581 34:0.3166489913277059 39:0.3919074444218358


In [None]:
%%time
# SciKit Learn implements several Machine Learning algorithms
import sklearn
print( "Scikit-Learn version: %6.6s (need at least 0.13.1)" %
       sklearn.__version__)
import os

import scipy.stats as stats
import statsmodels.formula.api as smf

In [25]:



#split the downloaded data into train/test/validation files
FILE_DATA = 'domainsDataSet'
FILE_TRAIN = 'domainsDataSet.train'
FILE_VALIDATION = 'domainsDataSet.validation'
FILE_TEST = 'domainsDataSet.test'
PERCENT_TRAIN = 70
PERCENT_VALIDATION = 15
PERCENT_TEST = 15
data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST, PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST)

#upload the files to the S3 bucket
upload_to_s3(bucket, 'train', FILE_TRAIN)
upload_to_s3(bucket, 'validation', FILE_VALIDATION)
upload_to_s3(bucket, 'test', FILE_TEST)

Writing to s3://sagemaker-us-west-2-099176660580/sagemaker/DEMO-xgboost-abalone-default/train/domainsDataSet.train
Writing to s3://sagemaker-us-west-2-099176660580/sagemaker/DEMO-xgboost-abalone-default/validation/domainsDataSet.validation
Writing to s3://sagemaker-us-west-2-099176660580/sagemaker/DEMO-xgboost-abalone-default/test/domainsDataSet.test


## Training the XGBoost model¶
After setting training parameters, we kick off training, and poll for status until training is completed

In [26]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', '0.90-1')



In [27]:
%%time
import boto3
from time import gmtime, strftime

job_name = 'DEMO-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

#Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/" + prefix + "/single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m5.large",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:squarederror",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/train',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/" + prefix + '/validation',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

import time

status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status !='Completed' and status!='Failed':
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

Training job DEMO-xgboost-regression-2020-05-06-22-05-38
InProgress
InProgress
InProgress
InProgress
InProgress
Completed
CPU times: user 102 ms, sys: 11 µs, total: 102 ms
Wall time: 5min


## Import model into hosting
Register the model with hosting. This allows the flexibility of importing models trained elsewhere.

In [28]:
%%time
import boto3
from time import gmtime, strftime

model_name=job_name + '-model'
print(model_name)

info = client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

DEMO-xgboost-regression-2020-05-06-22-05-38-model
https://s3-us-west-2.amazonaws.com/sagemaker-us-west-2-099176660580/sagemaker/DEMO-xgboost-abalone-default/single-xgboost/DEMO-xgboost-regression-2020-05-06-22-05-38/output/model.tar.gz
arn:aws:sagemaker:us-west-2:099176660580:model/demo-xgboost-regression-2020-05-06-22-05-38-model
CPU times: user 22 ms, sys: 89 µs, total: 22.1 ms
Wall time: 483 ms



## Create endpoint configuration
SageMaker supports configuring REST endpoints in hosting with multiple models, e.g. for A/B testing purposes. In order to support this, customers create an endpoint configuration, that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way. In addition, the endpoint configuration describes the instance type required for model deployment.

In [32]:
from time import gmtime, strftime

endpoint_config_name = 'DEMO-XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':"ml.m5.large",
        'InitialVariantWeight':1,
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

DEMO-XGBoostEndpointConfig-2020-05-06-22-19-20
Endpoint Config Arn: arn:aws:sagemaker:us-west-2:099176660580:endpoint-config/demo-xgboostendpointconfig-2020-05-06-22-19-20


## Create endpoint
Lastly, the customer creates the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete.

In [33]:
%%time
import time

endpoint_name = 'DEMO-XGBoostEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

resp = client.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
while status=='Creating':
    print("Status: " + status)
    time.sleep(60)
    resp = client.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

DEMO-XGBoostEndpoint-2020-05-06-22-19-23
arn:aws:sagemaker:us-west-2:099176660580:endpoint/demo-xgboostendpoint-2020-05-06-22-19-23
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Arn: arn:aws:sagemaker:us-west-2:099176660580:endpoint/demo-xgboostendpoint-2020-05-06-22-19-23
Status: InService
CPU times: user 119 ms, sys: 8.13 ms, total: 127 ms
Wall time: 8min 1s


## Validate the model for use
Finally, the customer can now validate the model for use. They can obtain the endpoint from the client library using the result from previous operations, and generate classifications from the trained model using that endpoint.

In [37]:
runtime_client = boto3.client('runtime.sagemaker', region_name=region)

!head -1 domainsDataSet.test > single.test

In [38]:
!cat single.test

1 0:0.1530602343256397 3:0.384467500067292 7:0.3795057983107694 10:0.3915020372984007 12:0.6622415491334985 18:0.1625774699966537 25:0.1896536648352031 33:0.1746281597143548


In [41]:
%%time
import json
from itertools import islice
import math
import struct

file_name = 'single.test' #customize to your test file
with open(file_name, 'r') as f:
    payload = f.read().strip()
response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/x-libsvm', 
                                   Body=payload)
result = response['Body'].read()
result = result.decode("utf-8")
result = result.split(',')
result = [math.ceil(float(i)) for i in result]
label = payload.strip(' ').split()[0]
print ('Label: ',label,'\nPrediction: ', result[0])

Label:  1 
Prediction:  1
CPU times: user 13.5 ms, sys: 0 ns, total: 13.5 ms
Wall time: 159 ms


In [62]:
!head -100 domainsDataSet.test > multi.test

In [58]:
import sys
import math
def do_predict(data, endpoint_name, content_type):
    payload = '\n'.join(data)
    response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType=content_type, 
                                   Body=payload)
    result = response['Body'].read()
    result = result.decode("utf-8")
    result = result.split(',')
    preds = [float((num)) for num in result]
    preds = [math.ceil(num) for num in preds]
    return preds

def batch_predict(data, batch_size, endpoint_name, content_type):
    items = len(data)
    arrs = []
    sys.stdout.write(str(items))
    for offset in range(0, items, batch_size):
        if offset+batch_size < items:
            results = do_predict(data[offset:(offset+batch_size)], endpoint_name, content_type)
            arrs.extend(results)

        else:
            print(len(data[offset:items]))
            sys.stdout.write("last\n")
            sys.stdout.write(str(len(arrs)))
            arrs.extend(do_predict(data[offset:items], endpoint_name, content_type))
            sys.stdout.write("last\n")
            sys.stdout.write(str(len(arrs)))

    return(arrs)

In [63]:
%%time
import json
import numpy as np

with open('multi.test', 'r') as f:
    payload = f.read().strip()

labels = [int(line.split(' ')[0]) for line in payload.split('\n')]
test_data = [line for line in payload.split('\n')]
preds = batch_predict(test_data, 100, endpoint_name, 'text/x-libsvm')

print('\n Median Absolute Percent Error (MdAPE) = ', np.median(np.abs(np.array(labels) - np.array(preds)) / np.array(labels)))

100100
last
0last
100
 Median Absolute Percent Error (MdAPE) =  nan
CPU times: user 8.17 ms, sys: 5 µs, total: 8.18 ms
Wall time: 31.3 ms


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [69]:
len(test_data)
from sklearn.metrics import accuracy_score ,confusion_matrix
accuracy_score(labels,preds)
confusion_matrix(labels,preds)


100

0.49

array([[12, 36,  0],
       [ 0, 37, 15],
       [ 0,  0,  0]])

In [70]:
client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '73f373e1-0677-4262-af54-0d790dcbe666',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '73f373e1-0677-4262-af54-0d790dcbe666',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Wed, 06 May 2020 23:13:32 GMT'},
  'RetryAttempts': 0}}