In [None]:
from IPython.display import HTML
HTML('''<h2>IMT 575: Sagemaker trial model</h2>
<b><pre>
    Authors: 
    Aftab Alam
    </pre>
</b> 
<p>Date/Time: <span id="datetime"></span></p><script>var dt = new Date();
document.getElementById("datetime").innerHTML=dt.toLocaleString();</script> </p>''')

In [None]:
# enable flag to how all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
import sys
! conda install -y -c conda-forge ipywidgets
! pip install tldextract
import time

import boto3
import numpy as np
import pandas as pd
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker import get_execution_role
from sagemaker.session import Session

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [3]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/DEMO-xgboost-trial'
# customize to your bucket where you have stored the data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [27]:
%%time
# split data and save in s3

import io
import boto3
import random
from sklearn.model_selection import train_test_split

def appendXY(X,Y):
    df = pd.DataFrame(X)
    print(df.columns)
    df.insert(loc=0,column='Y',value=Y)
    return df
   

def data_split(features,output_label,train_file, validation_file, test_file,test_size=0.2, random_state=42):

    X_train, X_test, y_train, y_test = train_test_split( features, output_label, test_size=test_size, random_state=random_state)
    test_df = appendXY(X_test,y_test)
    test_df.to_csv(test_file,header=False,index=False)
    
    X_train, X_test, y_train, y_test = train_test_split( X_train, y_train, test_size=test_size, random_state=random_state)
    train_df = appendXY(X_train,y_train)
    train_df.to_csv(train_file,header=False,index=False)
    
    valid_df = appendXY(X_test,y_test)
    valid_df.to_csv(validation_file,header=False,index=False)


def write_to_s3(fobj, bucket, key):
    return boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(key).upload_fileobj(fobj)

def upload_to_s3(bucket, channel, filename):
    fobj=open(filename, 'rb')
    key = prefix+'/'+channel
    url = 's3://{}/{}/{}'.format(bucket, key, filename)
    print('Writing to {}'.format(url))
    write_to_s3(fobj, bucket, key)

CPU times: user 20 µs, sys: 1e+03 ns, total: 21 µs
Wall time: 24.1 µs


In [6]:
# download file from S3
FILE_DATA = 'domainsDataSet'
s3 = boto3.client('s3')
s3.download_file(bucket, 'domainsDataSet.csv', FILE_DATA)

In [8]:
# Load the dataset
df_domains = pd.read_csv(FILE_DATA)

df_domains.tail()

Unnamed: 0,domain,domain_type
1275047,8thdeadlysim.com,benign
1275048,amiami.com,benign
1275049,freedirectorywebsites.com,benign
1275050,ghaninia.ir,benign
1275051,gndoqarrd.dj,dga


In [24]:
df_domains.columns

Index(['domain', 'domain_type', 'domain_subdomain', 'Y'], dtype='object')

In [10]:
import tldextract

def extract_domain_subdomain(record):
    domain = record.domain
    ret=''
    try:
        ext = tldextract.extract(domain)
        ret = ext.domain
    except :
        print(record)
    return ret
def get_y(row):
    if row.domain_type.lower()=='dga':
        return 1
    elif row.domain_type.lower()=='benign':
        return 0
    else :
        return 1
    
df_domains.loc[:,'domain_subdomain'] = df_domains.apply(lambda row : extract_domain_subdomain(row), axis=1 )
## 1 for dga and 0 for benign
df_domains.loc[:,'Y'] = df_domains.apply(lambda row : get_y(row), axis=1 )

domain         NaN
domain_type    dga
Name: 308868, dtype: object
domain            NaN
domain_type    benign
Name: 940873, dtype: object


In [11]:
def pad(l, content, width):
        l.extend([content] * (width - len(l)))
        return l
def features_extract(domain):
    ch_list = []
    for ch in domain :
        ch_int = ord(ch)
        ch_list = [*ch_list,ch_int]
    # pad zeros up to 63 length
    return pad(ch_list,0,63)

In [12]:
## tldextract example
url = 'https://www.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcde.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.com'
ext=tldextract.extract(url)
len(features_extract(ext.domain))
ext.domain

'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk'

In [13]:
# features X
X = [features_extract(D) for D in df_domains.domain_subdomain]
df = pd.DataFrame(X)
df.insert(loc=0,column='Y',value=df_domains['Y'])
df.to_csv('dataset.csv',header=False,index=False)


In [None]:
%%time
# split df in training , validation and test and save as file
FILE_TRAIN = 'domainsDataSet.train'
FILE_VALIDATION = 'domainsDataSet.validation'
FILE_TEST = 'domainsDataSet.test'

data_split(features=X,output_label=df_domains['Y'],train_file=FILE_TRAIN, validation_file=FILE_VALIDATION, test_file=FILE_TEST,test_size=0.2, random_state=42)

#upload the files to the S3 bucket
upload_to_s3(bucket, 'train', FILE_TRAIN)
upload_to_s3(bucket, 'validation', FILE_VALIDATION)
upload_to_s3(bucket, 'test', FILE_TEST)


RangeIndex(start=0, stop=63, step=1)


In [None]:
# create input
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation'.format(bucket, prefix), content_type='csv')

In [None]:
# train model
from sagemaker.amazon.amazon_estimator import get_image_uri

role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/DEMO-xgboost-abalone-default'

container = get_image_uri(region, 'xgboost','0.90-1')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

xgb.set_hyperparameters(max_depth=2,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100,
                        eval_metric='auc')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
!pip install pickle-mixin
!pip install xgboost==0.90

In [None]:
import tarfile
import pickle as pkl

path_key = f'{prefix}/output/sagemaker-xgboost-2020-05-07-12-31-15-641/output'
# download the model artifact from AWS S3
s3 = boto3.client('s3')
s3.download_file(bucket, f'{path_key}/model.tar.gz', 'model.tar.gz')

! ls -lrt


#opens the downloaded model artifcat and loads it as 'model' variable
tar = tarfile.open('model.tar.gz')
tar.extractall()
tar.close()
file = open('xgboost-model', 'rb')
model = pkl.loads(file.read())