In [None]:
from IPython.display import HTML
HTML('''<h2>IMT 575: Build DGA model using XGBOOST in Sagemaker</h2>
<b><pre>
    Authors: 
    Aftab Alam
    </pre>
</b> 
<p>Date/Time: <span id="datetime"></span></p><script>var dt = new Date();
document.getElementById("datetime").innerHTML=dt.toLocaleString();</script> </p>''')

In [None]:
# To run the AWS boto3 APIs locally, first set up the proper IAM roles (SageMaker, Lambda execution, CloudWatch, ...).
# The code below reads the credentials of a named profile so that they can be passed to a boto3 session.
# Thanks to https://gist.github.com/wjimenez5271/defeede8eb4a63afc9d8
def get_profile_credentials(profile_name):
    """Read an AWS access key pair from ``~/.aws/credentials``.

    The section named ``profile_name`` is tried first; if that section or one
    of its two options is missing, the ``default`` section is used instead.

    Args:
        profile_name: name of the credentials-file section to look up.

    Returns:
        Tuple ``(aws_access_key_id, aws_secret_access_key)``.

    Raises:
        configparser.ParsingError: the credentials file is malformed.
        configparser.NoSectionError / configparser.NoOptionError: neither the
            requested profile nor ``default`` holds a complete key pair.
    """
    from configparser import ConfigParser
    from configparser import ParsingError
    from configparser import NoOptionError
    from configparser import NoSectionError
    from os import path
    config = ConfigParser()
    try:
        # ParsingError is raised while the file is being read, so the read has
        # to sit inside the try for the handler below to ever run.
        config.read([path.join(path.expanduser("~"),'.aws/credentials')])
        aws_access_key_id = config.get(profile_name, 'aws_access_key_id')
        aws_secret_access_key = config.get(profile_name, 'aws_secret_access_key')
    except ParsingError:
        print('Error parsing config file')
        raise
    except (NoSectionError, NoOptionError):
        # Requested profile is absent or incomplete: fall back to [default].
        try:
            aws_access_key_id = config.get('default', 'aws_access_key_id')
            aws_secret_access_key = config.get('default', 'aws_secret_access_key')
        except (NoSectionError, NoOptionError):
            print('Unable to find valid AWS credentials')
            raise
    return aws_access_key_id, aws_secret_access_key
aws_access_key_id,aws_secret_access_key = get_profile_credentials('aftabuw')
LOCAL=1

In [None]:
import sys
! conda install -y -c conda-forge ipywidgets
! pip install tldextract


In [None]:
import time
import sys
import os
import boto3
import numpy as np
import pandas as pd
# for extracting domain name
import tldextract

%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

# Sagemake 
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.predictor import csv_serializer
from sagemaker.tensorflow import TensorFlow

# sklearn
from sklearn.model_selection import train_test_split

# enable flag to how all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Set up the boto3 session, the SageMaker clients and the SageMaker session.
print(f"Runing locally : {LOCAL==1}")
if LOCAL == 1:
    # Running outside SageMaker: use the credentials read from ~/.aws/credentials.
    # Change this ARN to the role you want SageMaker to assume.
    role='arn:aws:iam::099176660580:role/service-role/AmazonSageMaker-ExecutionRole-20200505T194950'
    boto_session = boto3.Session(aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key)
else:
    # Running on a SageMaker notebook instance: the instance role supplies credentials.
    role = sagemaker.get_execution_role()
    boto_session = boto3.Session()

# Everything below is identical for both modes; the clients inherit the
# credentials of boto_session, so they are created from it once.
region = boto_session.region_name
sagemaker_client = boto_session.client('sagemaker', region_name=region)
sagemaker_runtime_client = boto_session.client('sagemaker-runtime', region_name=region)
sagemaker_session = sagemaker.Session(boto_session=boto_session,
                                      sagemaker_client=sagemaker_client,
                                      sagemaker_runtime_client=sagemaker_runtime_client)

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/xgboost_exp5/length'
# customize to your bucket where you have stored the data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

# Data Clean up and Feature engineering

**Functions for splitting the data and saving it to S3 as well as to the local file system**

In [None]:
%%time
# split data and save in s3

import io
import boto3
import random
from sklearn.model_selection import train_test_split

def appendXY(X,Y):
    """Return one frame whose first column ``'Y'`` holds the labels, followed by the feature columns.

    The shape of the label frame and of the feature frame is printed as each
    one is built. Columns are joined side by side on the row index.
    """
    label_frame = pd.DataFrame(Y)
    print(label_frame.shape)
    label_frame.columns = ['Y']
    feature_frame = pd.DataFrame(X)
    print(feature_frame.shape)
    combined = pd.concat([label_frame, feature_frame], axis=1)
    return combined
   
# The DGA families have very different numbers of records, so we need to make sure that both the test and the
# training data contain every DGA family. Hence we first split the domain data per family and only then extract
# features. This avoids a class-imbalance issue between the splits.

def split_dataframe(data,target='family',test_size=.2,random_state=None):
    """Split ``data`` into train and test frames, sampling each class separately.

    For every distinct value of ``data[target]`` a fraction ``test_size`` of
    that class's rows is drawn for the test frame; all rows whose index label
    was not drawn form the train frame. Because membership is decided by index
    label, the index of ``data`` should be unique.

    Args:
        data: frame to split.
        target: column holding the class to stratify on.
        test_size: fraction (0-1) of every class that goes to the test frame.
        random_state: optional seed passed to ``DataFrame.sample`` so the
            split can be reproduced; ``None`` keeps the sampling random.

    Returns:
        Tuple ``(df_train, df_test)``.
    """
    # Collect the per-class samples and concatenate once; growing a frame
    # inside the loop copies all accumulated rows on every iteration.
    sampled_parts = [
        data[data[target] == value].sample(frac=test_size, random_state=random_state)
        for value in data[target].unique()
    ]
    # pd.concat rejects an empty list, so an input without rows yields an
    # empty test frame that keeps the input's columns.
    df_test = pd.concat(sampled_parts) if sampled_parts else data.iloc[0:0]
    df_train = data[~data.index.isin(df_test.index)]

    return df_train,df_test

def save_features_file(features,output_label,file):
    """Write labels and features to ``file`` as CSV without header or index.

    The label is placed in the first column and the feature columns follow
    (the frame is assembled by ``appendXY``).
    """
    labelled = appendXY(features, output_label)
    labelled.to_csv(file, header=False, index=False)



def write_to_s3(fobj, bucket, key):
    """Upload the open binary file object ``fobj`` to ``bucket`` under ``key`` using ``boto_session``."""
    s3_object = boto_session.resource('s3').Bucket(bucket).Object(key)
    return s3_object.upload_fileobj(fobj)

def upload_to_s3(bucket, channel, filename):
    """Upload the local file ``filename`` to ``s3://<bucket>/<prefix>/<channel>``.

    The object key is ``prefix + '/' + channel`` (the file name is not part
    of the key).

    Args:
        bucket: destination bucket name.
        channel: channel name appended to the module-level ``prefix``.
        filename: path of the local file to upload.
    """
    key = prefix+'/'+channel
    # Report the key that is really written; it does not contain the file name.
    url = 's3://{}/{}'.format(bucket, key)
    print('Writing {} to {}'.format(filename, url))
    # The context manager closes the file handle even if the upload fails.
    with open(filename, 'rb') as fobj:
        write_to_s3(fobj, bucket, key)

**Download DGA and benign data set from S3 bucket**

In [None]:
# download file from S3
#FILE_DATA = 'domainsDataSet'
#s3 = boto_session.client('s3')
#s3.download_file(bucket, 'newDataSample.csv', FILE_DATA)

In [None]:
df_benign = pd.read_csv("alexa_cisco_onenpage_7mn_begin_dataset.csv.gz").sample(frac=1,random_state=1122)
df_dga = pd.read_csv("sevenmillons_dga.csv.gz")

In [None]:

df_benign.shape
df_benign.head()
df_dga.shape
df_dga.head()


In [None]:
df_benign.dropna(how='any',inplace=True)
df_dga.dropna(how='any',inplace=True)

In [None]:
%%time
# Add the domain length and the number of unique characters.
# Mapping over the single 'domain' column gives the same values as a row-wise
# apply(axis=1) but avoids building a Series object for every row.
df_benign.loc[:,'uniquechar'] = df_benign['domain'].map(lambda name: len(set(name)))
df_benign.loc[:,'length'] = df_benign['domain'].map(len)

In [None]:
df_benign.reset_index(inplace=True,drop=True)
df_dga.reset_index(inplace=True,drop=True)

In [None]:
df_dga.head()


In [None]:
df_benign.head()

In [None]:
# combine benign and dga data set
df_domains = pd.concat([df_dga[['domain','family','label','uniquechar','length']],
                        df_benign[['domain','family','label','uniquechar','length']]])

In [None]:
df_domains.drop_duplicates(subset=['domain'],keep='first',inplace=True)

In [None]:
df_domains.shape

In [None]:
FILE_DATA = 'domainsDataSet_15mn.csv.gz'
df_domains.to_csv(FILE_DATA,compression="gzip",index=False)
df_domains= pd.read_csv(FILE_DATA)

Load and inspect data

In [None]:
# Load the dataset
#df_domains = pd.read_csv(FILE_DATA)

df_domains.tail()
df_domains.head()
df_domains.groupby('label').agg('count')

Function to extract domain from full domain name

In [None]:
%%time

def extract_domain_subdomain(record):
    """Return the registered-domain part of a record's fully qualified name.

    The name is read from the record's ``domainName`` attribute when present,
    otherwise from ``domain`` (the column name used by ``df_domains``).
    The ``domain`` field of ``tldextract.extract`` is returned; if extraction
    fails the record is printed and an empty string is returned.
    """
    # The combined frame stores the name under 'domain'; 'domainName' is still
    # accepted so records using the older column name keep working.
    domain = getattr(record, 'domainName', None)
    if domain is None:
        domain = record.domain
    ret=''
    try:
        ext = tldextract.extract(domain)
        ret = ext.domain
    except Exception:
        # Best effort: report the offending record and fall back to ''.
        print(record)
    return ret
def get_y(row):
    """Map a row's ``label`` to the binary target: 0 for 'good' (case-insensitive), 1 for anything else."""
    label = row.label.lower()
    return 0 if label == 'good' else 1
    


### Create new columns for domain and dga binary

In [None]:
%%time
df_domains.loc[:,'domain_subdomain'] = df_domains.apply(lambda row : extract_domain_subdomain(row), axis=1 )


In [None]:
%%time
## 1 for dga and 0 for benign
df_domains.loc[:,'Y'] = df_domains.apply(lambda row : get_y(row), axis=1 )

In [None]:
df_domains= df_domains[df_domains.family!='others']
df_domains.head()

In [None]:
df_domains.loc[:,'duplicate']=df_domains.duplicated(subset=['domain'],keep=False)

In [None]:
df_domains[['duplicate','family','label']].groupby(['duplicate']).count()

In [None]:
df_domains[(df_domains.duplicate)]

In [None]:
df_domains[(df_domains.duplicate) & (df_domains.domain=='nv5')]

In [None]:
# Find domains that occur in both the DGA and the benign data. The stated intent is to keep the benign row and
# drop the DGA row in that case.
# NOTE(review): the original note says "using last" because the benign data set is appended at the end, but the
# call below passes keep="first" — confirm which of the two is intended.
df_domains.drop_duplicates(subset=['domain'],keep="first",inplace=True)

In [None]:
df_domains[['family','label']].groupby(['label']).count()

In [None]:
df_domains[df_domains.domain.isna()]

In [None]:
# Drop the rows listed by the previous cell (domain is NaN) by condition rather
# than by a hard-coded row label, which breaks as soon as the data changes.
df_domains.dropna(subset=['domain'],inplace=True)

In [None]:
df_domains.head()
df_domains.reset_index(inplace=True)

In [None]:
# correctly family names
def family(fam):
    """Return the canonical family name for ``fam``; names without a known correction are returned unchanged."""
    corrections = {
        'dnscharger': 'dnschanger',
        'conficker': 'conflicker',
        'dircypt': 'dircrypt',
        'goz': 'gozi',
        'locy': 'locky',
        'nymaim': 'nymain',
        'un_js': 'unjavascript',
        'alexa': 'benign',
    }
    return corrections.get(fam, fam)

In [None]:
df_domains.loc[:,'family'] = df_domains.apply(lambda row: family(row.family), axis=1)

In [None]:
df_domains[['family','uniquechar','length']].groupby(['family']).agg(['count','mean','max'])

In [None]:
df = df_domains[['family','uniquechar','length']].groupby(['family']).agg(['count','mean','max'])

In [None]:
df.to_csv("data_distribution.csv")

### function to convert domain into features

In [None]:
VALID_CHARS = 'abcdefghijklmnopqrstuvwxyz0123456789-_.'
LOOKUP_TABLE = None
def pad(l, content, width):
    """Extend list ``l`` in place with ``content`` until it has ``width`` items and return it.

    A list that already has ``width`` or more items is returned unchanged.
    """
    missing = width - len(l)
    l.extend(content for _ in range(missing))
    return l
    
def check_validchar(domain):
    """Return True when every character of the lower-cased ``domain`` is in ``VALID_CHARS``."""
    return all(ch in VALID_CHARS for ch in domain.lower())

    

def features_extract(domain):
    """Encode ``domain`` as a list of 63 integer character codes.

    Each character of the lower-cased domain is replaced by its 1-based
    position in ``VALID_CHARS``; the list is right-padded with 0 to length 63.
    Characters that are not in ``VALID_CHARS`` are reported and skipped.
    A domain longer than 63 characters is encoded as 63 zeros.

    The character table is built on first use and cached in the module-level
    ``LOOKUP_TABLE``.
    """
    global VALID_CHARS
    global LOOKUP_TABLE
    if not LOOKUP_TABLE:
        # 0 is reserved for padding, so character codes start at 1.
        LOOKUP_TABLE = {char: code for code, char in enumerate(VALID_CHARS, start=1)}

    rvalue = list()
    if len(domain)<=63:
        for c in domain.lower():
            try:
                rvalue.append(LOOKUP_TABLE[c])
            except KeyError:
                # Only an unknown character can fail here; anything else
                # should surface instead of being swallowed.
                print(f"Char error out in {domain}: {c}")

    rvalue=pad(rvalue,0,63)
    return rvalue



In [None]:
%%time
df_temp = df_domains.head(10)
x = [features_extract(D) for D in df_temp.domain]

In [None]:
x

In [None]:
%%time
# check if domain is valid
df_domains.loc[:,'valid'] = df_domains.apply(lambda row : check_validchar(row.domain), axis=1 )


In [None]:
df_domains=df_domains[df_domains.valid]

In [None]:
# Drop invalid data with invalid char
df_domains.shape
df_domains=df_domains[df_domains.valid].copy()
df_domains.shape

In [None]:
%%time
# split the data into train, validation and test sets
df_train_valid,df_test = split_dataframe(data=df_domains,test_size=.1)
df_train,df_valid = split_dataframe(data=df_train_valid,test_size=.1)

In [None]:
df_train.head()
df_valid.head()
df_test.head()

In [None]:
df_valid.family.unique()
df_train.family.unique()
df_test.family.unique()

### Create feature Vector for train, validation and test

In [None]:
%%time
# features X train vector
# NOTE(review): the features are built from df_train_valid (the frame that df_train and df_valid
# are split from), not from df_train — confirm this is intended.
X_train = [features_extract(D) for D in df_train_valid.domain]
X_train = np.array(X_train)
X_train.shape


In [None]:
%%time
# features X valid vector
X_valid = [features_extract(D) for D in df_valid.domain]
X_valid = np.array(X_valid)
X_valid.shape

In [None]:
%%time
# features X test vector
X_test = [features_extract(D) for D in df_test.domain]
X_test = np.array(X_test)
X_test.shape

Split data feature set and create files that can be used for sagemaker model

In [None]:
%%time
# Combine X and Y and save them into files
FILE_TRAIN = 'domainsDataSet.train'
FILE_VALIDATION = 'domainsDataSet.validation'
FILE_TEST = 'domainsDataSet.test'

# X_train holds one row per df_train_valid row, while the training labels come
# from df_train (a subset of df_train_valid in the same row order). Keep only
# the feature rows that belong to df_train so features and labels line up.
if len(X_train) == len(df_train_valid):
    X_train_fit = X_train[df_train_valid.index.isin(df_train.index)]
else:
    X_train_fit = X_train
assert len(X_train_fit) == len(df_train), "training features and labels are misaligned"

save_features_file(features=X_train_fit,output_label=df_train['Y'].values,file=FILE_TRAIN)
save_features_file(features=X_valid,output_label=df_valid['Y'].values,file=FILE_VALIDATION)
save_features_file(features=X_test,output_label=df_test['Y'].values,file=FILE_TEST)

#upload the files to the S3 bucket
upload_to_s3(bucket, 'train', FILE_TRAIN)
upload_to_s3(bucket, 'validation', FILE_VALIDATION)
upload_to_s3(bucket, 'test', FILE_TEST)


In [None]:
# check fields()
from sys import platform
print("Running for platforr: ",platform)
if 'win' in platform:
    print("Validation data")
    !powershell -command "& {Get-Content domainsDataSet.validation -TotalCount 2}"
    print("test data")
    !powershell -command "& {Get-Content domainsDataSet.test -TotalCount 2}"
    print("train data")
    !powershell -command "& {Get-Content 'domainsDataSet.train' -TotalCount 2}"
else:
    print("Validation data")
    !head -3 domainsDataSet.validation
    print("test data")
    !head -3 domainsDataSet.test
    print("train data")
    !head -3 domainsDataSet.train
        

In [None]:
# if train and test data is already saved repeat from here.

### Create Models
Here we will try an XGBoost model and an LSTM model and tune them for the best performance.

#### Set input

In [None]:
# create input
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation'.format(bucket, prefix), content_type='csv')

#### XGBOOST classification

__Reading references__:  
https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd  
https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone.ipynb  
https://aws.amazon.com/blogs/machine-learning/call-an-amazon-sagemaker-model-endpoint-using-amazon-api-gateway-and-aws-lambda/

In [None]:
%%time
# train model
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(region, 'xgboost','1.0-1')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.c5.4xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sagemaker_session)

# Fit the model with the parameters of the previously best tuned model for this data set;
# alternatively start from the defaults and tune the model later.
# NOTE(review): objective='reg:squarederror' trains a regressor, while the label column is 0/1 and the
# predictions are later thresholded at 0.5 — confirm that a regression objective is intended here.
xgb.set_hyperparameters(base_score=0.5, 
                        booster='gbtree', #['gbtree', 'gblinear', 'dart']
                        colsample_bylevel=0.3328968814794882,
                        colsample_bynode=1, 
                        colsample_bytree=0.7460086251908613, 
                        gamma=4.36472704596215, 
                        #reg_lambda=18.34813124562997,
                        alpha=458.20153739471834,
                        max_delta_step=8, max_depth=6,
                        min_child_weight=7.4485695445680005,
                        scale_pos_weight=1, subsample=.9, tree_method='auto',
                        eta=0.4008765966370876,
                        silent=1,
                        objective='reg:squarederror', #reg:squarederror
                        num_round=200
                       )

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})
## Deploy trained XGBoost model endpoint to perform predictions
xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.t2.medium')

# make sure to set content type to csv as we have data in csv format
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

## Function to chunk down test set into smaller increments

def predict(data, model, rows=500):
    """Score ``data`` in chunks and return all predictions as one float array.

    ``data`` is split into ``int(len(data) / rows + 1)`` nearly equal chunks;
    each chunk is passed to ``model.predict``, whose return value is expected
    to be UTF-8 bytes holding comma-separated numbers.

    Args:
        data: sequence of records accepted by ``model.predict``.
        model: object with a ``predict(chunk) -> bytes`` method.
        rows: approximate upper bound on the number of records per request.

    Returns:
        1-D numpy array of the parsed predictions (empty for empty ``data``).
    """
    # Nothing to score: do not send an empty request to the endpoint.
    if len(data) == 0:
        return np.array([])
    chunks = np.array_split(data, int(len(data) / float(rows) + 1))
    responses = [model.predict(chunk).decode('utf-8') for chunk in chunks]
    return np.fromstring(','.join(responses), sep=',')

# `%%time` is a cell magic and is only valid as the first line of a cell, so it is not repeated here.
## Generate predictions on the test set for the difference models
with open('domainsDataSet.test', 'r') as f:
    payload = f.read().strip()
# Every line is "<label>,<63 features>": the first character is the 0/1 label,
# everything after the comma is the CSV feature row sent to the endpoint.
records = payload.split('\n')
labels = [int(line[0]) for line in records]
test_data = [line[2:] for line in records]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
from sklearn.metrics import accuracy_score ,confusion_matrix
thresh = 0.5
y_pred = predictions
y_pred_binary = np.where(predictions > thresh, 1, 0)
accuracy_score(labels,y_pred_binary)
confusion_matrix(labels,y_pred_binary)


In [None]:
!pip install pickle-mixin
!pip install xgboost==1.0.1

In [None]:
import tarfile
import pickle as pkl
#sagemaker/xgboost_exp4/output/sagemaker-xgboost-2020-05-17-09-23-49-951/output
path_key = f'{prefix}/output/sagemaker-xgboost-2020-05-27-15-30-45-659/output'
# download the model artifact from AWS S3
s3 = boto_session.client('s3')
s3.download_file(bucket, f'{path_key}/model.tar.gz', 'model.tar.gz')
#opens the downloaded model artifcat and loads it as 'model' variable
tar = tarfile.open('model.tar.gz')
tar.extractall()
tar.close()
file = open('xgboost-model', 'rb')
model = pkl.loads(file.read())

# list directory and check if model file is present
if 'win' in platform:
    ! dir
else:
    ! ls -lrt




In [None]:
# take one records from test file
if 'win' in platform:
    !powershell -command "& {Get-Content domainsDataSet.test -TotalCount 1}" > single.test
else:
    !head -1 domainsDataSet.test > single.test

In [None]:
import xgboost
with open('single.test', 'r') as f:
    payload = f.read().strip()
    print(payload)
# A str passed to DMatrix is interpreted as a file path/URI, so the CSV record
# is parsed first: column 0 is the label, the remaining columns are features.
record = np.array(payload.split(','), dtype=float)
dtrain = xgboost.DMatrix(record[1:].reshape(1, -1), label=record[:1])
model.predict(dtrain)

In [None]:
import xgboost
#map_names = dict(zip(model.feature_names, df_domains.columns))
#model.feature_names = list(map_names.values())

#plot feature importance
fig, ax = plt.subplots(figsize=(12,12))
xgboost.plot_importance(model, importance_type='gain', max_num_features=30, height=0.8, ax=ax, show_values = False)
plt.title('Feature Importance')
plt.show()

#### Deploy the model

In [None]:
## Deploy trained XGBoost model endpoint to perform predictions
# Reuse the predictor if an endpoint was already deployed in this session;
# deploying the same estimator again would try to create a second endpoint.
try:
    xgb_predictor
except NameError:
    xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.t2.medium')


In [None]:
# make sure to set content type to csv as we have data in csv format
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

## Function to chunk down test set into smaller increments

def predict(data, model, rows=500):
    """Score ``data`` in chunks and return all predictions as one float array.

    ``data`` is split into ``int(len(data) / rows + 1)`` nearly equal chunks;
    each chunk is passed to ``model.predict``, whose return value is expected
    to be UTF-8 bytes holding comma-separated numbers.

    Args:
        data: sequence of records accepted by ``model.predict``.
        model: object with a ``predict(chunk) -> bytes`` method.
        rows: approximate upper bound on the number of records per request.

    Returns:
        1-D numpy array of the parsed predictions (empty for empty ``data``).
    """
    # Nothing to score: do not send an empty request to the endpoint.
    if len(data) == 0:
        return np.array([])
    chunks = np.array_split(data, int(len(data) / float(rows) + 1))
    responses = [model.predict(chunk).decode('utf-8') for chunk in chunks]
    return np.fromstring(','.join(responses), sep=',')

# `%%time` is a cell magic and is only valid as the first line of a cell, so it is not used here.
## Generate predictions on the test set for the difference models
with open('domainsDataSet.test', 'r') as f:
    payload = f.read().strip()
# Every line is "<label>,<63 features>": the first character is the 0/1 label,
# everything after the comma is the CSV feature row sent to the endpoint.
records = payload.split('\n')
labels = [int(line[0]) for line in records]
test_data = [line[2:] for line in records]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
from sklearn.metrics import accuracy_score ,confusion_matrix
thresh = 0.5
y_pred = predictions
y_pred_binary = np.where(predictions > thresh, 1, 0)
accuracy_score(labels,y_pred_binary)
confusion_matrix(labels,y_pred_binary)


#### Test Model using test data from local file

In [None]:
## Function to chunk down test set into smaller increments

def predict(data, model, rows=500):
    """Score ``data`` in chunks and return all predictions as one float array.

    ``data`` is split into ``int(len(data) / rows + 1)`` nearly equal chunks;
    each chunk is passed to ``model.predict``, whose return value is expected
    to be UTF-8 bytes holding comma-separated numbers.

    Args:
        data: sequence of records accepted by ``model.predict``.
        model: object with a ``predict(chunk) -> bytes`` method.
        rows: approximate upper bound on the number of records per request.

    Returns:
        1-D numpy array of the parsed predictions (empty for empty ``data``).
    """
    # Nothing to score: do not send an empty request to the endpoint.
    if len(data) == 0:
        return np.array([])
    chunks = np.array_split(data, int(len(data) / float(rows) + 1))
    responses = [model.predict(chunk).decode('utf-8') for chunk in chunks]
    return np.fromstring(','.join(responses), sep=',')

# `%%time` is a cell magic and is only valid as the first line of a cell, so it is not used here.
## Generate predictions on the test set for the difference models
with open('domainsDataSet.test', 'r') as f:
    payload = f.read().strip()
# Every line is "<label>,<63 features>": the first character is the 0/1 label,
# everything after the comma is the CSV feature row sent to the endpoint.
records = payload.split('\n')
labels = [int(line[0]) for line in records]
test_data = [line[2:] for line in records]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
from sklearn.metrics import accuracy_score ,confusion_matrix
thresh = 0.5
y_pred = predictions
y_pred_binary = np.where(predictions > thresh, 1, 0)
accuracy_score(labels,y_pred_binary)
confusion_matrix(labels,y_pred_binary)

In [None]:


## Generate predictions on the test set for the difference models
with open('single.test', 'r') as f:
    payload = f.read().strip()
labels = [int(line[0]) for line in payload.split('\n')]
test_data = [line[2:] for line in payload.split('\n')]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
predictions



In [None]:
# take one records from test file
if 'win' in platform:
    !powershell -command "& {Get-Content domainsDataSet.test -TotalCount 10}" > ten_records.test
else:
    !head -10 domainsDataSet.test > ten_records.test

In [None]:

## Generate predictions on the test set for the difference models
with open('ten_records.test', 'r') as f:
    payload = f.read().strip()
labels = [int(line[0]) for line in payload.split('\n')]
test_data = [line[2:] for line in payload.split('\n')]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
predictions

Test on all of the test data

In [None]:
%%time
## Generate predictions on the test set for the difference models
with open('domainsDataSet.test', 'r') as f:
    payload = f.read().strip()
labels = [int(line[0]) for line in payload.split('\n')]
test_data = [line[2:] for line in payload.split('\n')]
predictions = predict(test_data, xgb_predictor)
#xgb_predictor.predict(payload[2:]).decode('utf-8')
from sklearn.metrics import accuracy_score ,confusion_matrix
thresh = 0.5
y_pred = predictions
y_pred_binary = np.where(predictions > thresh, 1, 0)
accuracy_score(labels,y_pred_binary)
confusion_matrix(labels,y_pred_binary)

#### Create function to test using sagemaker runtime client

In [None]:
import os
import boto3
import json
import tldextract

# grab environment variables
ENDPOINT_NAME = os.getenv('ENDPOINT_NAME','sagemaker-predict-endpoint')
runtime= boto_session.client('runtime.sagemaker')

def extract_domain(record):
    """Return the ``domain`` field of ``tldextract.extract(record)``.

    ``record`` is the fully qualified domain name. If extraction fails the
    input is printed and an empty string is returned.
    """
    domain = record
    ret=''
    try:
        ext = tldextract.extract(domain)
        ret = ext.domain
    except Exception:
        # Best effort: report the offending input and fall back to ''.
        print(record)
    return ret

VALID_CHARS = 'abcdefghijklmnopqrstuvwxyz0123456789-_.'
LOOKUP_TABLE = None
def pad(l, content, width):
    """Extend list ``l`` in place with ``content`` until it has ``width`` items and return it.

    A list that already has ``width`` or more items is returned unchanged.
    """
    missing = width - len(l)
    l.extend(content for _ in range(missing))
    return l

def features(domain):
    """Encode ``domain`` as the CSV row sent to the prediction endpoint.

    Each character of the lower-cased domain is replaced by its 1-based
    position in ``VALID_CHARS``; the codes are right-padded with '0' to 63
    fields and joined with commas, i.e. the same 63-column layout that
    ``features_extract`` produces for the training files.
    Characters outside ``VALID_CHARS`` are reported and skipped. A domain that
    is longer than 63 characters or contains a space is encoded as 63 zeros.

    Returns:
        str with exactly 63 comma-separated integer fields.
    """
    global VALID_CHARS
    global LOOKUP_TABLE
    if not LOOKUP_TABLE:
        # 0 is reserved for padding, so character codes start at 1.
        LOOKUP_TABLE = {char: code for code, char in enumerate(VALID_CHARS, start=1)}
    rvalue = list()
    if len(domain)<=63 and ' ' not in domain:
        for c in domain.lower():
            code = LOOKUP_TABLE.get(c)
            if code is None:
                # Unknown character: skip it instead of failing the request.
                print(f"Char error out in {domain}: {c}")
            else:
                rvalue.append(str(code))

    rvalue=pad(rvalue,'0',63)
    # The unique-character ratio used to be glued onto the last field without a
    # separator, which corrupted the 63rd feature; the training files contain
    # the 63 character codes only, so the row holds exactly those fields.
    return ','.join(rvalue)
def lambda_handler(event, context):
    """Classify the domain in ``event['fqdn']`` as 'dga' or 'benign'.

    The event is logged, the domain part is extracted and encoded as a CSV
    feature row, and the row is sent to the SageMaker endpoint named by
    ``ENDPOINT_NAME``. The numeric response is printed; a value above 0.5
    yields 'dga', anything else 'benign'. ``context`` is not used.
    """
    print("Received event: " + json.dumps(event, indent=2))

    # Round-trip through JSON exactly as before, then pick the name to score.
    fqdn = json.loads(json.dumps(event))['fqdn']
    csv_row = features(extract_domain(fqdn))

    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/csv',
                                       Body=csv_row)
    pred = json.loads(response['Body'].read().decode())
    print(pred)
    if pred > .5:
        return 'dga'
    return 'benign'

In [None]:
event= {'fqdn':'www.google.com'}
context=None
lambda_handler(event,context)

In [None]:
df_feedback = pd.read_csv('feedback.csv')
df_feedback.head(5)
df_feedback.columns= ['domain','correct']
df_feedback.groupby(['correct']).count()
df_feedback_test = df_feedback[(df_feedback.correct == 'dga') | (df_feedback.correct =='benign')].copy()
df_feedback_test.head()

In [None]:
df_feedback_test.loc[:,'TesT'] = df_feedback_test.apply(lambda row: lambda_handler({'fqdn':row['domain']},None), axis=1)

In [None]:
from sklearn.metrics import accuracy_score ,confusion_matrix
df_feedback_test.columns
labels = df_feedback_test.correct
preds = df_feedback_test.TesT
accuracy_score(labels,preds)
confusion_matrix(labels,preds)

In [None]:
# The predictions ('TesT') were added to df_feedback_test, not to df_feedback.
df_feedback_test[df_feedback_test.correct!=df_feedback_test.TesT]

In [None]:
# we use the Hyperparameter Tuner
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import HyperparameterTuner

# Define exploration boundaries (default suggested values from Amazon SageMaker Documentation)
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0, 1000, scaling_type="Auto"),
    'colsample_bylevel': ContinuousParameter(0.1, 1,scaling_type="Logarithmic"),
    'colsample_bytree': ContinuousParameter(0.5, 1, scaling_type='Logarithmic'),
    'eta': ContinuousParameter(0.1, 0.5, scaling_type='Logarithmic'),
    'gamma':ContinuousParameter(0, 5, scaling_type='Auto'),
    'lambda': ContinuousParameter(0,100,scaling_type='Auto'),
    'max_delta_step': IntegerParameter(0,10,scaling_type='Auto'),
    'max_depth': IntegerParameter(0,10,scaling_type='Auto'),
    'min_child_weight': ContinuousParameter(0,10,scaling_type='Auto'),
    'num_round': IntegerParameter(1000,3000,scaling_type='Auto'),
    'subsample': ContinuousParameter(0.5,1,scaling_type='Logarithmic')}

objective_metric_name = 'validation:accuracy'

tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=1,
    strategy='Bayesian'
)

## Starts the hyperparameter tuning job
tuner_log.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata=False)

## Prints the status of the latest hyperparameter tuning job
boto_session.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

In [None]:
# Poll the tuning job every 10 minutes until it reaches a final state.
# 'Stopped' is final as well; without it a manually stopped job would keep
# this loop running forever.
FINAL_TUNING_STATES = ('Completed', 'Failed', 'Stopped')
status = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']
print(status)
while status not in FINAL_TUNING_STATES:
    time.sleep(600)
    status =  sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']
    print(status)

In [None]:
# get the best performance model job name from console and create endpoint and then predict using test data

In [None]:
%%time
# Import the best model of the tuning job for hosting
container = get_image_uri(region, 'xgboost','1.0-1')
from time import gmtime, strftime
#job_name='sagemaker-xgboost-200508-0001-004-0bb812be'
# describe_training_job expects the name of a training job. The tuner's
# latest_tuning_job name identifies the tuning job itself, so ask the tuner
# for its best training job instead.
job_name=tuner_log.best_training_job()
model_name=job_name + '-model'
print(model_name)

info = sagemaker_client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = sagemaker_client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

In [None]:
# create endpoint configuration
from time import gmtime, strftime

endpoint_config_name = 'XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':"ml.m5.large",
        'InitialVariantWeight':1,
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

In [None]:
%%time
# create end point

import time

endpoint_name = 'XGBoostEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

resp = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
while status=='Creating':
    print("Status: " + status)
    time.sleep(60)
    resp = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

In [None]:
df_feedback = pd.read_csv('model-2.csv',header=None)
df_feedback.head(5)
df_feedback.columns= ['domain','correct']

In [None]:
df_feedback.loc[:,'TesT'] = df_feedback.apply(lambda row: lambda_handler({'fqdn':row['domain']},None), axis=1)

In [None]:
from sklearn.metrics import accuracy_score ,confusion_matrix
df_feedback.columns
labels = df_feedback.correct
preds = df_feedback.TesT
accuracy_score(labels,preds)
confusion_matrix(labels,preds)

In [None]:
# Rows where the prediction disagrees with the expected label.
mismatch = df_feedback.correct != df_feedback.TesT
df_feedback[mismatch]

In [None]:
## Generate predictions on the test set for the difference models
df_test.loc[:,'TesT'] = df_test.apply(lambda row: lambda_handler({'fqdn':row['domain']},None), axis=1)

In [None]:

from sklearn.metrics import accuracy_score ,confusion_matrix
df_test.columns
labels = df_test.Y
# lambda_handler returns the strings 'dga' / 'benign', while Y is 1 for DGA and
# 0 for benign; convert the predictions to the same 0/1 encoding before scoring.
preds = (df_test.TesT == 'dga').astype(int)
accuracy_score(labels,preds)
confusion_matrix(labels,preds)

In [None]:
df_test.head()

## Training Tensorflow model - LSTM

In [None]:
git_config = {'repo': 'https://github.com/aftabalam01/machinelearningpipeline', 
              'branch': 'staging'}

In [None]:
X_train.shape

In [None]:
data_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(data_dir, exist_ok=True)

train_dir = os.path.join(os.getcwd(), 'data/train')
os.makedirs(train_dir, exist_ok=True)

test_dir = os.path.join(os.getcwd(), 'data/test')
os.makedirs(test_dir, exist_ok=True)

csv_test_dir = os.path.join(os.getcwd(), 'data/csv-test')
os.makedirs(csv_test_dir, exist_ok=True)

Y_train = np.array(df_train_valid['Y'])
Y_test = np.array(df_test['Y'])
np.save(os.path.join(train_dir, 'x_train.npy'), X_train)
np.save(os.path.join(train_dir, 'y_train.npy'), Y_train)
np.save(os.path.join(test_dir, 'x_test.npy'), X_test)
np.save(os.path.join(test_dir, 'y_test.npy'), Y_test)
np.savetxt(os.path.join(csv_test_dir, 'csv-test.csv'), np.array(X_valid, dtype=np.int32), fmt='%d', delimiter=",")

In [None]:
X_train.shape
# The label array is named Y_train (upper-case Y); y_train is not defined.
Y_train.shape

In [None]:
Y_train.shape
X_train[2:5]

In [None]:
import sagemaker

s3_prefix = 'tf-keras-dga'

traindata_s3_prefix = '{}/data/train'.format(s3_prefix)
testdata_s3_prefix = '{}/data/test'.format(s3_prefix)

train_s3 = sagemaker_session.upload_data(path='./data/train/', key_prefix=traindata_s3_prefix)
test_s3 = sagemaker_session.upload_data(path='./data/test/', key_prefix=testdata_s3_prefix)

inputs = {'train':train_s3, 'test': test_s3}
print(inputs)

In [None]:
# inputs ={'train': 's3://sagemaker-us-west-2-099176660580/tf-keras-dga/data/train', 'test': 's3://sagemaker-us-west-2-099176660580/tf-keras-dga/data/test'}

In [None]:

%matplotlib inline

import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import numpy as np
import os

from tensorflow.keras.preprocessing import sequence

max_features = 64
maxlen = 65

# print(len(x_train), 'train sequences')
# print(len(x_test), 'test sequences')

# x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
# print('x_train shape:', x_train.shape)
# print('x_test shape:', x_test.shape)

In [None]:
from sagemaker.tensorflow import TensorFlow

model_dir = '/opt/ml/model'
train_instance_type = 'ml.c5.4xlarge'
hyperparameters = {'epochs': 10, 'batch_size': 256, 'learning_rate': 0.01,'maxlen':63,'max_features':64}

# Use the role and SageMaker session configured at the top of the notebook
# (as the XGBoost estimator does); sagemaker.get_execution_role() only works
# on a SageMaker notebook instance and ignores the local credentials.
estimator = TensorFlow(
                       git_config=git_config,
                       source_dir='src/notebooks',
                       entry_point='tf-model.py',
                       model_dir=model_dir,
                       train_instance_type=train_instance_type,
                       train_instance_count=1,
                       hyperparameters=hyperparameters,
                       role=role,
                       sagemaker_session=sagemaker_session,
                       base_job_name='tf-dga',
                       framework_version='2.1',
                       py_version='py3',
                       script_mode=True)

In [None]:
estimator.fit(inputs)

In [None]:
# Dont forgot to delete end point
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

In [None]:
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)