# Baseline XGBoost Model
## Training gradient-boosted trees on node features

In [None]:
# NOTE(review): `utils` is imported before the working directory changes
# below -- presumably it is resolved relative to the notebook's own folder;
# confirm before reordering these lines.
from utils import get_data
import os
# Move the working directory up one level. The path is relative, so running
# this cell a second time moves up yet another level; restart the kernel
# before re-executing it.
os.chdir("../")

import pandas as pd
import numpy as np

# Run setup.sh through the shell; the relative path is resolved against the
# working directory set above.
!bash setup.sh

## Read data and upload to S3

In [None]:
import io
from sklearn.datasets import dump_svmlight_file

# Load the train/test features and labels from the project helper.
train_X, train_y, test_X, test_y = get_data()

# Serialize the training set in libsvm format for the training job.
# The first column of train_X is excluded from the features
# (assumed to be a non-feature identifier column -- TODO confirm).
# dump_svmlight_file writes directly to the binary file handle, so the
# serialized dataset is never held in an intermediate in-memory buffer.
filename = 'xgboost-fraud-dataset.libsvm'
with open(filename, 'wb') as out:
    dump_svmlight_file(train_X.values[:, 1:], train_y, out)

In [None]:
import os
import sagemaker
from sagemaker.s3 import S3Uploader

from sagemaker_graph_fraud_detection import config

# Role and bucket come from the project's config module.
role = config.role
bucket = config.solution_bucket
prefix = 'xgboost-fraud-detection'
session = sagemaker.Session()

# Upload the local libsvm file under s3://<bucket>/<prefix>/train.
train_prefix_uri = 's3://{}/{}/{}'.format(bucket, prefix, 'train')
s3_train_data = S3Uploader.upload(filename, train_prefix_uri)
print('Uploaded training data location: {}'.format(s3_train_data))

# Training artifacts are written under s3://<bucket>/<prefix>/output.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

## Train SageMaker XGBoost Estimator

In [None]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

# Resolve the XGBoost training image for the current region.
container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='0.90-2')

# Class-imbalance weight: sqrt(#negatives / #positives) over the training labels.
num_pos = sum(train_y)
num_neg = len(train_y) - num_pos
scale_pos_weight = np.sqrt(num_neg / num_pos)

hyperparams = dict(
    max_depth=5,
    subsample=0.8,
    num_round=100,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    silent=0,
    objective='binary:logistic',
    eval_metric='f1',
    scale_pos_weight=scale_pos_weight,
)

# Single-instance training job reading the uploaded libsvm file from the
# 'train' channel and writing artifacts to output_location.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    hyperparameters=hyperparams,
    output_path=output_location,
    sagemaker_session=session,
)
xgb.fit({'train': s3_train_data})

In [None]:
from sagemaker.predictor import csv_serializer

# Host the trained model behind a single-instance endpoint.
predictor = xgb.deploy(
    endpoint_name="xgboost-fraud-endpoint",
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

# Requests are serialized as CSV. No deserializer is installed, so the
# response is returned undecoded and the caller decodes it.
predictor.deserializer = None
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'

In [None]:
def predict(current_predictor, data, rows=500):
    """Score ``data`` against a predictor in batches and return the parsed scores.

    Args:
        current_predictor: object whose ``predict(batch)`` returns UTF-8
            encoded bytes holding comma-separated numeric scores.
        data: array with a ``shape`` attribute that ``np.array_split`` can
            split along its first axis (one row per record).
        rows: maximum number of rows sent in a single request; must be > 0.

    Returns:
        1-D float ``np.ndarray`` with the scores of all batches, concatenated
        in batch order. Empty input yields an empty array without calling
        the predictor.

    Raises:
        ValueError: if ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError("rows must be a positive number, got {!r}".format(rows))

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: do not send an empty payload to the endpoint.
        return np.array([], dtype=float)

    # Ceiling division gives the fewest batches that respect the `rows` cap
    # (no extra batch when n_rows is an exact multiple of rows).
    n_batches = int(np.ceil(n_rows / float(rows)))
    responses = [
        current_predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, n_batches)
    ]
    # Join once at the end instead of rebuilding the string on every batch.
    return np.fromstring(','.join(responses), sep=',')

# Score the test set (first column excluded from the features) and turn the
# returned scores into hard 0/1 labels with a 0.5 cut-off.
test_features = test_X.values[:, 1:]
raw_preds = predict(predictor, test_features)
y_preds = (raw_preds > 0.5).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
from matplotlib import pyplot as plt
%matplotlib inline

def print_metrics(y_true, y_predicted):
    """Print the confusion matrix and f1/precision/recall/accuracy for binary labels.

    Args:
        y_true: ground-truth labels encoded as 0 (negative) / 1 (positive).
        y_predicted: predicted labels with the same length and encoding.

    Returns:
        None. The metrics are written to stdout.
    """
    # Pin the label set so the matrix is always 2x2; without it, inputs that
    # contain a single class produce a 1x1 matrix and the unpack below fails.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(
        y_true, y_predicted, labels=[0, 1]).ravel()

    # Rows are predictions, columns are ground truth.
    cm_table = pd.DataFrame(np.array([[true_pos, false_pos], [false_neg, true_neg]]),
                            columns=["labels positive", "labels negative"],
                            index=["predicted positive", "predicted negative"])

    # Every ratio falls back to 0 when its denominator is empty.
    total = true_pos + true_neg + false_pos + false_neg
    acc = (true_pos + true_neg)/total if total > 0 else 0
    precision = true_pos/(true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
    recall = true_pos/(true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
    f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) > 0 else 0
    print("Confusion Matrix:")
    print(cm_table)
    print("f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, acc: {:.4f}".format(f1, precision, recall, acc))
    print()
    
def plot_roc_curve(fpr, tpr, roc_auc):
    """Draw a ROC curve on a new figure, with the AUC shown in the legend.

    Args:
        fpr: false positive rates (x values of the curve).
        tpr: true positive rates (y values of the curve).
        roc_auc: area under the curve, formatted to two decimals in the label.
    """
    line_width = 2
    _, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange', lw=line_width,
            label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal as a visual reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Model ROC curve')
    ax.legend(loc="lower right")

# Threshold-based metrics use the hard 0/1 predictions.
print_metrics(test_y, y_preds)
# The ROC curve must be built from the continuous scores: feeding it the
# thresholded labels collapses the curve to a single operating point and
# gives a misleading AUC.
fpr, tpr, _ = roc_curve(test_y, raw_preds)
roc_auc = auc(fpr, tpr)
plot_roc_curve(fpr, tpr, roc_auc)

In [None]:
# Delete the endpoint deployed earlier in this notebook; `predictor` can no
# longer serve predictions once this has run.
predictor.delete_endpoint()