This notebook gives you a quick preview of the outcome you will get when you complete the full notebook for this solution.

Here we use a pre-trained XGBoost model to make predictions on our test dataset and evaluate its accuracy.

You can select Run->Run All from the menu to run all cells in Studio (or Cell->Run All in a SageMaker Notebook Instance).

In [None]:
# Add the local ./src/ directory to the module search path so that the
# `package` module imported below can be resolved.
import sys
sys.path.append('./src/')
# `config` supplies the S3 bucket, region, solution name and endpoint prefix
# that the cells below read.
from package import config

## Read in the data

In [None]:
import boto3
from zipfile import ZipFile

# Download the zipped dataset from the solution's regional S3 bucket into the
# current working directory.
s3 = boto3.resource('s3')
# Named `dataset_archive` rather than `object` so the Python builtin `object`
# is not shadowed for the rest of the kernel session.
dataset_archive = s3.Object(
    f"{config.SOLUTIONS_S3_BUCKET}-{config.AWS_REGION}",
    f"{config.SOLUTION_NAME}/data/creditcardfraud.zip",
)
dataset_archive.download_file("creditcardfraud.zip")

# Unpack the archive next to the notebook.
with ZipFile('creditcardfraud.zip', 'r') as zf:
    zf.extractall()

## Split into train/test

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the comma-separated table into a DataFrame.
data = pd.read_csv('creditcard.csv', delimiter=',')

# The last column is used as the label; every column before it is a feature.
label_column = data.columns[-1]
feature_columns = data.columns[:-1]

# Convert both parts to float32 NumPy arrays.
labels = data[label_column].values.astype(np.float32)
features = data[feature_columns].values.astype(np.float32)

# Hold out 10% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)

## Set up a predictor, using the demo endpoint, and a pre-trained model

In [None]:
# Name of the demo endpoint that hosts the pre-trained model.
endpoint_name = "{}-demo".format(config.SOLUTION_PREFIX)

# SageMaker Python SDK v2 renamed RealTimePredictor -> Predictor,
# csv_serializer -> CSVSerializer and `endpoint` -> `endpoint_name`, and
# dropped the `content_type` argument (the serializer now carries it).
# Prefer the v2 names and fall back to the v1 API when they are unavailable.
try:
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import CSVSerializer
except ImportError:
    # SDK v1: same call as before.
    from sagemaker.predictor import csv_serializer, RealTimePredictor

    xgb_predictor = RealTimePredictor(endpoint=endpoint_name,
                                      serializer=csv_serializer,
                                      deserializer=None,
                                      content_type='text/csv')
else:
    # SDK v2: the default deserializer returns the raw response bytes, which
    # is what the batching helper further down decodes.
    xgb_predictor = Predictor(endpoint_name=endpoint_name,
                              serializer=CSVSerializer())

In [None]:
# Because we have a large test set, we call predict on smaller batches
def predict(current_predictor, data, rows=500):
    """Score ``data`` in batches and return all predictions as one array.

    Args:
        current_predictor: Object whose ``predict(batch)`` method returns the
            predictions for ``batch`` as UTF-8 encoded, comma-separated bytes.
        data: Array of rows to score. It must expose ``shape`` and be
            splittable with ``np.array_split``.
        rows: Upper bound on the number of rows sent per request.

    Returns:
        One-dimensional NumPy float array with the parsed predictions, in the
        same order as the rows of ``data``. Empty when ``data`` has no rows.
    """
    # Nothing to score: skip the endpoint call instead of sending an empty batch.
    if data.shape[0] == 0:
        return np.array([], dtype=float)

    # floor(n / rows) + 1 batches guarantees that no batch exceeds `rows` rows.
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))

    # Collect each decoded response, then join once (no repeated string
    # concatenation and no leading separator to strip afterwards).
    responses = [
        current_predictor.predict(array).decode('utf-8')
        for array in split_array
    ]
    return np.fromstring(','.join(responses), sep=',')

## Make predictions and evaluate accuracy

In [None]:
# Score the held-out test rows against the demo endpoint (default batch size).
raw_preds = predict(current_predictor=xgb_predictor, data=X_test)

In [None]:
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score

# The scikit-learn metrics need hard 0/1 labels: raw predictions above 0.5
# become 1, everything else becomes 0.
y_preds = (raw_preds > 0.5).astype(int)

# Compute and report each metric in turn against the true test labels.
for template, metric in (
    ("Balanced accuracy = {}", balanced_accuracy_score),
    ("Cohen's Kappa = {}", cohen_kappa_score),
):
    print(template.format(metric(y_test, y_preds)))