# Predicting Whether a Breast Cancer Sample is Benign or Malignant

## Learning Objectives:


1. Understand what SageMaker Script Mode is, and how it can be leveraged.
2. Read in data from S3 to SageMaker.
3. Use prebuilt SageMaker containers to build, train, and deploy a custom sklearn model.
4. Use batch transform to perform inferences and measure model performance.


## Introduction
This is a breast cancer diagnosis dataset in which each sample is diagnosed as "Benign" or "Malignant". A number of features are also given for each sample. The source of the dataset is the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)).

For this model, we will build, train and deploy a [Multi-layer Perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html) using the sklearn library.



## Setup

In [None]:
# Ensure we have the right version of sagemaker (pins the package to 2.48.0)
%pip install sagemaker==2.48.0

In [None]:
# Import required libraries and create necessary clients
import boto3
import matplotlib.pyplot as plt
import pandas
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.s3 import S3Downloader, S3Uploader
import sklearn
import sklearn.metrics as metrics
from sklearn import model_selection
import s3fs

# S3 key prefix under which this notebook stores its files
PREFIX = "breast_cancer"

# SageMaker session, the session's default bucket, and the execution role
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

## Process the Data

In [None]:
# Fetch the sample data set from the sagemaker-sample-files bucket into ./data
S3Downloader.download(
    s3_uri="s3://sagemaker-sample-files/datasets/tabular/breast_cancer/wdbc.csv",
    local_path="data",
    sagemaker_session=sagemaker_session,
)

# Column names are supplied explicitly: an id, the diagnosis, and then ten
# measurements that each appear with the suffixes "_mean", "_se" and "_worst"
# (all "_mean" columns first, then all "_se", then all "_worst").
measurement_names = [
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave points",
    "symmetry",
    "fractal_dimension",
]
column_names = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for measurement in measurement_names
]

df_data = pandas.read_csv("data/wdbc.csv", names=column_names)
df_data.head()

In [None]:
# Get the feature names for analysis. The original column order is kept so
# that the feature layout is identical on every run (a set difference would
# yield an arbitrary order).
non_feature_columns = {"id", "diagnosis"}
features = [column for column in df_data.columns if column not in non_feature_columns]
# Encode the diagnosis as an integer label: Malignant ("M") is 1, Benign is 0
df_data["truth"] = (df_data["diagnosis"] == "M").astype(int)
# Keep only the feature columns followed by the encoded label
df_data = df_data[features + ["truth"]]

In [None]:
# Preview the first rows of the feature data frame
df_data.head()

In [None]:
# Hold out 30% of the rows as the test set; the other 70% form the training set
train_df, test_df = model_selection.train_test_split(df_data, test_size=0.3)
# Reorder the training columns so that the truth column comes first
train_df = train_df.loc[:, ["truth", *features]]

In [None]:
# Separate the test set into its labels (a plain list) and its feature columns
y_test = test_df["truth"].tolist()
x_test = test_df.loc[:, features]
print(f"The test data has shape {x_test.shape}")

In [None]:
# Both CSV files are uploaded to the same S3 location so that sagemaker can read them
s3_destination = f"s3://{BUCKET}/{PREFIX}"

# Training data: written with a header row and without the index
train_df.to_csv("data/train_data.csv", index=False)
training_data_path = S3Uploader.upload(
    local_path="data/train_data.csv",
    desired_s3_uri=s3_destination,
    sagemaker_session=sagemaker_session,
)

# Test features: written without a header row and without the index
x_test.to_csv("data/x_test.csv", index=False, header=False)
test_data_path = S3Uploader.upload(
    local_path="data/x_test.csv",
    desired_s3_uri=s3_destination,
    sagemaker_session=sagemaker_session,
)

## Train the Model

In [None]:
# Define a training script
%%writefile train.py

import argparse
import joblib
import numpy as np
import os
import pandas as pd
from sklearn.neural_network import MLPClassifier

if __name__ == "__main__":
    # Training entry point: parse the arguments, fit the classifier on the
    # training CSV, and save the fitted model into the model directory.
    parser = argparse.ArgumentParser()

    # SageMaker-specific arguments. Their defaults are read from the environment
    # variables SM_OUTPUT_DATA_DIR, SM_MODEL_DIR and SM_CHANNEL_TRAIN.
    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    args = parser.parse_args()

    # Read the training data from train_data.csv inside the training directory
    train_data = pd.read_csv(os.path.join(args.train, "train_data.csv"))

    # The labels are in the "truth" column; every column after the first one is
    # a feature. The slice is open-ended (not bounded by the row count) so no
    # feature is dropped when the file has fewer rows than columns.
    train_y = train_data["truth"]
    train_X = train_data[train_data.columns[1:]]

    # Use scikit-learn's MLP Classifier to train the model.
    classifier = MLPClassifier(random_state=1, max_iter=500).fit(train_X, train_y)

    # Save the fitted classifier; model_fn loads it back under the same file name
    joblib.dump(classifier, os.path.join(args.model_dir, "model.joblib"))

def model_fn(model_dir):
    """Load the fitted model stored in ``model_dir`` and return it.

    The file name used here ("model.joblib") has to match the name under
    which the main method serialized the model.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

def predict_fn(input_data, model):
    """Return the class probabilities that ``model`` predicts for ``input_data``.

    Only ``model.predict_proba`` is called; the predicted class itself is not
    part of the return value.

    Args:
        input_data: Samples, passed unchanged to ``model.predict_proba``.
        model: Fitted model exposing ``predict_proba``.

    Returns:
        numpy.ndarray: The result of ``model.predict_proba(input_data)``.
    """
    return np.array(model.predict_proba(input_data))

In [None]:
# Create the estimator
sklearn = SKLearn(
    entry_point="train.py",
    instance_type="ml.c4.xlarge",
    role=role,
    py_version="py3",
    framework_version="0.23-1",
    sagemaker_session=sagemaker_session,
)

In [None]:
# Kick off the training job
sklearn.fit({"train": training_data_path})

## Make Batch Predictions

In [None]:
# Set up a batch transformer for predictions
transformer = sklearn.transformer(
    instance_count=1, instance_type="ml.m4.xlarge", accept="text/csv"
)

In [None]:
# Run the batch transform on the uploaded test features, then block until it is done
batch_input_s3 = test_data_path
transformer.transform(
    batch_input_s3,
    content_type="text/csv",
    split_type="Line",
)
job_name = transformer.latest_transform_job.job_name
print("Waiting for transform job: " + job_name)
transformer.wait()

In [None]:
# Copy the transform results from their S3 output path into data/output
batch_output = transformer.output_path
print(f"Batch transform results saved to {batch_output}")
S3Downloader.download(
    sagemaker_session=sagemaker_session,
    s3_uri=batch_output,
    local_path="data/output",
)

In [None]:
# Preview the batch transform results: show the first lines of every file
# in data/output
!head data/output/*

In [None]:
# Read the transform output (it has no header row) for performance measurement
predictions = pandas.read_csv("data/output/x_test.csv.out", header=None)
predictions = predictions.reset_index(drop=True)
# Append the known labels as the last column, next to the two prediction columns
truth_column = pandas.Series(y_test)
results = pandas.concat([predictions, truth_column], axis=1)
results.columns = ["pred_0", "pred_1", "true"]
# Store the labels as integers
results["true"] = results["true"].astype(int)

In [None]:
# Compute the ROC curve and the area under it from the "pred_1" column
fpr, tpr, threshold = metrics.roc_curve(results["true"], results["pred_1"])
roc_auc = metrics.auc(fpr, tpr)

# Draw the ROC curve (blue) and the diagonal reference line (red, dashed)
auc_label = "AUC = %0.2f" % roc_auc
plt.plot(fpr, tpr, "b", label=auc_label)
plt.plot([0, 1], [0, 1], "r--")
plt.legend(loc="lower right")

# Both axes cover the full [0, 1] range
plt.xlim([0, 1])
plt.ylim([0, 1])

plt.title("Receiver Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## Closing

In this notebook we used SageMaker script mode to build, train, and deploy a sklearn model.