# Cancer Detector

This is an example project using a computer vision model which predicts cancer in biopsy images.

In [1]:
import torchvision
import numpy as np
import pandas as pd
import random
import uuid
from datetime import datetime, timedelta
import pytz
import json
import boto3
import zipfile

# arthur imports
from arthurai import ArthurAI
from arthurai.common.constants import InputType, OutputType, Stage, ValueType, Enrichment

## Create Arthur Connection

In [2]:
# connect to Arthur
# UNCOMMENT the two keyword arguments below and enter your details;
# alternatively, leave them commented out and set the environment variable named on each line
arthur = ArthurAI(
    # url="https://app.arthur.ai",  # you can also pass this through the ARTHUR_ENDPOINT_URL environment variable
    # login="<YOUR_USERNAME_OR_EMAIL>",  # you can also pass this through the ARTHUR_LOGIN environment variable
)

## Load Data

In [3]:
import os

from botocore import UNSIGNED
from botocore.client import Config

# requests are sent unsigned, so no AWS credentials are needed to read the sample archives
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# make sure the local target folder exists so the downloads have somewhere to be written
os.makedirs('data', exist_ok=True)

# fetch the training and test archives into data/
for archive in ('train.zip', 'test.zip'):
    s3.download_file('s3-bucket-arthur-public', f'sandbox_cv_cancer_model/{archive}', f'data/{archive}')

In [4]:
# unpack both downloaded archives into the data/ folder
for archive_path in ('data/train.zip', 'data/test.zip'):
    with zipfile.ZipFile(archive_path, 'r') as zipped:
        zipped.extractall('data/')

In [5]:
# Reference set: the data the model was trained on.
# It keeps both the ground truth ("gt_tumor") and the predicted value ("tumor");
# the "gt_normal" and "normal" columns are dropped.
train_df = pd.read_csv('data/train_meta.csv').drop(columns=['gt_normal', 'normal'])
train_df

In [6]:
# Test data: contains no predictions yet.
# Predictions are generated later in this example and sent to Arthur;
# the "gt_normal" column is dropped.
test_df = pd.read_csv('data/test_meta.csv').drop(columns=['gt_normal'])
test_df

## Load Model

Our model has already been trained and saved. The functions `load_image()` and `predict()` handle all logic for getting predictions.

These functions are required for enabling explainability.

In [7]:
import sys
# put the cancer_model directory on the import path so its entrypoint module can be imported
sys.path.append('cancer_model')
# load_image and predict are the two functions used to get predictions from the saved model
from entrypoint import load_image, predict

In [8]:
# smoke-test load_image and predict on the first image of the training set
image_path = train_df['patient_image'].iloc[0]
image = load_image(image_path)
batch = np.array([image])  # wrap the single image in an outer array before predicting
predict(batch)

## Onboard Model to Arthur

In [9]:
# define the model schema

# pixel_height and pixel_width must match the image size the model expects.
# That may differ from the size of the "raw_image" sent with an inference;
# if the raw image is bigger, the load_image function used for explainability has to resize it.

# a to-the-second timestamp is appended to the partner model id
created_at = datetime.now().strftime('%Y%m%d%H%M%S')
model_meta = dict(
    partner_model_id=f"CancerDetector_FG-{created_at}",
    display_name="Cancer Detector",
    input_type=InputType.Image,
    output_type=OutputType.Multiclass,
    pixel_height=96,
    pixel_width=96,
)
model = arthur.model(**model_meta)


In [10]:
# build the model from the reference (training) data:
# "gt_tumor" is the ground truth, "tumor" the predicted attribute, and "center" is not a model input
model.build(
    train_df,
    ground_truth_column="gt_tumor",
    pred_to_ground_truth_map={"tumor": 1},
    positive_predicted_attr="tumor",
    non_input_columns=["center"],
)

In [11]:
# `center` identifies the medical center at which the biopsy was taken;
# explicitly set the categories of this inferred attribute
model.get_attribute("center").set(categories=[0, 1, 2, 3, 4, 5])

In [12]:
# ensure everything looks correct before the model is saved
model.review()

In [13]:
# save the model, then write the returned id to a local file so the model can be fetched again later
model_id = model.save()
with open("fullguide_model_id.txt", "w") as id_file:
    id_file.write(model_id)

In [None]:
# You can fetch a model by ID. For example, to pull the last-created model:
# with open("fullguide_model_id.txt", "r") as f:
#     model_id = f.read()
# model = arthur.get_model(model_id)

## Enable Explainability

To get explanations for inferences, you first must provide us with your model and functions to use it.

`project_directory` should contain your serialized model, predict file, requirements file, and any other files required for generating predictions.

For image models, you must provide a `load_image` function which takes in a path to an image, and returns the image in a `numpy` array, with any resizing or processing logic done.  
The `predict` function takes in a `numpy` array of processed image data and should wrap your model's `predict` function.

See `cancer_model/entrypoint.py` for example functions for this model.

In [14]:
import os
# absolute path of the cancer_model folder inside the current working directory
project_dir = os.path.join(os.getcwd(), "cancer_model")

# 'entrypoint' is the module (inside project_dir) that provides load_image and predict
model.enable_explainability(
    project_directory=project_dir,
    user_predict_function_import_path='entrypoint',
    streaming_explainability_enabled=True,
    explanation_algo='lime',
    explanation_nsamples=2000
)

## Send Inferences

Now we can go ahead and send some inference data.

In this example, ground truth is supplied together with the predicted value. However, you can also send only the inference data and upload the ground truth for those inferences at a later date.

In [15]:
data_source = test_df
num_to_send = 100
inferences = []

# mimic data sent over the last week: evenly spaced UTC timestamps ending now
now = datetime.now(pytz.utc)
timestamps = pd.date_range(start=now - timedelta(days=7),
                           end=now,
                           periods=num_to_send)

for j in range(num_to_send):
    # grab random test record
    # randrange excludes its upper bound, so i is always a valid row position
    # (randint includes both endpoints and could return len(data_source), which is out of range)
    i = random.randrange(len(data_source))
    record = data_source.iloc[i]
    # load image and predict
    image_path = record['patient_image']
    image = load_image(image_path)
    pred = predict(np.array([image]))
    # build inference data: image path, predicted value, ground truth and a backdated timestamp
    inf = {'patient_image': image_path,
           'tumor': pred.item(),
           'gt_tumor': record['gt_tumor'],
           'inference_timestamp': timestamps[j]}
    inferences.append(inf)
    print('Image Path: ', image_path)
    # default=str lets json render values it cannot serialize natively (e.g. the timestamp)
    print('Inference: ', json.dumps(inf, indent=2, default=str), '\n')

In [16]:
# send the list of inferences built above to Arthur
model.send_inferences(inferences)