In [2]:
import os
from datetime import datetime, timedelta

import joblib
import numpy as np
import pandas as pd
import pytz
from arthurai import ArthurAI
from arthurai.common.constants import InputType, OutputType, Stage, ValueType
from arthurai.core.attributes import AttributeCategory
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Set up connection to API

In [3]:
# Credentials are passed to the client via environment variables rather than being hardcoded here
# (presumably ARTHUR_ENDPOINT_URL / ARTHUR_API_KEY -- confirm against your arthurai SDK version)
connection = ArthurAI()



## Load data

In [4]:
# meps_data contains both the features (the X's) and the label (the Y) in a single frame
meps_data = pd.read_csv("data/meps_data.csv")

# quick summary of what meps_data looks like
meps_data.head()

Unnamed: 0,AGE,RACE,PCS42,MCS42,K6SUM42,UTILIZATION,REGION,SEX,MARRY,FTSTU,...,ACTLIM,SOCLIM,COGLIM,DFHEAR42,DFSEE42,ADSMOK42,PHQ242,EMPST,POVCAT,INSCOV
0,44,1,57.76,57.06,1,1,1,2,1,-1,...,2,2,2,2,2,2,0,1,5,1
1,43,1,59.11,54.1,3,0,1,1,1,-1,...,2,2,2,2,2,2,0,1,5,1
2,16,1,-1.0,-1.0,-1,0,1,1,5,-1,...,2,2,-1,2,2,-1,-1,1,5,1
3,13,1,-1.0,-1.0,-1,0,1,1,6,-1,...,2,2,-1,2,2,-1,-1,-1,5,1
4,66,0,22.45,53.13,0,1,1,1,3,-1,...,1,1,1,2,1,1,0,4,1,2


In [5]:
# hold out 5% of the rows as the test set; the fixed seed makes the split repeatable
meps_train, meps_test = train_test_split(meps_data, train_size=0.95, random_state=77)

# separate features from the label: neither the label itself nor `RACE` is fed to the model
non_feature_columns = ['UTILIZATION', 'RACE']
X_train = meps_train.drop(columns=non_feature_columns)
y_train = meps_train['UTILIZATION']
X_test = meps_test.drop(columns=non_feature_columns)
y_test = meps_test['UTILIZATION']

## Fit scikit-learn model

In [6]:
# fit model; the fixed seed makes the forest (and therefore the accuracy printed below
# and every probability computed from sk_model later on) reproducible across runs
sk_model = RandomForestClassifier(random_state=77)
sk_model.fit(X_train, y_train)

# save model, creating the target directory first so joblib.dump does not raise
# FileNotFoundError when ./saved_model does not exist yet
model_path = "./saved_model/skl_rf.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(sk_model, model_path)

# print test accuracy (fraction of correct predictions on the held-out rows, as a percentage)
test_accuracy = accuracy_score(y_test, sk_model.predict(X_test))
print(f"Overall Test Accuracy: {(100*test_accuracy):.1f}%")

Overall Test Accuracy: 86.0%


## Set up ArthurModel
Everything under this header is *necessary* for an `ArthurModel` to be created; additional functionality is not 
possible until after `model.save()` has been successfully called.

*Some context about this dataset:*

- label: `UTILIZATION`, where 1: >=10 visits, 0: <10 visits

- protected attribute: `RACE`, where 1: `White`, 0: `Non-White`

In [29]:
# uncomment the line below to retrieve a model object that you have already created
# arthur_model = connection.get_model('<YOUR MODEL ID>')

In [7]:
# Build the frame Arthur will see: the label column is renamed to its ground-truth name and
# the model's score for each training row (second column of predict_proba) is appended
meps_train = (
    meps_train
    .rename(columns={'UTILIZATION': "high_utilization"})
    .assign(pred_high_utilization=sk_model.predict_proba(X_train)[:, 1])
)
meps_train

Unnamed: 0,AGE,RACE,PCS42,MCS42,K6SUM42,high_utilization,REGION,SEX,MARRY,FTSTU,...,SOCLIM,COGLIM,DFHEAR42,DFSEE42,ADSMOK42,PHQ242,EMPST,POVCAT,INSCOV,pred_high_utilization
14230,10,0,-1.00,-1.00,-1,0,3,1,6,-1,...,2,-1,2,2,-1,-1,-1,4,2,0.000000
7611,25,0,42.80,62.26,0,0,4,2,5,-1,...,2,2,2,2,2,0,1,5,3,0.010000
2359,69,1,54.53,57.73,0,1,4,2,3,-1,...,2,2,2,1,2,0,4,3,2,0.740000
2311,68,1,41.59,65.38,0,1,2,1,5,-1,...,2,2,2,2,2,0,4,4,1,0.800000
15308,22,0,56.48,47.77,0,0,2,1,5,3,...,2,2,2,2,2,0,1,4,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283,55,0,-1.00,-1.00,-1,0,3,1,1,-1,...,2,2,2,2,-1,-1,1,4,2,0.000000
10196,60,0,56.15,57.16,0,0,3,2,5,-1,...,2,2,2,2,2,0,1,4,1,0.010000
11860,16,1,-1.00,-1.00,-1,0,1,2,5,-1,...,2,-1,2,2,-1,-1,4,4,1,0.324583
8799,75,0,41.04,66.81,0,0,3,2,1,-1,...,2,2,2,2,2,0,4,3,2,0.050000


In [9]:
# the timestamp suffix (to the second) gives each run of this cell a different partner_model_id
partner_model_id = f"MedicalUtilization_FG-{datetime.now().strftime('%Y%m%d%H%M%S')}"

# create the (not yet saved) model object: tabular inputs, multiclass output type
arthur_model = connection.model(
    partner_model_id=partner_model_id,
    display_name="MEPS Medical Utilization Explainability",
    input_type=InputType.Tabular,
    output_type=OutputType.Multiclass,
)



In [10]:
# Infer the model schema from the training frame:
#   - `high_utilization` is the ground-truth column
#   - `pred_high_utilization` is the prediction column, mapped to ground-truth value 1
#   - `RACE` is registered as a non-input column (the model was not trained on it)
arthur_model.build(
    meps_train,
    ground_truth_column="high_utilization",
    pred_to_ground_truth_map={"pred_high_utilization": 1},
    non_input_columns=['RACE'],
)


2022-07-20 13:26:38,598 - arthurai.core.models - INFO - Please review the inferred schema. If everything looks correct, lock in your model by calling arthur_model.save()


Unnamed: 0,name,stage,value_type,categorical,is_unique,categories,bins,range,monitor_for_bias
0,AGE,PIPELINE_INPUT,INTEGER,False,False,[],,"[0, 85]",False
1,PCS42,PIPELINE_INPUT,FLOAT,False,False,[],,"[-9.0, 70.51]",False
2,MCS42,PIPELINE_INPUT,FLOAT,False,False,[],,"[-9.0, 75.64]",False
3,K6SUM42,PIPELINE_INPUT,INTEGER,False,False,[],,"[-9, 24]",False
4,REGION,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False
5,SEX,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}]",,"[None, None]",False
6,MARRY,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4...",,"[None, None]",False
7,FTSTU,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: -1}]",,"[None, None]",False
8,ACTDTY,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False
9,HONRDC,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False


In [10]:
# switch on bias monitoring for the `RACE` attribute
race_attribute = arthur_model.get_attribute('RACE')
race_attribute.set(monitor_for_bias=True)

ArthurAttribute(name='RACE', value_type='INTEGER', stage='NON_INPUT_DATA', id=None, label=None, position=0, categorical=True, min_range=None, max_range=None, monitor_for_bias=True, categories=[AttributeCategory(value='0', label=None), AttributeCategory(value='1', label=None)], bins=None, is_unique=False, is_positive_predicted_attribute=False, attribute_link=None, gt_class_link=None)

In [11]:
# Check all the attributes loaded to the model. Note that even though you passed in your "real"
# training data, the model currently has no reference to the actual datapoints -- just the
# properties of the attributes. You will need to set reference data (for data drift detection) later.
arthur_model.review()

Unnamed: 0,name,stage,value_type,categorical,is_unique,categories,bins,range,monitor_for_bias
0,AGE,PIPELINE_INPUT,INTEGER,False,False,[],,"[0, 85]",False
1,PCS42,PIPELINE_INPUT,FLOAT,False,False,[],,"[-9.0, 70.51]",False
2,MCS42,PIPELINE_INPUT,FLOAT,False,False,[],,"[-9.0, 75.64]",False
3,K6SUM42,PIPELINE_INPUT,INTEGER,False,False,[],,"[-9, 24]",False
4,REGION,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False
5,SEX,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}]",,"[None, None]",False
6,MARRY,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4...",,"[None, None]",False
7,FTSTU,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: -1}]",,"[None, None]",False
8,ACTDTY,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False
9,HONRDC,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4}]",,"[None, None]",False


In [12]:
# Your model is not uploaded to Arthur until save() is called. The value it returns is
# written to a text file so it can be read back later.
model_id = arthur_model.save()
with open("fullguide_model_id.txt", "w") as id_file:
    id_file.write(model_id)

2022-07-20 13:27:05,478 - arthurai.core.data_service - INFO - Starting upload (0.303 MB in 1 files), depending on data size this may take a few minutes
2022-07-20 13:27:07,046 - arthurai.core.data_service - INFO - Upload completed: /var/folders/nz/nl73471j5cl3vy5mnh9108c40000gn/T/tmpotakvcez/3f7d1a36-dd3b-4fb7-9017-a89c33439d74-0.parquet


In [13]:
# you can fetch a model by ID. for example pull the last-created model:
# with open("fullguide_model_id.txt", "r") as f:
#     model_id = f.read()
# arthur_model = connection.get_model(model_id)

## Adding additional functionality

Add to the functionality of the `ArthurModel` that you've created by enabling explainability. 

### Enabling Explainability

In [14]:
import os

# abspath("") resolves to the current working directory, i.e. wherever this notebook is run from
project_dir = os.path.abspath("")

# Enable explainability for the saved model, passing the training frame, the project
# directory, its requirements file and the import path of the predict function.
# NOTE(review): "entrypoint" presumably refers to an entrypoint.py module in the project
# directory -- confirm it exists next to this notebook.
arthur_model.enable_explainability(
    df=meps_train,
    project_directory=project_dir,
    requirements_file="requirements.txt",
    user_predict_function_import_path="entrypoint",
    ignore_dirs=['./data'],
    streaming_explainability_enabled=True,
)

Ignoring folder: /Users/sarahostermeier/Code/arthur-sandbox/examples/example_projects/healthcare_utilization
Ignoring folder: /Users/sarahostermeier/Code/arthur-sandbox/examples/example_projects/healthcare_utilization/saved_model
Ignoring folder: /Users/sarahostermeier/Code/arthur-sandbox/examples/example_projects/healthcare_utilization/__pycache__
Ignoring folder: /Users/sarahostermeier/Code/arthur-sandbox/examples/example_projects/healthcare_utilization/.ipynb_checkpoints
Ignoring folder: /Users/sarahostermeier/Code/arthur-sandbox/examples/example_projects/healthcare_utilization/data


2022-07-20 13:27:14,461 - arthurai.explainability.arthur_explainer - INFO - Testing model predict() function on provided data
2022-07-20 13:27:14,812 - arthurai.explainability.arthur_explainer - INFO - Model predict() function test successful


'ok'

## Send inferences to Arthur

Finally we'll make some inferences on the test data and register these with Arthur. First we do the inference:

In [15]:
num_inferences = 350

# create prediction dataframe: a seeded sample of held-out rows (same seed as the train/test
# split) so that re-running the notebook sends the same inferences, with the label renamed
# to the ground-truth column name used when the model was built
pred_df = (
    meps_test
    .sample(num_inferences, random_state=77)
    .rename(columns={'UTILIZATION': 'high_utilization'})
)

# predict onto it, dropping the label and `RACE` so the model sees the same columns as in training
preds = sk_model.predict_proba(pred_df.drop(columns=['high_utilization', 'RACE']))
# keep only the second predict_proba column, matching the training-time prediction column
pred_df['pred_high_utilization'] = preds[:, 1]

And send the data to Arthur:

In [16]:
# optional metadata: one UTC timestamp per inference, evenly spaced over the last week
window_start = datetime.now(pytz.utc) - timedelta(days=7)
window_end = datetime.now(pytz.utc)
timestamps = pd.date_range(start=window_start, end=window_end, periods=num_inferences)

# send inferences to Arthur
arthur_model.send_inferences(pred_df, inference_timestamps=timestamps)

2022-07-20 13:28:05,057 - arthurai.core.models - INFO - 350 rows were missing ground_truth_timestamp fields, so the current time was populated
2022-07-20 13:28:05,058 - arthurai.core.models - INFO - 350 rows were missing partner_inference_id fields, so UUIDs were generated, see return values


{'counts': {'failure': 0, 'success': 350, 'total': 350},
 'results': [{'message': 'success',
   'status': 200,
   'partner_inference_id': 'ULJPu8EAgagzxx8gCNJJZ2'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'jVgVx8PmE9XxFHRU4ekD6c'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'VosCdf5Yy5GiBgBqNxhzsq'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'kCGpsbiHbqc8oTA586Gvkj'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'XLpRPJMcdqtNKFa6ABYQ3z'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'cAP7PBmMViv9FkLTh56Pfm'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'nQwXhwEmBqVWM6Z4yzff7X'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'WoYFiuCseBhfKj2YKi5QQZ'},
  {'message': 'success',
   'status': 200,
   'partner_inference_id': 'doF8MEEnBvWr6JtknQDpu2'},
  {'message': 'success',
   'status': 200,
   'partner_infe