In [2]:
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

In [3]:
from arthurai import ArthurAI
from arthurai.client.apiv3 import InputType, OutputType, Stage, AttributeCategory, AttributeBin, ValueType

## Set up connection to API

In [4]:
URL = "app.arthur.ai"
ACCESS_KEY = ""

connection = ArthurAI(url=URL, access_key=ACCESS_KEY, client_version=3)

In [5]:
# model = connection.model(partner_model_id="MEPS_drift_1",
#                          input_type=InputType.Tabular,
#                          output_type=OutputType.Multiclass,
#                          is_batch=True)

model = connection.get_model('MEPS_drift_1', id_type='partner_model_id')

## Set up ArthurModel
Everything under this header is *necessary* for an `ArthurModel` to be created; additional functionality is not 
possible until after `model.save()` has been successfully called.

*Some context about this dataset:*

- label: `UTILIZATION`, where 1: >10 visits, 0: <10 visits

- protected attribute: `RACE`, where 1: `White`, 0: `Non-White`

In [6]:
# train_data_full contains the X's and the Y's
all_data = pd.read_parquet('fulldata_train.parquet')

In [7]:
all_data.columns.tolist()[-3:]

In [8]:
# add the attributes used for training to the model and set them to ModelPipelineInput
# note that attribute names need to contain only letters, numbers, and underscores, and cannot begin with a number
train_x = all_data.drop(columns=['RACE','p_0', 'p_1', 'gt'])
model.from_dataframe(train_x, Stage.ModelPipelineInput)

In [9]:
# add bias monitoring - this cannot be done after the model is saved
# to monitor for bias, 'RACE' must be an int (or take on discrete string values)
all_data['RACE'] = all_data['RACE'].astype(int)
sens_x = pd.DataFrame(data={'RACE': all_data['RACE']})
model.from_dataframe(sens_x, Stage.NonInputData)

model.get_attribute('RACE', stage=Stage.NonInputData).monitor_for_bias=True

In [10]:
# gt_<>_utilization will refer to the ground truth values (i.e. true labels), while
# pred_<>_utilization will refer to the predicted outputs of your model. 
pred_to_ground_truth_map = {
    "pred_high_utilization": "gt_high_utilization",
    "pred_low_utilization": "gt_low_utilization"
}

# add the ground truth and predicted attributes to the model
model.add_binary_classifier_output_attributes('pred_high_utilization', pred_to_ground_truth_map)

{'pred_high_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7f87b9d88040>,
 'gt_high_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7f87b9d455b0>,
 'pred_low_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7f87c411a190>,
 'gt_low_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7f87b9d98820>}

In [14]:
# check all the attributes loaded to the model. note that even though you passed in your "real" training
# data, currently the model has no reference to the actual datapoints -- just the properties of the 
# attributes. you will need to set reference data (for data drift detection) later
model.review()

Unnamed: 0,name,stage,value_type,categorical,is_unique,categories,range,monitor_for_bias
0,gt_low_utilization,GROUND_TRUTH,INTEGER,True,False,"[{value: 0}, {value: 1}]","[None, None]",False
1,gt_high_utilization,GROUND_TRUTH,INTEGER,True,False,"[{value: 0}, {value: 1}]","[None, None]",False
2,RACE,NON_INPUT_DATA,INTEGER,True,False,"[{value: 0}, {value: 1}]","[None, None]",True
3,PHQ24201,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
4,SEX2,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
...,...,...,...,...,...,...,...,...
137,OHRTDX2,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
138,PREGNT01,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
139,RTHLTH5,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
140,pred_low_utilization,PREDICTED_VALUE,FLOAT,False,False,[],"[0, 1]",False


In [13]:
# your model is not uploaded to Arthur until you call model.save()
# model.save()
# model.update()

Exception: An error occurred: {'error': 'cannot update model due to invalid attributes: attribute names must be unique for a specific model, AGE is defined more then once'}

## Adding additional functionality

Setting reference data and monitoring for bias adds to the functionality of the `ArthurModel` that you've created. 

### Setting reference data

Now that we've saved the model, we can set reference data.
The reference data df must have a column for each `ModelPipelineInput` and each `NonInput`. Optionally, it 
can contain predicted value and ground truth columns; this will enable the calculation of data drift 
on output attributes. In this model, our ground truth columns are `gt_high_utilization` and `gt_low_utilization`,
and our predicted value columns are `pred_high_utilization` and `pred_low_utilization`. 

We need to rename and update the columns in `all_data` to match these. 

In [15]:
# ['p_0', 'p_1', 'gt']
all_data = all_data.rename(columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'})
gt = pd.get_dummies(all_data['gt'], prefix='gt', dtype='int')
gt = gt.rename(columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'})
all_data = all_data.drop(columns=['gt'])
all_data = all_data.merge(gt, left_index=True, right_index=True)

In [16]:
all_data.dtypes

AGE                      float64
RACE                       int64
PCS42                    float64
MCS42                    float64
K6SUM42                  float64
                          ...   
INSCOV3                  float64
pred_low_utilization     float64
pred_high_utilization    float64
gt_low_utilization         int64
gt_high_utilization        int64
Length: 142, dtype: object

In [17]:
model.set_reference_data(data=all_data) # note: need to explicitly specify the data= argument

{'counts': {'success': 161, 'failure': 0, 'total': 161}, 'failures': [[]]}

### setting bias monitoring

### send batches of inferences

We do not need to load/expose the actual predictive model to send inferences to `arthur_model`. `x_shift` is a directory holding batched data. Each of the parquet files is a dataframe with the following columns:

- sensitive attribute, `RACE`
- all attributes used for training
- `p_0`: the predicted probability of this example being in the 0 class - *given by your model*
- `p_1`: the predicted probability of this example being in the 1 class - *given by your model*
- `gt`: the ground truth label. 

In [18]:
%ls x_shift 

[0m[01;32mfulldata_0.parquet[0m*  [01;32mfulldata_3.parquet[0m*  [01;32mfulldata_6.parquet[0m*
[01;32mfulldata_1.parquet[0m*  [01;32mfulldata_4.parquet[0m*  [01;32mfulldata_7.parquet[0m*
[01;32mfulldata_2.parquet[0m*  [01;32mfulldata_5.parquet[0m*


##### batch inference format
(df option)
Your batch inferences must have the dummied prediction columns that you created earlier; 
if you don't have ground truth labels at this time, you don't need to send them now.
When sending batch inferences, you must add the following additional columns: 
- `batch_id` - string
- `partner_inference_id`- string
- `inference_timestamp` - datetime.datetime

In [24]:
%ls -l x_shift

total 804
-rwxrwxrwx 1 jessica jessica 101767 Sep 29 13:15 [0m[01;32mfulldata_0.parquet[0m*
-rwxrwxrwx 1 jessica jessica 101252 Sep 29 13:15 [01;32mfulldata_1.parquet[0m*
-rwxrwxrwx 1 jessica jessica 100909 Sep 29 13:15 [01;32mfulldata_2.parquet[0m*
-rwxrwxrwx 1 jessica jessica 100660 Sep 29 13:15 [01;32mfulldata_3.parquet[0m*
-rwxrwxrwx 1 jessica jessica 101455 Sep 29 13:15 [01;32mfulldata_4.parquet[0m*
-rwxrwxrwx 1 jessica jessica 102370 Sep 29 13:15 [01;32mfulldata_5.parquet[0m*
-rwxrwxrwx 1 jessica jessica 102213 Sep 29 13:15 [01;32mfulldata_6.parquet[0m*
-rwxrwxrwx 1 jessica jessica 102689 Sep 29 13:15 [01;32mfulldata_7.parquet[0m*


In [31]:
for i in range(4): # sending inferences without ground truth
    alldata = pd.read_parquet('x_shift/fulldata_'+str(i) +'.parquet')
    tosend = alldata.rename(columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'})
    tosend['RACE'] = tosend['RACE'].astype(int)
    
    tosend['batch_id'] = [str(i)]*len(tosend)
    tosend['partner_inference_id'] = [str(np.random.randint(10000))]*len(tosend)
    tosend['inference_timestamp'] = [(datetime.now())]*len(tosend)
    
    model.send_batch_inferences(data=tosend)

In [32]:
for i in range(4): # adding ground truth later
    alldata = pd.read_parquet('x_shift/fulldata_'+str(i) +'.parquet')
    tosend = pd.get_dummies(alldata['gt'], prefix='gt')
    tosend = tosend.rename(columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'})

    tosend['batch_id'] = [str(i)]*len(tosend)
    tosend['partner_inference_id'] = [str(np.random.randint(10000))]*len(tosend)
    tosend['inference_timestamp'] = [(datetime.now())]*len(tosend)
    
    model.send_batch_inferences(data=tosend)

In [36]:
for i in range(4): # adding gt and infs at the same time
    alldata = pd.read_parquet('x_shift/fulldata_'+str(i+4) +'.parquet')
    tosend = alldata.rename(columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'})
    tosend['RACE'] = tosend['RACE'].astype(int)
    
    gts = pd.get_dummies(alldata['gt'], prefix='gt')
    gts = gts.rename(columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'})
    
    tosend = tosend.merge(gts, left_index=True, right_index=True)

    tosend['batch_id'] = [str(i+4)]*len(tosend)
    tosend['partner_inference_id'] = [str(np.random.randint(10000))]*len(tosend)
    tosend['inference_timestamp'] = [(datetime.now())]*len(tosend)
    
    model.send_batch_inferences(data=tosend)

In [37]:
tosend.head()

Unnamed: 0,AGE,RACE,PCS42,MCS42,K6SUM42,REGION=1,REGION=2,REGION=3,REGION=4,SEX=1,...,INSCOV=2,INSCOV=3,gt,pred_low_utilization,pred_high_utilization,gt_low_utilization,gt_high_utilization,batch_id,partner_inference_id,inference_timestamp
0,46.0,0,65.869469,40.166361,2.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.952468,0.047532,1,0,7,2158,2020-09-29 13:42:38.131723
1,11.0,0,5.196581,-3.431802,-1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.953807,0.046193,1,0,7,2158,2020-09-29 13:42:38.131723
2,58.0,0,57.969287,52.02732,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.861396,0.138604,1,0,7,2158,2020-09-29 13:42:38.131723
3,24.0,0,64.512119,56.755231,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.965449,0.034551,1,0,7,2158,2020-09-29 13:42:38.131723
4,5.0,1,5.3401,-1.430711,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.793208,0.206792,1,0,7,2158,2020-09-29 13:42:38.131723


In [34]:
model.id

'e674527e-4402-4c2a-9513-f06b579d49b1'

In [35]:
%pwd

'/mnt/c/Users/jessica/Documents/github/arthur/demo_client/notebooks/healthcare_shift'