In [44]:
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd

In [45]:
from arthurai import ArthurAI
from arthurai.client.apiv3 import InputType, OutputType, Stage, AttributeCategory, AttributeBin, ValueType

## Set up connection to API

In [46]:
# Connection settings are read from the environment so that no credential is
# stored in (or leaked through) the notebook file. Export ARTHUR_ACCESS_KEY
# (and optionally ARTHUR_URL) before starting the kernel.
# NOTE: the access key that used to be hardcoded in this cell has been exposed
# in the notebook history and should be revoked/rotated.
URL = os.environ.get("ARTHUR_URL", "v3.dev.arthur.ai")
ACCESS_KEY = os.environ.get("ARTHUR_ACCESS_KEY")
if not ACCESS_KEY:
    # Fail fast with an actionable message instead of an opaque auth error later.
    raise RuntimeError(
        "ARTHUR_ACCESS_KEY is not set; export your Arthur API access key "
        "as an environment variable before running this notebook."
    )

connection = ArthurAI(url=URL, access_key=ACCESS_KEY, client_version=3)

In [85]:
# Create a new, not-yet-saved model object on the connection: tabular inputs,
# classification outputs, with inferences delivered in batches.
model = connection.model(
    partner_model_id="MEPS_drift_2",
    is_batch=True,
    input_type=InputType.Tabular,
    output_type=OutputType.Multiclass,
)

# Alternative kept for reference: look up a model by its partner_model_id
# instead of creating a new one.
# model = connection.get_model('MEPS_drift_1', id_type='partner_model_id')

## Set up ArthurModel
Everything under this header is *necessary* for an `ArthurModel` to be created; additional functionality is not 
possible until after `model.save()` has been successfully called.

*Some context about this dataset:*

- label: `UTILIZATION`, where 1: ≥10 visits (high utilization), 0: <10 visits (low utilization)

- protected attribute: `RACE`, where 1: `White`, 0: `Non-White`

In [101]:
# all_data contains the X's (input attributes, including RACE) and the Y's
# (model outputs p_0 / p_1 and the ground truth label gt) for the training set
all_data = pd.read_parquet('fulldata_train.parquet')

In [87]:
# Peek at the last three column names of the raw frame.
all_data.columns[-3:].tolist()

In [88]:
# Register the training features on the model as ModelPipelineInput attributes.
# Everything except the sensitive attribute (RACE), the model outputs (p_0, p_1)
# and the label (gt) is treated as a feature.
# Attribute names may contain only letters, numbers, and underscores, and
# cannot begin with a number.
train_x = all_data.drop(['RACE', 'p_0', 'p_1', 'gt'], axis='columns')
model.from_dataframe(train_x, Stage.ModelPipelineInput)

In [89]:
# The sensitive attribute is not a model input, so it is registered separately
# as a single-column frame under the NonInputData stage.
sens_x = all_data['RACE'].to_frame(name='RACE')
model.from_dataframe(sens_x, Stage.NonInputData)

In [90]:
# Pair every predicted-output attribute (pred_<>_utilization) with the
# ground-truth attribute (gt_<>_utilization) that holds its true label.
pred_to_ground_truth_map = dict(
    pred_high_utilization="gt_high_utilization",
    pred_low_utilization="gt_low_utilization",
)

# Register the predicted and ground-truth attributes on the model;
# 'pred_high_utilization' is passed as the first (positive) prediction attribute.
model.add_binary_classifier_output_attributes('pred_high_utilization', pred_to_ground_truth_map)

{'pred_high_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7ffc9fb29ca0>,
 'gt_high_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7ffc9fad6a30>,
 'pred_low_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7ffc9fb29cd0>,
 'gt_low_utilization': <arthurai.client.apiv3.attributes.ArthurAttribute at 0x7ffc9fb35070>}

In [91]:
# Review every attribute registered on the model so far. Even though the "real"
# training data was passed in, the model currently holds only the properties of
# each attribute (stage, value type, categories, range) -- not the datapoints
# themselves. Reference data (needed for data drift detection) is set later.
model.review()

Unnamed: 0,name,stage,value_type,categorical,is_unique,categories,range,monitor_for_bias
0,gt_high_utilization,GROUND_TRUTH,INTEGER,True,False,"[{value: 0}, {value: 1}]","[None, None]",False
1,gt_low_utilization,GROUND_TRUTH,INTEGER,True,False,"[{value: 0}, {value: 1}]","[None, None]",False
2,RACE,NON_INPUT_DATA,FLOAT,False,False,[],"[0.0, 1.0]",False
3,AGE,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 85.0]",False
4,PCS42,PIPELINE_INPUT,FLOAT,False,False,[],"[-9.0, 62.56]",False
...,...,...,...,...,...,...,...,...
137,INSCOV1,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
138,INSCOV2,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
139,INSCOV3,PIPELINE_INPUT,FLOAT,False,False,[],"[0.0, 1.0]",False
140,pred_high_utilization,PREDICTED_VALUE,FLOAT,False,False,[],"[0, 1]",False


In [92]:
# The model exists only in this session until model.save() is called; the
# string displayed below is the same id that model.id shows at the end of
# the notebook.
model.save()
# NOTE(review): presumably update() is the call to use once the model has
# already been saved -- confirm against the SDK docs.
# model.update()

'c0c2a18d-ec87-4b8f-a9af-165543a0c4f4'

## Adding additional functionality

Setting reference data and monitoring for bias add functionality to the `ArthurModel` that you've created. 

### Setting reference data

Now that we've saved the model, we can set reference data.
The reference data DataFrame must have a column for each `ModelPipelineInput` attribute and each `NonInputData` attribute. Optionally, it 
can contain predicted value and ground truth columns; this will enable the calculation of data drift 
on output attributes. In this model, our ground truth columns are `gt_high_utilization` and `gt_low_utilization`,
and our predicted value columns are `pred_high_utilization` and `pred_low_utilization`. 

We need to rename and update the columns in `all_data` to match these. 

In [102]:
# The raw frame ends with ['p_0', 'p_1', 'gt']; reshape those three columns
# into the predicted-value / ground-truth attribute names registered on the model.

# One-hot encode the label into one integer column per class.
gt = (
    pd.get_dummies(all_data['gt'], prefix='gt', dtype='int')
    .rename(columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'})
)

# NOTE: this rebinds all_data, so the cell is not idempotent -- afterwards the
# frame no longer has 'p_0', 'p_1' or 'gt'. Reload the parquet file before
# re-running this cell or any earlier cell that reads those columns.
all_data = (
    all_data
    .rename(columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'})
    .drop(columns=['gt'])
    .merge(gt, left_index=True, right_index=True)
)

In [103]:
# Sanity check: the renamed prediction columns and the one-hot ground-truth
# columns should now be present with the expected dtypes.
all_data.dtypes

AGE                      float64
RACE                     float64
PCS42                    float64
MCS42                    float64
K6SUM42                  float64
                          ...   
INSCOV3                  float64
pred_low_utilization     float64
pred_high_utilization    float64
gt_low_utilization         int64
gt_high_utilization        int64
Length: 142, dtype: object

In [104]:
# Upload the prepared frame (inputs, RACE, predictions, ground truth) as the
# model's reference dataset.
model.set_reference_data(data=all_data) # note: need to explicitly specify the data= argument

{'counts': {'success': 161, 'failure': 0, 'total': 161}, 'failures': [[]]}

### Setting bias monitoring

In [105]:
# Flag the sensitive attribute RACE (registered under the NonInputData stage)
# for bias monitoring.
# NOTE(review): the model was already saved above -- confirm whether a
# model.update() call is needed for this flag to reach the Arthur backend.
race_attribute = model.get_attribute('RACE', stage=Stage.NonInputData)
race_attribute.monitor_for_bias = True

### Sending batches of inferences

We do not need to load/expose the actual predictive model to send inferences to the Arthur model (`model`). `x_shift` is a directory holding batched data. Each of the parquet files is a DataFrame with the following columns:

- sensitive attribute, `RACE`
- all attributes used for training
- `p_0`: the predicted probability of this example being in the 0 class - *given by your model*
- `p_1`: the predicted probability of this example being in the 1 class - *given by your model*
- `gt`: the ground truth label. 

In [106]:
%ls x_shift 

[0m[01;32mfulldata_0.parquet[0m*  [01;32mfulldata_3.parquet[0m*  [01;32mfulldata_6.parquet[0m*
[01;32mfulldata_1.parquet[0m*  [01;32mfulldata_4.parquet[0m*  [01;32mfulldata_7.parquet[0m*
[01;32mfulldata_2.parquet[0m*  [01;32mfulldata_5.parquet[0m*


##### Batch inference format
(DataFrame option)
Your batch inferences must contain the prediction columns, renamed to the predicted-value attribute names registered on the model (`pred_low_utilization`, `pred_high_utilization`);
if you don't have ground truth labels at this time, you don't need to send them now.
When sending batch inferences, you must add the following additional columns: 
- `batch_id` - string
- `partner_inference_id` - string, unique for each inference
- `inference_timestamp` - datetime.datetime

In [111]:
# Send the first four batches WITHOUT ground truth (labels can be added later).
for i in range(4):
    batch_id = str(i)
    alldata = pd.read_parquet(f'x_shift/fulldata_{i}.parquet')

    # Only the model's predicted probabilities are sent here, renamed to the
    # predicted-value attribute names registered on the model.
    tosend = alldata[['p_0', 'p_1']].rename(
        columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'}
    )

    # Scalars broadcast to every row; no need to build repeated lists.
    tosend['batch_id'] = batch_id
    # Each inference needs its OWN id. The previous code drew a single random
    # number and repeated it for the whole batch, so every row shared one id.
    # Deriving the id from the batch number and the row position makes it
    # unique per row and reproducible when ground truth is attached later.
    tosend['partner_inference_id'] = [f'{batch_id}_{row}' for row in range(len(tosend))]
    tosend['inference_timestamp'] = datetime.now()

    model.send_batch_inferences(data=tosend)

In [129]:
# Attach ground truth to the first four batches after the fact.
for i in range(4):
    batch_id = str(i)
    alldata = pd.read_parquet(f'x_shift/fulldata_{i}.parquet')

    # One-hot encode the label. dtype='int' matches the INTEGER ground-truth
    # attributes registered on the model and the encoding used for the
    # reference data (the pandas default dtype differs between versions).
    tosend = pd.get_dummies(alldata['gt'], prefix='gt', dtype='int').rename(
        columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'}
    )

    tosend['batch_id'] = batch_id
    # The ids must be reproducible so that each label can be associated with
    # the inference it belongs to: a freshly drawn random id (as before) can
    # never equal the id used when the inference was sent. They are therefore
    # derived from the batch number and row position, one distinct id per row.
    # The cell that sends the inferences must generate its ids the same way.
    tosend['partner_inference_id'] = [f'{batch_id}_{row}' for row in range(len(tosend))]
    tosend['inference_timestamp'] = datetime.now()

    model.send_batch_inferences(data=tosend)

In [131]:
# Send the remaining batches (files 4-7) with predictions AND ground truth together.
for file_num in range(4, 8):
    # Use the file number as the batch id. The previous code labelled these
    # batches "0".."3", colliding with the batches read from files 0-3.
    batch_id = str(file_num)
    alldata = pd.read_parquet(f'x_shift/fulldata_{file_num}.parquet')

    # Predicted probabilities, renamed to the registered predicted-value attributes.
    preds = alldata[['p_0', 'p_1']].rename(
        columns={'p_0': 'pred_low_utilization', 'p_1': 'pred_high_utilization'}
    )

    # One-hot ground truth; dtype='int' matches the INTEGER ground-truth
    # attributes and the encoding used for the reference data.
    gts = pd.get_dummies(alldata['gt'], prefix='gt', dtype='int').rename(
        columns={'gt_0.0': 'gt_low_utilization', 'gt_1.0': 'gt_high_utilization'}
    )

    # Both frames come from the same file, so they align row-for-row on the index.
    tosend = preds.merge(gts, left_index=True, right_index=True)

    tosend['batch_id'] = batch_id
    # One distinct, reproducible id per inference (the previous code repeated a
    # single random number for every row of the batch).
    tosend['partner_inference_id'] = [f'{batch_id}_{row}' for row in range(len(tosend))]
    tosend['inference_timestamp'] = datetime.now()

    model.send_batch_inferences(data=tosend)

In [132]:
# Display the model's id (the same string that model.save() returned above).
model.id

'c0c2a18d-ec87-4b8f-a9af-165543a0c4f4'