In [29]:
from arthurai import ArthurAI, ModelType, InputType, Stage
import numpy as np
import joblib
import datetime
import time

In [30]:
import sys
sys.path.append("..")
from model_utils import transformations, load_datasets

In this guide, we'll use the credit dataset (and a pre-trained model) to onboard a new model to the Arthur platform. We'll walk through registering the model using a sample of the training data. 

#### Set up connection
Supply your API Key below to autheticate with the platform.

In [21]:
URL = "dashboard.arthur.ai"
ACCESS_KEY = "eyJhbGciOiJIUzUxMiIsImlhdCI6MTU4ODYyMzM2NCwiZXhwIjoxNTg4NjIzOTY0fQ.eyJpZCI6MTh9.fktFXWMRyc95OMtip9OABGITRph4H__nUtbr8zLWi580RKPMg_zhxrmcKNIrmp-UmSjV-1F1pSnIgV7dzvz02w"

config = {"url": URL, "access_key":ACCESS_KEY}
connection = ArthurAI(config)

## Create Model

We'll instantiate a model object with a small amount of metadata about the model input and output types. Then, we'll use a sample of the training data to register the full data schema for this Tabular model.

In [22]:
arthur_model = connection.model(name="CreditRiskModel_test_v1.0.6",
                               input_type=InputType.Tabular,
                               model_type=ModelType.Multiclass)

In [23]:
(X_train, Y_train), (X_test, Y_test) = load_datasets("../fixtures/datasets/credit_card_default.csv")

In [6]:
Y_train.head()

26917    0
25425    0
16118    0
9508     0
12653    0
Name: default payment next month, dtype: int64

In [6]:
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
16763,20000,1,2,1,38,0,0,0,0,0,...,17012,17992,18350,18738,1565,1299,1279,637,663,1469
17133,20000,2,3,1,27,3,2,0,0,0,...,16892,17396,14017,0,0,1696,1200,22,0,0
22795,260000,2,2,1,33,0,0,0,0,0,...,135593,120909,102524,40157,4002,6067,10000,3000,40157,1466
21984,30000,2,3,2,28,-1,-1,-1,-1,-1,...,557,1299,600,450,25460,1000,1306,600,0,11961
11879,70000,2,2,2,24,0,0,-2,-1,-1,...,9660,6208,702,4320,1000,9660,6208,702,4320,1650


In [24]:
arthur_model.from_dataframe(X_train, Stage.ModelPipelineInput)
arthur_model.from_dataframe(Y_train, Stage.GroundTruth)
arthur_model.set_positive_class(1)

Before saving, you can review a model to make sure everything is correct.

In [25]:
arthur_model.review_model()

name       stage                     data_type         categorical  is_unique  
LIMIT_BAL  Stage.ModelPipelineInput  DataType.Integer  False        True       
SEX        Stage.ModelPipelineInput  DataType.Integer  True         False      
EDUCATION  Stage.ModelPipelineInput  DataType.Integer  True         False      
MARRIAGE   Stage.ModelPipelineInput  DataType.Integer  True         False      
AGE        Stage.ModelPipelineInput  DataType.Integer  False        True       
PAY_0      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_2      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_3      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_4      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_5      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_6      Stage.ModelPipelineInput  DataType.Integer  True         False      
BILL_AMT1  Stage.ModelPipelineInput  Dat

In [26]:
arthur_model.save()

'ee685753-5ba5-4e14-8ba9-5e08170540f7'

### Setting baseline data
Next, we'll use the training data to set a baseline refernce for calcuating data drift. Note, this step is optional. If you don't upload a reference set, Arthur will use the first 5000 inferences to set the baseline.

In [None]:
arthur_model.set_reference_data(stage=Stage.ModelPipelineInput, data=X_train)

## Sending Inferences

Load test data and trained model. Ket's familiarize ourselves with the data and the model.


In [14]:
X_test.shape
sk_model = joblib.load("../fixtures/serialized_models/credit_model.pkl")

In [15]:
sk_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
sk_model.predict_proba(X_train.iloc[0:1, :])

array([[0.77384266, 0.22615734]])

To send inferences, we'll iterate through datapoints in a test set and send telemetry to Arthur.

In [17]:
for i in range(X_test.shape[0]):
    datarecord = X_test.iloc[i:i+1, :]
    prediction = sk_model.predict_proba((datarecord))[0, 1]
    ground_truth = Y_test.iloc[i]
    ext_id = str(np.random.randint(1e9))


    arthur_model.send_inference(
        inference_timestamp=datetime.datetime.utcnow(),
        external_id=ext_id,
        model_pipeline_input=datarecord.to_dict(orient='records')[0],
        predicted_value=arthur_model.binarize({1: prediction}),
        ground_truth=arthur_model.one_hot_encode(ground_truth)
    )
    print("Sent inference with id {}".format(ext_id))
    time.sleep(0.001 * np.random.random())

Sent inference with id 648229712
Sent inference with id 867370201
Sent inference with id 588977141
Sent inference with id 931286790
Sent inference with id 779680422
Sent inference with id 611010009
Sent inference with id 180097604
Sent inference with id 513619022
Sent inference with id 733646379


KeyboardInterrupt: 