In [1]:
import datetime
import os
import time

import joblib
import numpy as np
from arthurai import ArthurAI, ModelType, InputType, Stage

In [2]:
# Add the sibling "../src" directory to the import search path before importing
# the local helpers (presumably where model_utils lives -- confirm against the repo layout).
import sys
sys.path.append("../src")
from model_utils import transformations, load_datasets

In this guide, we'll use the credit dataset (and a pre-trained model) to onboard a new model to the Arthur platform. We'll walk through registering the model using a sample of the training data. 

#### Set up connection
Supply your API key below to authenticate with the platform.

In [3]:
# Build the connection settings. The API key is read from the ARTHUR_API_KEY
# environment variable so a real secret never has to be typed into (and
# committed with) the notebook; the original "..." placeholder is kept as the
# fallback when the variable is not set.
config = {
    "url": "dev.arthur.ai",
    "access_key": os.environ.get("ARTHUR_API_KEY", "..."),
}
connection = ArthurAI(config)

## Create Model

We'll instantiate a model object with a small amount of metadata about the model input and output types. Then, we'll use a sample of the training data to register the full data schema for this Tabular model.

In [9]:
# Minimal metadata for the new model: its display name plus the input and
# output types. The data schema itself is registered in a later step.
model_metadata = {
    "name": "CreditRiskModel_test_v1.0.2",
    "input_type": InputType.Tabular,
    "model_type": ModelType.Multiclass,
}
arthur_model = connection.model(**model_metadata)

In [10]:
# load_datasets returns two pairs, unpacked here as the train split followed by
# the test split; each pair is then separated into its X and Y parts.
train_split, test_split = load_datasets("../fixtures/datasets/credit_card_default.csv")
X_train, Y_train = train_split
X_test, Y_test = test_split

In [11]:
# Peek at the first five training rows to get a feel for the feature columns.
X_train.head(n=5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
5537,150000,1,2,1,29,0,0,0,0,0,...,81085,75300,77276,64065,30151,40104,10013,5083,1819,4793
2439,120000,2,3,1,48,-2,-2,-2,-2,-2,...,506,2097,758,2005,7890,506,2097,758,2005,360
20130,80000,2,2,1,43,2,0,0,0,0,...,71846,72372,73475,74842,3100,3600,2720,2400,2224,3990
17804,170000,1,2,2,27,0,0,0,0,0,...,134212,133450,127020,123490,5000,5508,5915,3000,4000,4000
22901,30000,2,2,1,43,0,0,0,2,2,...,17210,16636,17908,17482,1240,3300,0,1548,0,637


In [12]:
# Register the schema from the training data: the feature frame describes the
# model's pipeline inputs and the label frame describes the ground truth.
schema_sources = (
    (X_train, Stage.ModelPipelineInput),
    (Y_train, Stage.GroundTruth),
)
for frame, stage in schema_sources:
    arthur_model.from_dataframe(frame, stage)

# Mark label value 1 as the positive class.
arthur_model.set_positive_class(1)

Before saving, you can review a model to make sure everything is correct.

In [13]:
# Show the registered attributes (name, stage, data type, categorical and
# uniqueness flags) so the schema can be checked by eye before saving.
arthur_model.review_model()

name       stage                     data_type         categorical  is_unique  
LIMIT_BAL  Stage.ModelPipelineInput  DataType.Integer  False        True       
SEX        Stage.ModelPipelineInput  DataType.Integer  True         False      
EDUCATION  Stage.ModelPipelineInput  DataType.Integer  True         False      
MARRIAGE   Stage.ModelPipelineInput  DataType.Integer  True         False      
AGE        Stage.ModelPipelineInput  DataType.Integer  False        True       
PAY_0      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_2      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_3      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_4      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_5      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_6      Stage.ModelPipelineInput  DataType.Integer  True         False      
BILL_AMT1  Stage.ModelPipelineInput  Dat

In [14]:
# Save the model definition. As the cell's last expression, the value returned
# by save() is displayed below (presumably the new model's id -- confirm in the SDK docs).
arthur_model.save()

'ab7bb045-7ee8-44b6-aed5-515df71b085c'

## Sending Inferences

Load the test data and the trained model. Let's familiarize ourselves with the data and the model.


In [15]:
# Load the pre-trained classifier from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model artifacts that come from a trusted source.
sk_model = joblib.load("../fixtures/serialized_models/credit_model.pkl")

# Keep the shape as the cell's final expression: a notebook only displays the
# last expression, so placing it before the assignment hid the result.
X_test.shape

In [16]:
# Display the loaded estimator's repr to inspect its type and hyperparameters.
sk_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
# Sanity-check the model on a single training row (a 1-row frame, not a Series).
# NOTE(review): this call used to wrap the row in `pipeline(...)`, a name this
# notebook never defines or imports, so it raised NameError on a fresh kernel.
# The row is now passed to the model directly, exactly as the inference loop
# does. Confirm that no preprocessing step is required before prediction.
sk_model.predict_proba(X_train.iloc[0:1, :])

array([[0.95342881, 0.04657119]])

To send inferences, we'll iterate through datapoints in a test set and send telemetry to Arthur.

In [19]:
# Replay the test set against Arthur one record at a time: score the record
# with the local model, then send the inputs, prediction and ground truth.
n_records = X_test.shape[0]
for row_idx in range(n_records):
    # Slice (rather than index) so the record stays a one-row frame.
    record_frame = X_test.iloc[row_idx:row_idx + 1, :]
    # Row 0, column 1 of the probability matrix (presumably the score for the
    # positive class, label 1 -- confirm against sk_model.classes_).
    positive_score = sk_model.predict_proba(record_frame)[0, 1]
    label = Y_test.iloc[row_idx]
    # Random integer below 1e9, used as the external identifier of this inference.
    inference_id = str(np.random.randint(1e9))

    inference_payload = dict(
        inference_timestamp=datetime.datetime.utcnow(),
        external_id=inference_id,
        model_pipeline_input=record_frame.to_dict(orient='records')[0],
        predicted_value=arthur_model.binarize({1: positive_score}),
        ground_truth=arthur_model.one_hot_encode(label),
    )
    arthur_model.send_inference(**inference_payload)
    print("Sent inference with id {}".format(inference_id))

    # Pause for a random interval of under one millisecond between sends.
    time.sleep(0.001 * np.random.random())

Sent inference with id 861924787
Sent inference with id 241618639
Sent inference with id 798351879
Sent inference with id 268050762
Sent inference with id 701609118
Sent inference with id 577778212
Sent inference with id 724451666
Sent inference with id 718151654
Sent inference with id 856295372
Sent inference with id 750345664
Sent inference with id 283238622
Sent inference with id 509613165
Sent inference with id 336781229
Sent inference with id 752954389
Sent inference with id 171957297
Sent inference with id 544496651
Sent inference with id 170226032
Sent inference with id 525421810
Sent inference with id 9709155
Sent inference with id 576639986
Sent inference with id 916534499
Sent inference with id 306196793
Sent inference with id 321197544
Sent inference with id 936889645
Sent inference with id 173484457
Sent inference with id 52535056
Sent inference with id 995913014
Sent inference with id 139332598
Sent inference with id 596832383
Sent inference with id 677341944
Sent inferenc

KeyboardInterrupt: 