In [32]:
import datetime
import os
import time
import uuid

import joblib
import numpy as np
from arthurai import ArthurAI, ModelType, InputType, Stage

In [5]:
import sys
# Extend the module search path with ../src before importing the project helpers
# (presumably where model_utils lives — confirm against the repo layout).
sys.path.append("../src")
from model_utils import transformations, load_datasets

In this guide, we'll use the credit dataset (and a pre-trained model) to onboard a new model to the Arthur platform. We'll walk through registering the model using a sample of the training data. 

#### Set up connection
Supply your API Key below to authenticate with the platform.

In [38]:
# Read the API key from the ARTHUR_API_KEY environment variable so a real key never
# has to be pasted into (and committed with) the notebook. When the variable is
# unset, the original "..." placeholder is used and must be replaced by hand.
config = {
    "url": "dev.arthur.ai",
    "access_key": os.environ.get("ARTHUR_API_KEY", "..."),
}
connection = ArthurAI(config)

## Create Model

We'll instantiate a model object with a small amount of metadata about the model input and output types. Then, we'll use a sample of the training data to register the full data schema for this Tabular model.

In [21]:
# Declare the model with only its name and input/model types; no column-level
# schema is supplied in this call.
arthur_model = connection.model(
    name="CreditRiskModel",
    input_type=InputType.Tabular,
    model_type=ModelType.Multiclass,
)

In [22]:
# load_datasets yields a (train, test) pair, each of which is itself an (X, Y) pair.
train_split, test_split = load_datasets("../fixtures/datasets/credit_card_default.csv")
X_train, Y_train = train_split
X_test, Y_test = test_split

In [43]:
# Preview the first few rows of the training features.
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
26301,130000,2,3,2,35,0,0,0,0,0,...,5771,8659,10503,11329,1000,1000,3000,2000,1000,1000
24009,30000,1,1,2,24,0,0,0,2,0,...,6214,3311,4430,906,1440,2259,0,1500,425,895
21057,160000,1,2,2,38,4,3,2,2,3,...,99911,103928,101540,99587,0,5500,6700,0,27,2800
4454,30000,2,1,2,22,0,0,0,2,3,...,11238,9453,9022,8237,1190,1858,800,0,0,500
23709,20000,2,1,2,54,0,0,0,0,0,...,18298,18761,17781,10674,1568,1737,1200,0,0,0


In [23]:
# Register the schema from samples: the transformed training features are recorded
# under the ModelPipelineInput stage and the training labels under GroundTruth.
pipeline_input_sample = transformations(X_train)
arthur_model.from_dataframe(pipeline_input_sample, Stage.ModelPipelineInput)
arthur_model.from_dataframe(Y_train, Stage.GroundTruth)

# Mark label 1 as the positive class.
arthur_model.set_positive_class(1)

Before saving, you can review a model to make sure everything is correct.

In [19]:
# Show the registered attributes (name, stage, data type, ...) for a final check before saving.
arthur_model.review_model()

name       stage                     data_type         categorical  is_unique  
LIMIT_BAL  Stage.ModelPipelineInput  DataType.Float    False        True       
SEX        Stage.ModelPipelineInput  DataType.Integer  True         False      
EDUCATION  Stage.ModelPipelineInput  DataType.Integer  True         False      
MARRIAGE   Stage.ModelPipelineInput  DataType.Integer  True         False      
AGE        Stage.ModelPipelineInput  DataType.Integer  False        True       
PAY_0      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_2      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_3      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_4      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_5      Stage.ModelPipelineInput  DataType.Integer  True         False      
PAY_6      Stage.ModelPipelineInput  DataType.Integer  True         False      
BILL_AMT1  Stage.ModelPipelineInput  Dat

In [24]:
# Save the model; the value returned by save() is shown as the cell output
# (presumably the identifier assigned to the new model — confirm with the SDK docs).
arthur_model.save()

'424cd60f-a1c5-4b92-8e78-300d8a30e9bb'

## Sending Inferences

Load test data and trained model. Let's familiarize ourselves with the data and the model.


In [27]:
# Load the pre-trained classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
sk_model = joblib.load("../fixtures/serialized_models/credit_model.pkl")

# Keep the shape as the cell's final expression so the notebook actually displays it;
# as a non-final bare expression its value was silently discarded.
X_test.shape

In [42]:
# Display the loaded estimator and its hyperparameters.
sk_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
# Sanity check: per-class probabilities for the first training row.
# NOTE(review): the row is passed to the model without transformations(), whereas the
# schema registration and the inference loop in this notebook use transformed
# features — confirm which form the model was trained on.
sk_model.predict_proba(X_train.iloc[0:1, :])

array([[0.87515184, 0.12484816]])

To send inferences, we'll iterate through datapoints in a test set and send telemetry to Arthur.

In [None]:
# Replay the test set one row at a time, sending each prediction together with its
# ground-truth label to Arthur.
for i in range(X_test.shape[0]):
    # Slice (rather than index) so a one-row, two-dimensional selection is kept.
    datarecord = transformations(X_test.iloc[i:i+1, :])
    # datarecord is already transformed: score exactly what is reported below as
    # model_pipeline_input instead of running transformations() a second time.
    # [0, 1] picks the second probability column of the single row.
    prediction = sk_model.predict_proba(datarecord)[0, 1]
    ground_truth = Y_test.iloc[i]
    # uuid4 avoids the collisions possible when drawing ids from only 1e9 random integers.
    ext_id = str(uuid.uuid4())

    arthur_model.send_inference(
        inference_timestamp=datetime.datetime.utcnow(),
        external_id=ext_id,
        model_pipeline_input=datarecord.to_dict(orient='records')[0],
        predicted_value=arthur_model.binarize({1: prediction}),
        ground_truth=arthur_model.one_hot_encode(ground_truth)
    )
    print("Sent inference with id {}".format(ext_id))
    # Random pause of under one millisecond between sends.
    time.sleep(0.001 * np.random.random())