In [60]:
from opsml_artifacts import SnowflakeQueryRunner, DataCard, CardRegistry

In [2]:
# SnowflakeQueryRunner is a temporary wrapper around pyshipt SQL
# (needed for network issues in Vertex; see the opsml-pipelines docs).
query_runner = SnowflakeQueryRunner(on_vpn=True)

In [3]:
# query_to_dataframe executes a SQL file or raw SQL; data.sql is in the examples dir.
dataframe = query_runner.query_to_dataframe(sql_file="data.sql")

In [4]:
# Preview the first rows of the query result.
dataframe.head()

Unnamed: 0,TIME_BUNDLE_ID,LOCAL_CREATED_DATE,LOCAL_CREATED_AT,METRO_ID,BUNDLE_TYPE,NBR_ADDRESSES,NBR_ORDERS,DROP_OFF_TIME,EVAL_FLG,EVAL_OUTLIER,...,COS_DAY_YR,SIN_MTH,COS_MTH,SIN_WK_YR,COS_WK_YR,NG_ORDER_ID,METRO_X,METRO_Y,METRO_Z,APT_FLG
0,bcad5d7c-94c3-40a1-b326-9b853aa062ca,2022-04-26,2022-04-26 15:29:25.393948,79,TARP,1,1,2.46793,0,0,...,-0.413277,0.866026,-0.499998,0.885456,-0.464723,E_126706033,176.254764,-4740.574812,4252.708044,0
1,ffb282a6-960d-4e6d-b358-b01686ffb2a7,2022-05-01,2022-05-01 11:48:11.247692,209,TARP,1,1,3.108182,0,0,...,-0.490029,0.499998,-0.866026,0.885456,-0.464723,E_127411582,1385.224488,-4620.989763,4161.399732,0
2,87f75002-a99e-409c-badf-8ffe5606bbdd,2022-04-29,2022-04-29 15:04:14.340214,145,TARP,2,2,6.893772,0,0,...,-0.459731,0.866026,-0.499998,0.885456,-0.464723,,-2187.649703,-4657.255851,3756.83347,0
3,20b248b3-0b75-4f35-9f7d-b48b76b8e153,2022-04-30,2022-04-30 08:37:55.893871,198,TARP,2,2,9.134985,0,0,...,-0.47495,0.866026,-0.499998,0.885456,-0.464723,,1309.435295,-4586.262199,4223.886747,0
4,16c860b1-2bc8-4210-b379-80c479a22c39,2022-04-30,2022-04-30 13:54:02.871898,256,TARP,1,1,2.52999,0,0,...,-0.47495,0.866026,-0.499998,0.885456,-0.464723,E_127310775,-2676.967229,-4303.653229,3860.318165,0


In [5]:
# Columns carried into the DataCard: the model inputs plus the target column
# and the flag column that DATA_SPLITS filters on.
features = [
    "NBR_ADDRESSES", "NBR_ORDERS", "NBR_RX", "NBR_APT",
    "METRO_X", "METRO_Y", "METRO_Z",
    "APT_FLG",
    "DROP_OFF_TIME",  # target column (see DEPENDENT_VAR)
    "EVAL_FLG",  # flag column referenced by DATA_SPLITS
]
DEPENDENT_VAR = "DROP_OFF_TIME"

# DataCard attributes (see examples dir for more detailed information)
DATA_NAME, TEAM = "tarp_drop_off", "SPMS"
USER_EMAIL = "steven.forrester@shipt.com"

# One split per EVAL_FLG value: 0 -> "train", 1 -> "test".
DATA_SPLITS = [
    {"label": split_label, "column": "EVAL_FLG", "column_value": flag_value}
    for split_label, flag_value in (("train", 0), ("test", 1))
]

# Wrap the selected columns and their metadata in a DataCard.
data_card = DataCard(
    data=dataframe[features],
    name=DATA_NAME,
    team=TEAM,
    user_email=USER_EMAIL,
    data_splits=DATA_SPLITS,
    dependent_vars=[DEPENDENT_VAR],
)

# Register the card with the "data" registry; the log line printed below
# reports the version it was registered as.
data_registry = CardRegistry(registry_name="data")
data_registry.register_card(card=data_card)

{"level": "INFO", "message": "Table: tarp_drop_off registered as version 6", "timestamp": "2023-01-24T01:45:44.736720Z", "app_env": "staging", "host": null, "version": null}


In [6]:
from opsml_artifacts import CardRegistry

# List the cards in the "data" registry that match this team and name.
data_registry = CardRegistry(registry_name="data")
tarp_list = data_registry.list_cards(team="SPMS", name="tarp_drop_off")

In [7]:
# Print the listing as a markdown table, leaving out the feature_map,
# data_splits and drift_uri columns.
print(tarp_list.loc[:, ~tarp_list.columns.isin(["feature_map", "data_splits", "drift_uri"])].to_markdown())

|    | uid                              | date       |     timestamp | app_env   | name          | team   |   version | user_email                 | data_uri                                                                                                        | feature_descriptions   | data_type   | dependent_vars    |
|---:|:---------------------------------|:-----------|--------------:|:----------|:--------------|:-------|----------:|:---------------------------|:----------------------------------------------------------------------------------------------------------------|:-----------------------|:------------|:------------------|
|  0 | 0cb593c2c82d4a67876f3cd0988e208e | 2023-01-24 | 1674524706817 | staging   | tarp_drop_off | SPMS   |         6 | steven.forrester@shipt.com | gs://shipt-spms-stg-bucket/DATA_REGISTRTY/SPMS/tarp_drop_off/version-6/9efa1b4487b448b3ad1bab10f2fa4644.parquet |                        | DataFrame   | ['DROP_OFF_TIME'] |
|  1 | be3fc3c9ab874eb7915231d4c2a

In [8]:
# load_card can take a few arguments. Be sure to check the docstring.
loaded_card = data_registry.load_card(uid=data_card.uid)

In [9]:
# Print the first rows of the data attached to the loaded card as a markdown table.
print(loaded_card.data.head().to_markdown())

|    |   NBR_ADDRESSES |   NBR_ORDERS |   NBR_RX |   NBR_APT |   METRO_X |   METRO_Y |   METRO_Z |   APT_FLG |   DROP_OFF_TIME |   EVAL_FLG |
|---:|----------------:|-------------:|---------:|----------:|----------:|----------:|----------:|----------:|----------------:|-----------:|
|  0 |               1 |            1 |        0 |         0 |   176.255 |  -4740.57 |   4252.71 |         0 |         2.46793 |          0 |
|  1 |               1 |            1 |        0 |         0 |  1385.22  |  -4620.99 |   4161.4  |         0 |         3.10818 |          0 |
|  2 |               2 |            2 |        0 |         0 | -2187.65  |  -4657.26 |   3756.83 |         0 |         6.89377 |          0 |
|  3 |               2 |            2 |        0 |         0 |  1309.44  |  -4586.26 |   4223.89 |         0 |         9.13499 |          0 |
|  4 |               1 |            1 |        0 |         0 | -2676.97  |  -4303.65 |   3860.32 |         0 |         2.52999 |          0 |


In [10]:
from opsml_artifacts import ModelCard
from lightgbm import LGBMRegressor

# Get the data splits defined by the split logic (data_card.data_splits).
data_splits = data_card.split_data()

In [11]:
data_splits = data_card.split_data() # get the data splits defined by split logic (data_card.data_splits)

# Prepare train data
data_splits.train.pop("EVAL_FLG") # pop off eval flg
y_train = data_splits.train.pop("DROP_OFF_TIME") # get train target

# Prepare test data
data_splits.test.pop("EVAL_FLG") # pop off eval flg
y_test = data_splits.test.pop("DROP_OFF_TIME") # get test target

In [13]:
from opsml_artifacts import ModelCard
from lightgbm import LGBMRegressor

# Handle to the model registry.
model_registry = CardRegistry(registry_name="model")

# Fresh splits, as defined by the split logic in data_card.data_splits.
data_splits = data_card.split_data()

# Remove the eval flag from both partitions before training.
data_splits.train.pop("EVAL_FLG")
data_splits.test.pop("EVAL_FLG")

# Pull the target column out of each partition.
y_train = data_splits.train.pop("DROP_OFF_TIME")
y_test = data_splits.test.pop("DROP_OFF_TIME")

# Fit a LightGBM regressor with its default arguments; fit() returns the estimator.
lgb_model = LGBMRegressor().fit(data_splits.train, y_train)

model_card = ModelCard(
    trained_model=lgb_model,
    sample_input_data=data_splits.train[:1],
    name="tarp_lgb",
    team=TEAM,  # defined above
    user_email=USER_EMAIL,  # defined above
    data_card_uid=data_card.uid,  # this is required if you are planning on registering the model
)

{"level": "INFO", "message": "Registering lightgbm onnx converter", "timestamp": "2023-01-24T01:47:27.912424Z", "app_env": "staging", "host": null, "version": null}
{"level": "INFO", "message": "Validating converted onnx model", "timestamp": "2023-01-24T01:47:28.189532Z", "app_env": "staging", "host": null, "version": null}
{"level": "INFO", "message": "Onnx model validated", "timestamp": "2023-01-24T01:47:28.209281Z", "app_env": "staging", "host": null, "version": null}


In [14]:
# Register the model card with the "model" registry; the log line printed
# below reports the version it was registered as.
model_registry = CardRegistry(registry_name="model")
model_registry.register_card(card=model_card)

{"level": "INFO", "message": "Table: tarp_lgb registered as version 3", "timestamp": "2023-01-24T01:48:16.385555Z", "app_env": "staging", "host": null, "version": null}


In [15]:
# Reload the model card from the registry by uid, then fetch its model object.
# NOTE(review): this rebinds model_card to the loaded copy; the trained model
# is loaded separately via load_trained_model() further down.
model_card = model_registry.load_card(uid=model_card.uid)
onnx_model = model_card.model()

In [16]:
# Show the schema of the model's input signature (field names, types, required fields).
onnx_model.input_sig.schema()

{'title': 'Features',
 'type': 'object',
 'properties': {'NBR_ADDRESSES': {'title': 'Nbr Addresses', 'type': 'integer'},
  'NBR_ORDERS': {'title': 'Nbr Orders', 'type': 'integer'},
  'NBR_RX': {'title': 'Nbr Rx', 'type': 'integer'},
  'NBR_APT': {'title': 'Nbr Apt', 'type': 'integer'},
  'METRO_X': {'title': 'Metro X', 'type': 'number'},
  'METRO_Y': {'title': 'Metro Y', 'type': 'number'},
  'METRO_Z': {'title': 'Metro Z', 'type': 'number'},
  'APT_FLG': {'title': 'Apt Flg', 'type': 'integer'}},
 'required': ['NBR_ADDRESSES',
  'NBR_ORDERS',
  'NBR_RX',
  'NBR_APT',
  'METRO_X',
  'METRO_Y',
  'METRO_Z',
  'APT_FLG']}

In [17]:
model_card.load_trained_model() # load trained model if loading from registry

In [59]:
# numpy is used below (np.ravel) but is not imported anywhere else in the
# notebook; import it here so the cell works on a fresh kernel.
import numpy as np

# Take the first test row as a single {column: value} record.
record = data_splits.test[0:1].to_dict(orient='records')[0]

# if testing a model that was trained on a numpy array, the model will expect a dictionary with a single list
# record = {"data": list(np.ravel(data[:1]))}

# test the onnx model (flatten the prediction and take its first value)
onnx_pred = float(np.ravel(onnx_model.predict(record))[0])

# Compare to original model
orig_pred = float(onnx_model.predict_with_model(model_card.trained_model, record)[0])

print(f"Onnx: {round(onnx_pred,4)}", f"Lightgbm: {round(orig_pred,4)}")

Onnx: 5.1914 Lightgbm: 5.1914
