In [None]:
# follows tutorial found here
# https://cloud.google.com/bigquery/docs/making-predictions-with-sklearn-models-in-onnx-format

In [None]:
# other important gifts
# https://onnx.ai/sklearn-onnx/api_summary.html

# Installs

In [None]:
! pip install xgboost -U -q --user
! pip install skl2onnx -U -q --user

# Setup

In [38]:
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]
REGION = "us-central1"

# raw source data
BUCKET_NAME = f"bkt-{REGION}-data"
BUCKET_PATH = f"gs://{BUCKET_NAME}"
USE_CASE = "bq_inference_engine"

# model
MODEL_NAME = "calibration_model"

# BQ
BQ_DATASET = "ds_uscentral1"
BQ_TABLE = "calibration_test_set"
BQ_MODEL_NAME = f"bq_{MODEL_NAME}_pipeline"

# Train an XGBoost classification model with an scikit-learn calibration model

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
import pandas as pd
import joblib
import json

In [30]:
# Load breast cancer dataset
data = datasets.load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 

# Initialize and fit XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict probabilities for the test set
predicted_probabilities = xgb_model.predict_proba(X_test)[:, 1]

# Print Brier score before calibration
print('Brier score before calibration:', brier_score_loss(y_test, predicted_probabilities))

# Platt Scaling / Logistic calibration
x_predicted_probabilities = predicted_probabilities.reshape(-1, 1)
pipe = Pipeline([ ('lr', LogisticRegression() )])
pipe.fit(x_predicted_probabilities, y_test)

# Calibrated probabilities
calibrated_probs = pipe.predict_proba( x_predicted_probabilities )[:, 1]

# Print Brier score after calibration
print('Brier score after calibration:', brier_score_loss(y_test, calibrated_probs))

Brier score before calibration: 0.017434884531435425
Brier score after calibration: 0.020933944128659076




In [31]:
# Save the predicted probabilities and true labels as a new line delimited json
test_set = pd.DataFrame({'predicted_probabilities': predicted_probabilities, 'y_test': y_test})

# write out
calibration_test_set_name = f"{BQ_TABLE}.json"
test_set.to_json(calibration_test_set_name, orient='records', lines=True)

# save to GCS
calibration_test_set_uri = f"{BUCKET_PATH}/{USE_CASE}/{calibration_test_set_name}"
! gsutil cp {calibration_test_set_name} {calibration_test_set_uri}

Copying file://calibration_test_set.json [Content-Type=application/json]...
/ [1 files][  8.7 KiB/  8.7 KiB]                                                
Operation completed over 1 objects/8.7 KiB.                                      


In [32]:
# data to load into BigQuery
! head -n 2 calibration_test_set.json

{"predicted_probabilities":0.9928037524,"y_test":1}
{"predicted_probabilities":0.0002118868,"y_test":0}


In [37]:
def load_to_bq(PROJECT_ID, REGION, BQ_DATASET, BQ_TABLE, GCS_URI):
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client(location=REGION, project=PROJECT_ID)

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = f"{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}"

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON, )

    load_job = client.load_table_from_uri(
        GCS_URI,
        table_id,
        location=REGION,
        job_config=job_config, )

    assert load_job.job_type == "load"

    load_job.result()  # Waits for the job to complete.
    print('Job finished.')

    assert load_job.state == "DONE"
    destination_table = client.get_table(table_id)
    print('Loaded {} rows.'.format(destination_table.num_rows))
    
# Load to BQ
load_to_bq(PROJECT_ID, REGION, BQ_DATASET, BQ_TABLE, calibration_test_set_uri)

Job finished.
Loaded 171 rows.


# Convert the model into ONNX format and save

In [33]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [39]:
# Disable zipmap as it is not supported in BigQuery ML.
options = {id(pipe): {'zipmap': False}}

# Define input features. scikit-learn does not store information about the
# training dataset. It is not always possible to retrieve the number of features
# or their types. That's why the function needs another argument called initial_types. Example

# define initial types
initial_types = [('predicted_probabilities', FloatTensorType([None, 1]))]

# Convert the model.
model_onnx = convert_sklearn(
   pipe, MODEL_NAME, initial_types=initial_types, options=options
    , target_opset=17 # if not set, uses 18 which is unsupported
)

# Save the calibration model
calibration_model_name = f"{MODEL_NAME}.onnx"
with open(calibration_model_name, 'wb') as f:
    f.write(model_onnx.SerializeToString())

# Upload the ONNX model to Cloud Storage

In [40]:
calibration_model_uri = f"{BUCKET_PATH}/{USE_CASE}/{calibration_model_name}"
! gsutil cp {calibration_model_name} {calibration_model_uri}

Copying file://calibration_model.onnx [Content-Type=application/octet-stream]...
/ [1 files][  493.0 B/  493.0 B]                                                
Operation completed over 1 objects/493.0 B.                                      


# Import the ONNX model into BigQuery

In [41]:
def create_inference_engine_model(PROJECT_ID, REGION, BQ_DATASET, BQ_MODEL_NAME, GCS_URI):
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client(location=REGION, project=PROJECT_ID)

    # Create Remote Model In BigQuery
    query = f"""
    CREATE OR REPLACE MODEL `{PROJECT_ID}.{BQ_DATASET}.{BQ_MODEL_NAME}`
        OPTIONS (MODEL_TYPE='ONNX', MODEL_PATH='{GCS_URI}')
    """
    job = client.query(query = query)
    job.result()
    job.state
    
create_inference_engine_model(PROJECT_ID, REGION, BQ_DATASET, BQ_MODEL_NAME, calibration_model_uri)

# Make predictions with the imported ONNX model

In [42]:
def perform_inference_calibration_model(PROJECT_ID, REGION, BQ_DATASET, BQ_MODEL_NAME, BQ_TABLE):
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client(location=REGION, project=PROJECT_ID)

    # Create Remote Model In BigQuery
    query = f"""
    SELECT *
    FROM ML.PREDICT(MODEL {BQ_DATASET}.{BQ_MODEL_NAME},
     (
      SELECT * FROM {PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}
     )
    )
    """
    job = client.query(query = query)
    df =job.to_dataframe()
    return df
    
df = perform_inference_calibration_model(PROJECT_ID, REGION, BQ_DATASET, BQ_MODEL_NAME, BQ_TABLE)

In [43]:
df

Unnamed: 0,label,probabilities,y_test,predicted_probabilities
0,0,"[0.9070160984992981, 0.0929839015007019]",0,0.000212
1,0,"[0.9067691564559937, 0.09323084354400635]",0,0.000785
2,0,"[0.9069318771362305, 0.09306815266609192]",0,0.000408
3,0,"[0.9068508148193359, 0.09314921498298645]",0,0.000596
4,0,"[0.8072217702865601, 0.19277822971343994]",0,0.166007
...,...,...,...,...
166,1,"[0.05622914433479309, 0.9437708854675293]",1,0.999710
167,1,"[0.056327998638153076, 0.9436720013618469]",1,0.999345
168,1,"[0.08223989605903625, 0.9177601337432861]",1,0.919692
169,1,"[0.056580156087875366, 0.9434198141098022]",1,0.998417
