In [1]:
!pip3 install google-cloud-aiplatform --user
!pip3 install pyarrow==11.0.0 --user
!pip3 install --upgrade google-cloud-bigquery --user
!pip3 install --upgrade google-cloud-bigquery-storage --user
!pip3 install --upgrade google-cloud-storage --user
!pip install db-dtypes



In [2]:
# Retrieve and set PROJECT_ID and REGION environment variables.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
BQ_LOCATION = 'US'
REGION = 'us-west3'
MODEL_NAME

In [3]:
from google.cloud import bigquery
from google.cloud import aiplatform as vertexai
import numpy as np
import pandas as pd

In [4]:
GCS_BUCKET = f"{PROJECT_ID}-lukes_capstone"
#Make Bucket command, only needs to be run once
# !gsutil mb -l $REGION gs://$GCS_BUCKET     #Make Bucket command, only needs to be run once

In [5]:
BQ_DATASET = f"{PROJECT_ID}:luke_cap_data"
#Make Dataset command, only needs to be run once
# !bq mk --location={BQ_LOCATION} --dataset {BQ_DATASET}

In [6]:
vertexai.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{GCS_BUCKET}")

# Bring in features from deployed model for mapping

#### MAP ENCODINGS Learned by Model to user input features for Online Predictions

In [94]:
%%bigquery df_cat_features --project $PROJECT_ID

SELECT * 
FROM luke_cap_data.Model2_data

Query is running:   0%|          |

Downloading:   0%|          |

In [95]:
for col in df_cat_features.columns:
    print(col,":",df_cat_features[col].dtype)

User_session : object
source : object
medium : object
browser : object
os : object
deviceType : object
country : object
region : object
metro : object
city : object
domain : object
hitType : object
pageviews : Int64
timeOnSite : Int64
hitNumber : Int64
time : Int64
hour : Int64
Target : Int64
data_split : object


In [96]:
CAT_FEATURES = [col for col in df_cat_features.columns if (df_cat_features[col].dtype==object and col!= "User_session" and col!= "data_split")]
CAT_FEATURES

['source',
 'medium',
 'browser',
 'os',
 'deviceType',
 'country',
 'region',
 'metro',
 'city',
 'domain',
 'hitType']

In [97]:
from sklearn.preprocessing import OrdinalEncoder
#MAYBE OTHER ENCODERS AS WELL

In [127]:
CATEGORICAL_FEATURES = []
    
for idx, feature in enumerate(CAT_FEATURES):
    feature_vocab_file = f"gs://{GCS_BUCKET}/{MODEL_NAME}/assets/{idx}_categorical_label.txt"
    CATEGORICAL_FEATURES.append(feature_vocab_file)
print(CATEGORICAL_FEATURES)

['gs://york-bb-cohort-lukes_capstone/XGB_2/assets/0_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/1_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/2_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/3_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/4_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/5_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/6_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/7_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/8_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/9_categorical_label.txt', 'gs://york-bb-cohort-lukes_capstone/XGB_2/assets/10_categorical_label.txt']


In [128]:
 # Build categorical feature encoders for mapping text to integers for XGBoost inference. 
 #    Args:
 #      cat_feature_list (list): List of string feature names.
 #      gcs_bucket (str): A string path to your Google Cloud Storage bucket.
 #      model_name (str): A string model directory in GCS where your BQML model was exported to.
 #      na_value (str): default is 'Unknown'. String value to replace any vocab NaN values prior to encoding.
 #    Returns:
 #      feature_encoders (dict): A dictionary containing OrdinalEncoder objects for integerizing 
 #        categorical features that has the format [feature] = feature encoder.
    

feature_encoders = {}
def feature_map(feature_list):    
    for feat in CATEGORICAL_FEATURES:
        for f in CAT_FEATURES:
            feature_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
            # feature_vocab_file = f"gs://{gcs_bucket}/{model_name}/assets/{idx}_categorical_label.txt"
            feature_vocab_df = pd.read_csv(feat, delimiter = "\t", header=None).fillna('Unknown')
            feature_encoder.fit(feature_vocab_df.values)
            feature_encoders[f] = feature_encoder
    return feature_encoders  
    # return feature_encoders

{'source': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'medium': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'browser': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'os': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'deviceType': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'country': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'region': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'metro': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'city': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'domain': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
 'hitType': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)}

In [129]:
    # Transform instances to numerical values for inference.
    # Args:
    #   instances (list[dict]): A list of feature dictionaries with the format feature: value. 
    #   cat_feature_list (list): A list of string feature names.
    #   feature_encoders (dict): A dictionary with the format feature: feature_encoder.
    # Returns:
    #   transformed_instances (list[list]): A list of lists containing numerical feature values needed
    #     for Vertex XGBoost inference.
def preprocess_xgboost(instances, cat_feature_list, feature_encoders):    
    transformed_instances = []
    for instance in instances:
        for feature in cat_feature_list:
            feature_int = feature_encoders[feature].transform([[instance[feature]]]).item()
            instance[feature] = feature_int
            instance_list = list(instance.values())
        transformed_instances.append(instance_list)
    return transformed_instances

In [130]:
%%bigquery test_df --project $PROJECT_ID 

SELECT* EXCEPT (User_session, Target, data_split)
FROM luke_cap_data.Model2_data
WHERE data_split="TEST"
LIMIT 3;

Query is running:   0%|          |

Downloading:   0%|          |

In [131]:
# Convert dataframe records to feature dictionaries for preprocessing by feature name.
test_instances = test_df.astype(str).to_dict(orient='records')

In [132]:
# Apply preprocessing to transform categorical features and return numerical instances for prediction.
transformed_test_instances = preprocess_xgboost(test_instances, CAT_FEATURES, feature_encoders)

In [133]:
# Generate predictions from model deployed to Vertex AI Endpoint.
predictions = endpoint.predict(instances=transformed_test_instances)

In [134]:
for idx, prediction in enumerate(predictions.predictions):
    # Class labels [1,0] retrieved from model_metadata.json in GCS model dir.
    # BQML binary classification default is 0.5 with above "Churn" and below "Not Churn".
    add2cart = "Added" if prediction[0] >= 0.5 else "Not Added"
    print(f"Prediction: Customer {idx} - {add2cart} {prediction}")
    print(test_df.iloc[idx].astype(str).to_json() + "\n")

Prediction: Customer 0 - Not Added [0.1981358230113983, 0.8018641471862793]
{"source":"analytics.google.com","medium":"referral","browser":"Firefox","os":"Linux","deviceType":"desktop","country":"India","region":"Maharashtra","metro":"(not set)","city":"Mumbai","domain":"unknown.unknown","hitType":"PAGE","pageviews":"2","timeOnSite":"14","hitNumber":"1","time":"0","hour":"23"}

Prediction: Customer 1 - Not Added [0.2100275903940201, 0.7899724245071411]
{"source":"google.co.jp","medium":"referral","browser":"Chrome","os":"Windows","deviceType":"desktop","country":"Japan","region":"Unknown","metro":"Unknown","city":"Unknown","domain":"(not set)","hitType":"PAGE","pageviews":"2","timeOnSite":"24","hitNumber":"1","time":"0","hour":"2"}

Prediction: Customer 2 - Not Added [0.3024739325046539, 0.6975260972976685]
{"source":"(direct)","medium":"(none)","browser":"Chrome","os":"Windows","deviceType":"desktop","country":"United States","region":"California","metro":"San Francisco-Oakland-San Jo