## Continued from exploration.ipynb

In [1]:
from google.cloud import bigquery
from google.cloud import aiplatform as vertexai
import numpy as np
import pandas as pd
from google.cloud.bigquery import Client, QueryJobConfig
# BigQuery client built with no explicit arguments. PROJECT_ID is only defined
# in the next cell, so the project/credentials here presumably come from the
# environment's defaults — NOTE(review): confirm that matches PROJECT_ID.
client = Client()

In [2]:
# Project-level settings used by the rest of the notebook.
PROJECT_ID = "york-bb-cohort"
BQ_LOCATION = "US"
REGION = "us-east1"

# Cloud Storage bucket name; the bucket itself was created via the console.
GCS_BUCKET = "-".join([PROJECT_ID, "akane_bucket"])

# Dataset reference written in "project:dataset" form.
BQ_DATASET = ":".join([PROJECT_ID, "akane_bqds"])

### Create prototype training table

#### Other changes made to training set
* Re-added `sku`, as it seems important and could be used in place of the various pagePathLevel fields. To prevent duplicate rows, I only retrieved values for this field where the length of the product array was 1; every other hit gets the placeholder value 'none_or_many'. Hoping label encoding or something better is possible here.
* Encoded explicit label column: 1 whenever hits.eCommerceAction.action_type is 3, 0 otherwise
* Dropped all rows that are bounces to slightly improve the class imbalance; may not be significant
* Modified 'country', 'browser', 'source', 'operatingSystem', and 'networkDomain' to keep only their most common values (those with over 20,000 rows, or the 10 most common, whichever is fewer) and set all other values to 'other' to control dimensionality (assuming the model algorithm might one-hot encode these fields)

In [None]:
%%bigquery --project $PROJECT_ID

-- Builds the prototype training table: one row per hit, taken from the public
-- Google Analytics sample sessions whose table name matches ga_sessions_201703*.
-- The magic line relies on IPython "$name" expansion to pass the project id
-- defined in the configuration cell.
CREATE OR REPLACE TABLE akane_bqds.ga4_training_att2 AS (
  SELECT
    visitNumber,
    -- sku: the product's SKU when the hit carries exactly one product;
    -- hits with zero or several products get the placeholder 'none_or_many'
    -- so that unnesting products never duplicates a hit row.
    (
      SELECT
        CASE
          WHEN ARRAY_LENGTH(h.product) = 1 THEN MAX(productSku)
          ELSE 'none_or_many'
        END
      FROM UNNEST(h.product)
    ) AS sku,
    -- label: 1 when the hit's e-commerce action_type is 3, otherwise 0
    -- (action_type 3 is "add product(s) to cart" in the GA export schema).
    IF(CAST(h.eCommerceAction.action_type AS INT) = 3, 1, 0) AS label,
    visitStartTime,
    CAST(date AS INT) AS date,
    -- The five CASE expressions below keep a fixed list of frequent values
    -- and collapse every other value into 'other'.
    CASE
      WHEN geoNetwork.networkDomain IN ('(not set)', 'unknown.unknown', 'comcast.net', 'verizon.net', 'rr.com', 'comcastbusiness.net')
      THEN geoNetwork.networkDomain
      ELSE 'other'
    END AS networkDomain,
    CASE
      WHEN geoNetwork.country IN ('United States', 'Canada', 'India', 'United Kingdom', 'Germany', 'Taiwan', 'Japan', 'Australia', 'France', 'Spain')
      THEN geoNetwork.country
      ELSE 'other'
    END AS country,
    CASE
      WHEN device.operatingSystem IN ('Windows', 'Macintosh', 'Android', 'iOS', 'Chrome OS', 'Linux')
      THEN device.operatingSystem
      ELSE 'other'
    END AS operatingSystem,
    CASE
      WHEN device.browser IN ('Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Android Webview')
      THEN device.browser
      ELSE 'other'
    END AS browser,
    CASE
      WHEN trafficSource.source IN ('google', '(direct)', 'youtube.com', 'Partners')
      THEN trafficSource.source
      ELSE 'other'
    END AS source,
    channelGrouping,
    -- Hit-level fields.
    h.hitNumber,
    h.time,
    h.isEntrance,
    h.isExit,
    h.type,
    h.eCommerceAction.step,
    h.social.socialNetwork,
    h.social.hasSocialSourceReferral,
    h.contentGroup.contentGroup1,
    h.contentGroup.contentGroup2,
    h.contentGroup.contentGroup3,
    h.contentGroup.previousContentGroup1,
    h.contentGroup.previousContentGroup2,
    h.contentGroup.previousContentGroup3,
    h.contentGroup.previousContentGroup4,
    h.contentGroup.previousContentGroup5,
    -- Session-level fields.
    trafficSource.campaign,
    trafficSource.medium,
    totals.hits,
    totals.pageviews,
    totals.timeOnSite,
    totals.newVisits,
    device.isMobile,
    device.deviceCategory
  FROM `bigquery-public-data.google_analytics_sample.ga_sessions_201703*`,
    UNNEST(hits) AS h
  -- Keep only sessions that are not flagged as bounces.
  WHERE totals.bounces IS NULL
)

### Create BigQuery ML model

### Model 1: Boosted tree classifier

In [None]:
%%bigquery --project $PROJECT_ID

-- Model 1: boosted tree classifier trained on every column of the prototype
-- training table, with `label` as the target. Without the %%bigquery cell
-- magic above, the notebook kernel would try to run this SQL as Python.
CREATE OR REPLACE MODEL akane_bqds.ak_bqmodel1
OPTIONS(
  MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
  INPUT_LABEL_COLS=['label'],
  -- Label-encode categorical columns (this includes sku).
  CATEGORY_ENCODING_METHOD='LABEL_ENCODING',
  DATA_SPLIT_METHOD='AUTO_SPLIT',
  -- Hyperparameter tuning: 8 Vizier trials, at most 4 running in parallel,
  -- optimising ROC AUC.
  HPARAM_TUNING_ALGORITHM='VIZIER_DEFAULT',
  HPARAM_TUNING_OBJECTIVES=['roc_auc'],
  EARLY_STOP=TRUE,
  NUM_TRIALS=8,
  MAX_PARALLEL_TRIALS=4
) AS

SELECT * FROM akane_bqds.ga4_training_att2

### Model 2: Weighted boosted tree classifier

In [None]:
%%bigquery --project $PROJECT_ID

-- Model 2: boosted tree classifier with the same inputs and tuning setup as
-- ak_bqmodel1, plus AUTO_CLASS_WEIGHTS turned on. Without the %%bigquery cell
-- magic above, the notebook kernel would try to run this SQL as Python.
CREATE OR REPLACE MODEL akane_bqds.ak_bqmodel2
OPTIONS(
  MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
  INPUT_LABEL_COLS=['label'],
  -- Label-encode categorical columns (this includes sku).
  CATEGORY_ENCODING_METHOD='LABEL_ENCODING',
  DATA_SPLIT_METHOD='AUTO_SPLIT',
  -- Ask BigQuery ML to weight the classes automatically.
  AUTO_CLASS_WEIGHTS=TRUE,
  -- Hyperparameter tuning: 8 Vizier trials, at most 4 running in parallel,
  -- optimising ROC AUC.
  HPARAM_TUNING_ALGORITHM='VIZIER_DEFAULT',
  HPARAM_TUNING_OBJECTIVES=['roc_auc'],
  EARLY_STOP=TRUE,
  NUM_TRIALS=8,
  MAX_PARALLEL_TRIALS=4
) AS

SELECT * FROM akane_bqds.ga4_training_att2

### Model 3: Weighted logistic regression classifier

In [None]:
%%bigquery --project $PROJECT_ID

-- Model 3: class-weighted logistic regression on the prototype training table
-- with the sku column left out. Without the %%bigquery cell magic above, the
-- notebook kernel would try to run this SQL as Python.
-- NOTE(review): this section is headed "Model 3" but the model is named
-- ak_lrmodel2 — confirm the naming is intentional.
CREATE OR REPLACE MODEL akane_bqds.ak_lrmodel2
OPTIONS(
  MODEL_TYPE='LOGISTIC_REG',
  INPUT_LABEL_COLS=['label'],
  DATA_SPLIT_METHOD='AUTO_SPLIT',
  -- Ask BigQuery ML to weight the classes automatically.
  AUTO_CLASS_WEIGHTS=TRUE,
  -- Hyperparameter tuning: 8 Vizier trials, at most 4 running in parallel,
  -- optimising ROC AUC.
  HPARAM_TUNING_ALGORITHM='VIZIER_DEFAULT',
  HPARAM_TUNING_OBJECTIVES=['roc_auc'],
  EARLY_STOP=TRUE,
  NUM_TRIALS=8,
  MAX_PARALLEL_TRIALS=4
) AS

-- sku is excluded here, presumably to limit feature dimensionality — TODO confirm.
SELECT * EXCEPT(sku) FROM akane_bqds.ga4_training_att2

### Model 4: Weighted logistic regression classifier, double trials

In [None]:
%%bigquery --project $PROJECT_ID

-- Model 4: class-weighted logistic regression on the prototype training table
-- with the sku column left out, using 16 tuning trials. Without the
-- %%bigquery cell magic above, the notebook kernel would try to run this SQL
-- as Python.
-- NOTE(review): this section is headed "Model 4" but the model is named
-- ak_lrmodel1 — confirm the naming is intentional.
CREATE OR REPLACE MODEL akane_bqds.ak_lrmodel1
OPTIONS(
  MODEL_TYPE='LOGISTIC_REG',
  INPUT_LABEL_COLS=['label'],
  DATA_SPLIT_METHOD='AUTO_SPLIT',
  -- Ask BigQuery ML to weight the classes automatically.
  AUTO_CLASS_WEIGHTS=TRUE,
  -- Hyperparameter tuning: 16 Vizier trials, at most 2 running in parallel,
  -- optimising ROC AUC.
  HPARAM_TUNING_ALGORITHM='VIZIER_DEFAULT',
  HPARAM_TUNING_OBJECTIVES=['roc_auc'],
  EARLY_STOP=TRUE,
  NUM_TRIALS=16,
  MAX_PARALLEL_TRIALS=2
) AS

-- sku is excluded here, presumably to limit feature dimensionality — TODO confirm.
SELECT * EXCEPT(sku) FROM akane_bqds.ga4_training_att2