In [1]:
from google.cloud import bigquery
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


#### Create a master table in BigQuery that joins the transactions & identity tables
Need to have a BigQuery dataset called credit_card_fraud and the two tables called train_identity and train_transaction

In [2]:
%%bigquery
CREATE OR REPLACE TABLE credit_card_fraud.data AS

SELECT * EXCEPT(TransactionID) FROM credit_card_fraud.train_identity AS a
INNER JOIN credit_card_fraud.train_transaction AS b
ON a.TransactionID = b.TransactionID

Query complete after 0.01s: 100%|██████████| 4/4 [00:00<00:00, 1075.05query/s]                        


#### Create BigQuery tables for training and validation data

training dataset

In [3]:
%%bigquery
CREATE OR REPLACE TABLE credit_card_fraud.train AS

WITH features_table AS (
SELECT IFNULL(TransactionDT, 0) AS TransactionDT, IFNULL(TransactionAmt, 0.0) AS TransactionAmt, IFNULL(card1,0) AS card1, IFNULL(card2,0.0) AS card2, IFNULL(card3,0.0) AS card3, IFNULL(C1,0.0) AS C1, IFNULL(C2,0.0) AS C2, IFNULL(C11,0.0) AS C11, IFNULL(C12,0.0) AS C12, IFNULL(C13,0.0) AS C13, IFNULL(C14,0.0) AS C14, IFNULL(D8,0.0) AS D8, IFNULL(V45,0.0) AS V45, IFNULL(V87,0.0) AS V87, IFNULL(V258,0.0) AS V258, IFNULL(card6, "Unknown") AS card6, IFNULL(ProductCD, "Unknown") AS ProductCD, IFNULL(P_emaildomain, "Unknown") AS emaildomain,isFraud 
FROM `kubeflow-1-0-2.credit_card_fraud.data`
WHERE isFraud IS NOT NULL)

SELECT * FROM features_table AS data
WHERE MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(data))), 10) < 8

Query complete after 0.01s: 100%|██████████| 3/3 [00:00<00:00, 279.75query/s]                         


validation dataset

In [4]:
%%bigquery
CREATE OR REPLACE TABLE credit_card_fraud.validation AS

WITH features_table AS (
SELECT IFNULL(TransactionDT, 0) AS TransactionDT, IFNULL(TransactionAmt, 0.0) AS TransactionAmt, IFNULL(card1,0) AS card1, IFNULL(card2,0.0) AS card2, IFNULL(card3,0.0) AS card3, IFNULL(C1,0.0) AS C1, IFNULL(C2,0.0) AS C2, IFNULL(C11,0.0) AS C11, IFNULL(C12,0.0) AS C12, IFNULL(C13,0.0) AS C13, IFNULL(C14,0.0) AS C14, IFNULL(D8,0.0) AS D8, IFNULL(V45,0.0) AS V45, IFNULL(V87,0.0) AS V87, IFNULL(V258,0.0) AS V258, IFNULL(card6, "Unknown") AS card6, IFNULL(ProductCD, "Unknown") AS ProductCD, IFNULL(P_emaildomain, "Unknown") AS emaildomain,isFraud 
FROM `kubeflow-1-0-2.credit_card_fraud.data`
WHERE isFraud IS NOT NULL)

SELECT * FROM features_table AS data
WHERE MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(data))), 10) >= 8

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1691.02query/s]                        


#### Export the train and validation data tables in BigQuery to Parquet files in GCS

In [5]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0] 
BUCKET = PROJECT_ID
print(f"Project ID: {PROJECT_ID}")

Project ID: kubeflow-1-0-2


Data will be saved to gs://bucket/credit_card_fraud/data/train.parquet and gs://bucket/credit_card_fraud/data/eval.parquet

In [6]:
DATA_PATH = f'gs://{BUCKET}/credit_card_fraud/data/' 
TRAIN_DATA_PATH = DATA_PATH + 'train.parquet'
EVAL_DATA_PATH = DATA_PATH + 'eval.parquet'
ARTIFACT_STORE = f'gs://{PROJECT_ID}-kubeflowpipelines-default'
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)

In [7]:
!bq extract --destination_format PARQUET $PROJECT_ID:credit_card_fraud.train $TRAIN_DATA_PATH

Waiting on bqjob_r4f6e4f4c2aa3a916_00000178fddfb5fa_1 ... (1s) Current status: DONE   


In [8]:
!bq extract --destination_format PARQUET $PROJECT_ID:credit_card_fraud.validation $EVAL_DATA_PATH

Waiting on bqjob_rfbb06ea847d6e22_00000178fddfca14_1 ... (0s) Current status: DONE   


#### We need to make sure we have the correct versions of everything installed (based on local modelling done beforehand)

In [9]:
# %%bash
# pip install --upgrade setuptools
# pip install --upgrade xgboost==1.2.1
# pip install --upgrade scikit-learn==0.23.2

In [10]:
# Restart your kernel then run this cell. These should now be 1.2.1 and 0.23.2
import xgboost
import sklearn
print(f"XGBoost Version: {xgboost.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")

XGBoost Version: 1.2.1
Scikit-learn Version: 0.23.2


#### Develop the train.py file. We will do one hot encoding and scaling in this file to make sure that it happens at prediction time as well. Doing the one hot encoding **BEFORE** creating train.parquet and eval.parquet is bad practice because then we have to do those transformations on the client side at prediction time

In [11]:
!mkdir ./trainer

mkdir: cannot create directory ‘./trainer’: File exists


In [12]:
%%writefile trainer/train.py

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

import xgboost
from xgboost import XGBClassifier
import sklearn
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from imblearn.over_sampling import SMOTENC  # SMOTE but ALSO including categorical features to oversample min class

def train_evaluate(job_dir, 
                   training_dataset_path, 
                   validation_dataset_path, 
                   max_depth,
                   hptune):
    
    
    # Get training and validation datasets (from GCS buckets)
    df_train = pd.read_parquet(training_dataset_path)
    df_validation = pd.read_parquet(validation_dataset_path)
    
    numeric_feature_indexes = list(range(0,15))
    categorical_feature_indexes = list(range(15,18))

    preprocessor = ColumnTransformer(
                                    transformers=[
                                                ('num', StandardScaler(), numeric_feature_indexes),
                                                ('cat', OneHotEncoder(), categorical_feature_indexes) 
                                    ])

    pipeline = Pipeline([
                        ('preprocessor', preprocessor),
                        ('classifier', XGBClassifier(max_depth=max_depth))
                        ])
    
    num_features_type_map = {feature: 'float64' for feature in df_train.columns[numeric_feature_indexes]}
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)
    
    # X:y splits, for training and validation
    y_train = df_train['isFraud'] # target/label column
    X_train = df_train.drop('isFraud', 
                            axis=1, # columns
                            inplace=False) # The original dataframe df remains the same
    
    y_validation = df_validation['isFraud'] # target/label column
    X_validation = df_validation.drop('isFraud', 
                                      axis=1, # columns
                                      inplace=False) # The original dataframe df remains the same
    
    
    X_train, y_train = SMOTENC(categorical_features=categorical_feature_indexes).fit_resample(X_train, 
                                                                                              np.array(y_train))
    
    print("SMOTE complete. Begin fitting model")
    pipeline.fit(X_train, y_train)

    if hptune:
        y_pred = pipeline.predict(X_validation)
        predictions = pipeline.predict_proba(X_validation)[:, 1]
        roc_auc = roc_auc_score(y_validation,
                                predictions)
        print('ROC AUC Score: ', roc_auc)
        
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(hyperparameter_metric_tag='roc_auc',  # metric to be optimized
                                                metric_value=roc_auc)
        
    # Save the model
    if not hptune:
        model_filename = 'model.pkl'
        with open(model_filename, 'wb') as model_file:
            pickle.dump(pipeline, model_file)
        gcs_model_path = "{}/{}".format(job_dir, model_filename)
        subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
        print("Saved model in: {}".format(gcs_model_path))
    
if __name__ == "__main__":
    fire.Fire(train_evaluate)

Overwriting trainer/train.py


In [13]:
# To see what versions of libraries we need to include in Dockerfile
import pandas
import sklearn
import xgboost
import imblearn

print(f"XGBoost Version: {xgboost.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
print(f"Pandas Version: {pandas.__version__}")
print(f"imbalanced-learn Version: {imblearn.__version__}")

XGBoost Version: 1.2.1
Scikit-learn Version: 0.23.2
Pandas Version: 1.2.4
imbalanced-learn Version: 0.8.0


#### Package script into Docker Image

In [14]:
%%writefile trainer/Dockerfile

FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire cloudml-hypertune imbalanced-learn scikit-learn==0.23.2 pandas==1.2.4 xgboost==1.2.1
WORKDIR /app
COPY train.py .

ENTRYPOINT ["python", "train.py"]

Overwriting trainer/Dockerfile


#### Build and push to GCR

In [15]:
IMAGE_NAME='xgboost_fraud_trainer'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)
TRAINING_APP_FOLDER='trainer'
print(f"The next cell will build and push Image to: {IMAGE_URI}")

The next cell will build and push Image to: gcr.io/kubeflow-1-0-2/xgboost_fraud_trainer:latest


In [16]:
!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 3 file(s) totalling 4.1 KiB before compression.
Uploading tarball of [trainer] to [gs://kubeflow-1-0-2_cloudbuild/source/1619167029.703525-7251b1d08aef4920958b868c7cc528fc.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/kubeflow-1-0-2/locations/global/builds/71f8ecbf-9ea2-4ce4-8e14-9481e343d196].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/71f8ecbf-9ea2-4ce4-8e14-9481e343d196?project=9118975290].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "71f8ecbf-9ea2-4ce4-8e14-9481e343d196"

FETCHSOURCE
Fetching storage object: gs://kubeflow-1-0-2_cloudbuild/source/1619167029.703525-7251b1d08aef4920958b868c7cc528fc.tgz#1619167030047338
Copying gs://kubeflow-1-0-2_cloudbuild/source/1619167029.703525-7251b1d08aef4920958b868c7cc528fc.tgz#1619167030047338...
/ [1 files][  1.6 KiB/  1.6 KiB]                                                
Operation completed over 1 objects

#### Create hyperparam tuning yaml file

In [19]:
%%writefile trainer/hptuning_config.yaml

trainingInput:
  hyperparameters:
    goal: MAXIMIZE
    maxTrials: 5
    maxParallelTrials: 2
    hyperparameterMetricTag: roc_auc
    enableTrialEarlyStopping: TRUE 
    params:
    - parameterName: max_depth
      type: INTEGER
      minValue: 18
      maxValue: 23
      scaleType: UNIT_LINEAR_SCALE

Overwriting trainer/hptuning_config.yaml


In [20]:
TRAIN_DATA_PATH, EVAL_DATA_PATH

('gs://kubeflow-1-0-2/credit_card_fraud/data/train.parquet',
 'gs://kubeflow-1-0-2/credit_card_fraud/data/eval.parquet')

#### Start the Hyperparameter Tuning job

In [21]:
import time
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(f'gs://{BUCKET}/credit_card_fraud/models', JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=us-central1 \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config trainer/hptuning_config.yaml \
--job-dir=$JOB_DIR \
-- \
--training_dataset_path=$TRAIN_DATA_PATH \
--validation_dataset_path=$EVAL_DATA_PATH \
--hptune

Job [JOB_20210423_084142] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20210423_084142

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20210423_084142
jobId: JOB_20210423_084142
state: QUEUED


#### Launch a training job with the best max_depth

In [23]:
import time
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(f'gs://{BUCKET}/credit_card_fraud/models', JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=us-central1 \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAIN_DATA_PATH \
--validation_dataset_path=$EVAL_DATA_PATH \
--max_depth=22 \
--nohptune

Job [JOB_20210423_094843] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20210423_094843

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20210423_094843
jobId: JOB_20210423_094843
state: QUEUED


#### Deploy the model to make sure it works :-)

In [25]:
!gcloud ai-platform models create cc_fraud_classifier \
--region=us-central1

Using endpoint [https://us-central1-ml.googleapis.com/]
Created ai platform model [projects/kubeflow-1-0-2/models/cc_fraud_classifier].


In [28]:
JOB_DIR

'gs://kubeflow-1-0-2/credit_card_fraud/models/JOB_20210423_094843'

In [43]:
%%bash
gcloud alpha ai-platform versions create v1 \
--model=cc_fraud_classifier \
--origin=gs://kubeflow-1-0-2/credit_card_fraud/models/JOB_20210423_094843/ \
--framework=scikit-learn \
--python-version=3.7 \
--runtime-version=2.3 \
--region=us-central1 \
--machine-type=mls1-c1-m2 \
--package-uris=gs://kyles-public-bucket/packages/imbalanced-learn-0.8.0.tar.gz

Using endpoint [https://us-central1-ml.googleapis.com/]
ERROR: (gcloud.alpha.ai-platform.versions.create) INVALID_ARGUMENT: Machine type is not available on this endpoint.


CalledProcessError: Command 'b'gcloud alpha ai-platform versions create v1 \\\n--model=cc_fraud_classifier \\\n--origin=gs://kubeflow-1-0-2/credit_card_fraud/models/JOB_20210423_094843/ \\\n--framework=scikit-learn \\\n--python-version=3.7 \\\n--runtime-version=2.3 \\\n--region=us-central1 \\\n--machine-type=mls1-c1-m2 \\\n--package-uris=gs://kyles-public-bucket/packages/imbalanced-learn-0.8.0.tar.gz\n'' returned non-zero exit status 1.

#### Awesome! Now let's jump over to kfp_credit_card_fraud.ipynb and build a kubeflow pipeline.