## Hyperparamater tuning
This script does hyperparameter tuning of a Random Forest Classification model for the customer churn prediction experiment-</br>
1. Reads source data from BigQuery as a source, 
2. Writes model to GCS
3. Captures and persists model metrics to GCS and BigQuery
4. Writes model test results to BigQuery

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
import pandas as pd
import sys, logging, argparse, random, tempfile, json
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import round as spark_round
from pyspark.sql.types import StructType, DoubleType, StringType
from pyspark.sql.functions import lit
from pathlib import Path as path
from google.cloud import storage
from urllib.parse import urlparse, urljoin
from datetime import datetime


In [2]:
spark

In [3]:
# 1a. Arguments
pipelineID = "20220813"
projectNbr = "974925525028"
projectID = "s8s-spark-ml-mlops"
displayPrintStatements = True

In [4]:
# 1b. Variables 
appBaseName = "customer-churn-model"
appNameSuffix = "hyperparameter-tuning"
appName = f"{appBaseName}-{appNameSuffix}"
modelBaseNm = appBaseName
modelVersion = pipelineID
bqDatasetNm = f"{projectID}.customer_churn_ds"
operation = appNameSuffix
bigQuerySourceTableFQN = f"{bqDatasetNm}.training_data"
bigQueryModelTestResultsTableFQN = f"{bqDatasetNm}.test_predictions"
bigQueryModelMetricsTableFQN = f"{bqDatasetNm}.model_metrics"
modelBucketUri = f"gs://s8s_model_bucket-{projectNbr}/{modelBaseNm}/{operation}/{modelVersion}"
metricsBucketUri = f"gs://s8s_metrics_bucket-{projectNbr}/{modelBaseNm}/{operation}/{modelVersion}"
scratchBucketUri = f"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}/"
pipelineExecutionDt = datetime.now().strftime("%Y%m%d%H%M%S")

In [5]:
# Other variables, constants
SPLIT_SEED = 6
SPLIT_SPECS = [0.8, 0.2]
MAX_DEPTH = [5, 10, 15]
MAX_BINS = [24, 32, 40]
N_TREES = [25, 30, 35]
N_FOLDS = 5

In [6]:
# 1c. Display input and output
if displayPrintStatements:
    print("Starting hyperparameter tuning for *Customer Churn* experiment")
    print(".....................................................")
    print(f"The datetime now is - {pipelineExecutionDt}")
    print(" ")
    print("INPUT PARAMETERS")
    print(f"....pipelineID={pipelineID}")
    print(f"....projectID={projectID}")
    print(f"....projectNbr={projectNbr}")
    print(f"....displayPrintStatements={displayPrintStatements}")
    print(" ")
    print("EXPECTED SETUP")  
    print(f"....BQ Dataset={bqDatasetNm}")
    print(f"....Model Training Source Data in BigQuery={bigQuerySourceTableFQN}")
    print(f"....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-{projectNbr}") 
    print(f"....Model Bucket=gs://s8s-model-bucket-{projectNbr}")  
    print(f"....Metrics Bucket=gs://s8s-metrics-bucket-{projectNbr}") 
    print(" ")
    print("OUTPUT")
    print(f"....Model in GCS={modelBucketUri}")
    print(f"....Model metrics in GCS={metricsBucketUri}")  
    print(f"....Model metrics in BigQuery={bigQueryModelMetricsTableFQN}")      
    print(f"....Model test results in BigQuery={bigQueryModelTestResultsTableFQN}") 

Starting hyperparameter tuning for *Customer Churn* experiment
.....................................................
The datetime now is - 20220802170807
 
INPUT PARAMETERS
....pipelineID=20220813
....projectID=s8s-spark-ml-mlops
....projectNbr=974925525028
....displayPrintStatements=True
 
EXPECTED SETUP
....BQ Dataset=s8s-spark-ml-mlops.customer_churn_ds
....Model Training Source Data in BigQuery=s8s-spark-ml-mlops.customer_churn_ds.training_data
....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-974925525028
....Model Bucket=gs://s8s-model-bucket-974925525028
....Metrics Bucket=gs://s8s-metrics-bucket-974925525028
 
OUTPUT
....Model in GCS=gs://s8s_model_bucket-974925525028/customer-churn-model/hyperparameter-tuning/20220813
....Model metrics in GCS=gs://s8s_metrics_bucket-974925525028/customer-churn-model/hyperparameter-tuning/20220813
....Model metrics in BigQuery=s8s-spark-ml-mlops.customer_churn_ds.model_metrics
....Model test results in BigQuery=s8s-spark-ml-mlops.custom

In [7]:
# 2. Spark config
print('....Setting Spark config')
spark = SparkSession.builder.appName(appName).getOrCreate()
# Spark configuration setting for writes to BigQuery
spark.conf.set("parentProject", projectID)
spark.conf.set("temporaryGcsBucket", scratchBucketUri)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Add Python modules
sc.addPyFile(f"gs://s8s_code_bucket-{projectNbr}/pyspark/common_utils.py")
import common_utils

....Setting Spark config


In [8]:
# 3. Pre-process training data
print('....Data pre-procesing')
dataPreprocessingStagesList = []
# 3a. Create and append to pipeline stages - string indexing and one hot encoding
for eachCategoricalColumn in common_utils.CATEGORICAL_COLUMN_LIST:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=eachCategoricalColumn, outputCol=eachCategoricalColumn + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[eachCategoricalColumn + "classVec"])
    # Add stages.  This is a lazy operation
    dataPreprocessingStagesList += [stringIndexer, encoder]

# 3b. Convert label into label indices using the StringIndexer and append to pipeline stages
labelStringIndexer = StringIndexer(inputCol="churn", outputCol="label")
dataPreprocessingStagesList += [labelStringIndexer]

....Data pre-procesing


In [9]:
# 4. Feature engineering
print('....Feature engineering')
featureEngineeringStageList = []
assemblerInputs = common_utils.NUMERIC_COLUMN_LIST + [c + "classVec" for c in common_utils.CATEGORICAL_COLUMN_LIST]
featuresVectorAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
featureEngineeringStageList += [featuresVectorAssembler]

....Feature engineering


In [10]:
# 5. Model training
print('....Model training')
modelTrainingStageList = []
rfClassifier = RandomForestClassifier(labelCol="label", featuresCol="features")
modelTrainingStageList += [rfClassifier]

....Model training


In [11]:
# 6. Create a model training pipeline for stages defined
print('....Instantiating pipeline model')
pipeline = Pipeline(stages=dataPreprocessingStagesList + featureEngineeringStageList + modelTrainingStageList)   

....Instantiating pipeline model


In [12]:
# 7. Hyperparameter tuning & cross validation
print('....Hyperparameter tuning & cross validation')
parameterGrid = (ParamGridBuilder()
               .addGrid(modelTrainingStageList[0].maxDepth, MAX_DEPTH)
               .addGrid(modelTrainingStageList[0].maxBins, MAX_BINS)
               .addGrid(modelTrainingStageList[0].numTrees, N_TREES)
               .build())

evaluator = BinaryClassificationEvaluator(labelCol="label")
crossValidatorPipeline = CrossValidator(estimator=pipeline,
                                 estimatorParamMaps=parameterGrid,
                                 evaluator=evaluator,
                                 numFolds=N_FOLDS)

....Hyperparameter tuning & cross validation


In [13]:
# 8. Read training data
print('....Reading training dataset')
inputDF = spark.read \
    .format('bigquery') \
    .load(bigQuerySourceTableFQN)


# Typecast some columns to the right datatype
inputDF = inputDF.withColumn("partner", inputDF.partner.cast('string')) \
    .withColumn("dependents", inputDF.dependents.cast('string')) \
    .withColumn("phone_service", inputDF.phone_service.cast('string')) \
    .withColumn("paperless_billing", inputDF.paperless_billing.cast('string')) \
    .withColumn("churn", inputDF.churn.cast('string')) \
    .withColumn("monthly_charges", inputDF.monthly_charges.cast('float')) \
    .withColumn("total_charges", inputDF.total_charges.cast('float'))

....Reading training dataset


In [14]:
# 9. Split to training and test datasets
print('....Splitting the dataset')
trainDF, testDF = inputDF.randomSplit(SPLIT_SPECS, seed=SPLIT_SEED)

....Splitting the dataset


In [15]:
# 10. Fit the model; Takes tens of minutes, repartition as needed
trainDF.repartition(300)
pipelineModel = crossValidatorPipeline.fit(trainDF)

22/08/02 17:09:55 WARN DAGScheduler: Broadcasting large task binary with size 1028.7 KiB
22/08/02 17:09:55 WARN DAGScheduler: Broadcasting large task binary with size 1350.0 KiB
22/08/02 17:09:56 WARN DAGScheduler: Broadcasting large task binary with size 1802.4 KiB
22/08/02 17:09:56 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/08/02 17:09:58 WARN DAGScheduler: Broadcasting large task binary with size 1682.9 KiB
22/08/02 17:10:03 WARN DAGScheduler: Broadcasting large task binary with size 1124.1 KiB
22/08/02 17:10:04 WARN DAGScheduler: Broadcasting large task binary with size 1529.0 KiB
22/08/02 17:10:04 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
22/08/02 17:10:05 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
22/08/02 17:10:07 WARN DAGScheduler: Broadcasting large task binary with size 1955.3 KiB
22/08/02 17:10:12 WARN DAGScheduler: Broadcasting large task binary with size 1206.5 KiB
22/08/02 17:10:13 WARN DAGSche

In [16]:
# 11. Persist model to GCS
pipelineModel.write().overwrite().save(modelBucketUri)

                                                                                

In [17]:
# 12. Test the model with the test dataset
print('....Testing the model')
predictionsDF = pipelineModel.transform(testDF)
predictionsDF.show(2)

....Testing the model


22/08/02 17:32:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----------+------+--------------+-------+----------+------+------------+-------------+--------------+----------------+---------------+-------------+-----------------+------------+------------+----------------+--------------+-----------------+----------------+---------------+-------------+-----+-----------+---------------------+-----------+--------------+-------------------+----------------------+------------+---------------+---------------+------------------+------------------+---------------------+-------------------+----------------------+---------------------+------------------------+--------------------+-----------------------+------------------+---------------------+----------------------+-------------------------+-----------------+--------------------+-----------------+--------------------+---------------------+------------------------+-------------+----------------+----------------------+-------------------------+-------------------+----------------------+-----+---------------

                                                                                

In [18]:
# 13. Persist model testing results to BigQuery
predictionsWithPipelineIdDF = predictionsDF.withColumn("pipeline_id", lit(pipelineID)) \
                                   .withColumn("model_version", lit(pipelineID)) \
                                   .withColumn("pipeline_execution_dt", lit(pipelineExecutionDt)) \
                                   .withColumn("operation", lit(operation)) 

predictionsWithPipelineIdDF.write.format('bigquery') \
.mode("overwrite")\
.option('table', bigQueryModelTestResultsTableFQN) \
.save()

                                                                                

In [19]:
def fnCaptureModelMetrics(predictionsDF, labelColumn, operation):
    """
    Get model metrics
    Args:
        predictions: predictions
        labelColumn: target column
        operation: train or test
    Returns:
        metrics: metrics
        
    Anagha TODO: This function if called from common_utils fails; Need to researchy why
    """
    
    metricLabels = ['area_roc', 'area_prc', 'accuracy', 'f1', 'precision', 'recall']
    metricColumns = ['true', 'score', 'prediction']
    metricKeys = [f'{operation}_{ml}' for ml in metricLabels] + metricColumns

    # Instantiate evaluators
    bcEvaluator = BinaryClassificationEvaluator(labelCol=labelColumn)
    mcEvaluator = MulticlassClassificationEvaluator(labelCol=labelColumn)

    # Capture metrics -> areas, acc, f1, prec, rec
    area_roc = round(bcEvaluator.evaluate(predictionsDF, {bcEvaluator.metricName: 'areaUnderROC'}), 5)
    area_prc = round(bcEvaluator.evaluate(predictionsDF, {bcEvaluator.metricName: 'areaUnderPR'}), 5)
    acc = round(mcEvaluator.evaluate(predictionsDF, {mcEvaluator.metricName: "accuracy"}), 5)
    f1 = round(mcEvaluator.evaluate(predictionsDF, {mcEvaluator.metricName: "f1"}), 5)
    prec = round(mcEvaluator.evaluate(predictionsDF, {mcEvaluator.metricName: "weightedPrecision"}), 5)
    rec = round(mcEvaluator.evaluate(predictionsDF, {mcEvaluator.metricName: "weightedRecall"}), 5)

    # Get the true, score, prediction off of the test results dataframe
    rocDictionary = common_utils.fnGetTrueScoreAndPrediction(predictionsDF, labelColumn)
    true = rocDictionary['true']
    score = rocDictionary['score']
    prediction = rocDictionary['prediction']

    # Create a metric values array
    metricValuesArray = [] 
    metricValuesArray.extend((area_roc, area_prc, acc, f1, prec, rec))
    #metricValuesArray.extend((area_roc, area_prc, acc, f1, prec, rec, true, score, prediction))
    
    # Zip the keys and values into a dictionary  
    metricsDictionary = dict(zip(metricKeys, metricValuesArray))

    return metricsDictionary
# }} End fnCaptureModelmetrics

In [20]:
# 14. Capture & display metrics
hyperParameterTunedModelMetrics = fnCaptureModelMetrics(predictionsDF, "label", "test")
for m, v in hyperParameterTunedModelMetrics.items():
    print(f'{m}: {v}')

                                                                                

+----+-------+----------+
|true|  score|prediction|
+----+-------+----------+
| 1.0|0.55539|       1.0|
| 1.0| 0.5161|       1.0|
+----+-------+----------+
only showing top 2 rows



[Stage 11616:>                                                      (0 + 1) / 1]

test_area_roc: 0.84965
test_area_prc: 0.64959
test_accuracy: 0.82057
test_f1: 0.80615
test_precision: 0.80952
test_recall: 0.82057


                                                                                

In [21]:
# 15. Persist metrics to BigQuery
metricsDF = spark.createDataFrame(hyperParameterTunedModelMetrics.items(), ["metric_nm", "metric_value"]) 
metricsWithPipelineIdDF = metricsDF.withColumn("pipeline_id", lit(pipelineID)) \
                                   .withColumn("model_version", lit(pipelineID)) \
                                   .withColumn("pipeline_execution_dt", lit(pipelineExecutionDt)) \
                                   .withColumn("operation", lit(operation)) 

metricsWithPipelineIdDF.show()
metricsWithPipelineIdDF.write.format('bigquery') \
.mode("overwrite")\
.option('table', bigQueryModelMetricsTableFQN) \
.save()


                                                                                

+--------------+------------+-----------+-------------+---------------------+--------------------+
|     metric_nm|metric_value|pipeline_id|model_version|pipeline_execution_dt|           operation|
+--------------+------------+-----------+-------------+---------------------+--------------------+
| test_area_roc|     0.84965|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
| test_area_prc|     0.64959|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
| test_accuracy|     0.82057|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
|       test_f1|     0.80615|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
|test_precision|     0.80952|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
|   test_recall|     0.82057|   20220813|     20220813|       20220802170807|hyperparameter-tu...|
+--------------+------------+-----------+-------------+---------------------+--------------------+



                                                                                

In [22]:
# 16. Persist metrics to GCS
blobName = f"{modelBaseNm}/{operation}/{modelVersion}/metrics.json"
common_utils.fnPersistMetrics(urlparse(metricsBucketUri).netloc, hyperParameterTunedModelMetrics, blobName)


22/08/02 17:34:06 ERROR AsyncEventQueue: Listener EventLoggingListener threw an exception
java.lang.NullPointerException
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getGcsFs(GoogleHadoopFileSystemBase.java:1274)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopSyncableOutputStream.commitCurrentFile(GoogleHadoopSyncableOutputStream.java:335)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopSyncableOutputStream.hsyncInternal(GoogleHadoopSyncableOutputStream.java:286)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopSyncableOutputStream.hflush(GoogleHadoopSyncableOutputStream.java:259)
	at org.apache.hadoop.fs.FSDataOutputStream.hflush(FSDataOutputStream.java:136)
	at org.apache.spark.deploy.history.EventLogFileWriter.$anonfun$writeLine$3(EventLogFileWriters.scala:122)
	at org.apache.spark.deploy.history.EventLogFileWriter.$anonfun$writeLine$3$adapted(EventLogFileWriters.scala:122)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.deploy.history.EventLogFileWrite