## Hyperparameter tuning
This script performs hyperparameter tuning of a Random Forest classification model for the customer churn prediction experiment:<br>
1. Reads source data from BigQuery
2. Writes model to GCS
3. Captures and persists model metrics to GCS and BigQuery
4. Writes model test results to BigQuery

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
import pandas as pd
import sys, logging, argparse, random, tempfile, json
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import round as spark_round
from pyspark.sql.types import StructType, DoubleType, StringType
from pyspark.sql.functions import lit
from pathlib import Path as path
from google.cloud import storage
from urllib.parse import urlparse, urljoin
from datetime import datetime


In [None]:
# Evaluate the session object so the notebook displays it. `spark` is not
# assigned anywhere above this cell, so it is presumably injected by the
# notebook kernel — TODO confirm.
spark

In [None]:
# 1a. Arguments
# Run-level inputs, hard-coded in this notebook. The cells below interpolate
# them into BigQuery and GCS resource names.
pipelineID = "20220813"  # also reused as the model version (step 1b)
projectNbr = "974925525028"  # suffix of every GCS bucket name built below
projectID = "s8s-spark-ml-mlops"  # prefix of the BigQuery dataset; Spark "parentProject"
displayPrintStatements = True  # gates the banner printed in step 1c

In [None]:
# 1b. Variables
# Everything below is derived from the step 1a arguments.

# Application / model identity
appBaseName = "customer-churn-model"
appNameSuffix = "hyperparameter-tuning"
appName = f"{appBaseName}-{appNameSuffix}"
modelBaseNm, operation, modelVersion = appBaseName, appNameSuffix, pipelineID

# BigQuery dataset and fully qualified table names
bqDatasetNm = f"{projectID}.customer_churn_ds"
bigQuerySourceTableFQN = f"{bqDatasetNm}.training_data"
bigQueryModelTestResultsTableFQN = f"{bqDatasetNm}.test_predictions"
bigQueryModelMetricsTableFQN = f"{bqDatasetNm}.model_metrics"

# GCS locations; note the scratch location carries no gs:// scheme
modelBucketUri = f"gs://s8s_model_bucket-{projectNbr}/{modelBaseNm}/{operation}/{modelVersion}"
metricsBucketUri = f"gs://s8s_metrics_bucket-{projectNbr}/{modelBaseNm}/{operation}/{modelVersion}"
scratchBucketUri = f"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}/"

# Timestamp (yyyymmddHHMMSS, local clock) stamped on every row this run writes
pipelineExecutionDt = datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
# Other variables, constants
SPLIT_SEED = 6  # seed passed to randomSplit in step 9
SPLIT_SPECS = [0.8, 0.2]  # randomSplit weights, unpacked as (train, test)
# Candidate values for the step 7 parameter grid: 3 x 3 x 3 = 27 combinations
MAX_DEPTH = [5, 10, 15]  # grid for the classifier's maxDepth param
MAX_BINS = [24, 32, 40]  # grid for the classifier's maxBins param
N_TREES = [25, 30, 35]  # grid for the classifier's numTrees param
N_FOLDS = 5  # numFolds given to CrossValidator

In [None]:
# 1c. Display input and output
# Optional banner listing the run's inputs, the resources it expects to exist,
# and the locations its outputs are written to.
if displayPrintStatements:
    print("Starting hyperparameter tuning for *Customer Churn* experiment")
    print(".....................................................")
    print(f"The datetime now is - {pipelineExecutionDt}")
    print(" ")
    print("INPUT PARAMETERS")
    print(f"....pipelineID={pipelineID}")
    print(f"....projectID={projectID}")
    print(f"....projectNbr={projectNbr}")
    print(f"....displayPrintStatements={displayPrintStatements}")
    print(" ")
    print("EXPECTED SETUP")
    print(f"....BQ Dataset={bqDatasetNm}")
    print(f"....Model Training Source Data in BigQuery={bigQuerySourceTableFQN}")
    print(f"....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-{projectNbr}")
    # Model/metrics bucket names use underscores, matching the bucket part of
    # modelBucketUri and metricsBucketUri that the run actually writes to.
    print(f"....Model Bucket=gs://s8s_model_bucket-{projectNbr}")
    print(f"....Metrics Bucket=gs://s8s_metrics_bucket-{projectNbr}")
    print(" ")
    print("OUTPUT")
    print(f"....Model in GCS={modelBucketUri}")
    print(f"....Model metrics in GCS={metricsBucketUri}")
    print(f"....Model metrics in BigQuery={bigQueryModelMetricsTableFQN}")
    print(f"....Model test results in BigQuery={bigQueryModelTestResultsTableFQN}")

In [None]:
# 2. Spark config
print('....Setting Spark config')
spark = SparkSession.builder.appName(appName).getOrCreate()
# Spark configuration setting for writes to BigQuery
spark.conf.set("parentProject", projectID)
spark.conf.set("temporaryGcsBucket", scratchBucketUri)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Add Python modules
# The SparkContext is taken from the session created above rather than from a
# free-standing `sc`, which nothing in this notebook defines.
spark.sparkContext.addPyFile(f"gs://s8s_code_bucket-{projectNbr}/pyspark/common_utils.py")
# Must come after addPyFile: the module is only importable once it is shipped.
import common_utils

In [None]:
# 3. Pre-process training data
print('....Data pre-procesing')
dataPreprocessingStagesList = []

# 3a. Per categorical column: index the string values, then one-hot encode the
#     index. Nothing is computed here; the stages only run when the pipeline is fit.
for eachCategoricalColumn in common_utils.CATEGORICAL_COLUMN_LIST:
    indexedColumnNm = eachCategoricalColumn + "Index"
    encodedColumnNm = eachCategoricalColumn + "classVec"
    stringIndexer = StringIndexer(inputCol=eachCategoricalColumn, outputCol=indexedColumnNm)
    encoder = OneHotEncoder(inputCols=[indexedColumnNm], outputCols=[encodedColumnNm])
    dataPreprocessingStagesList.extend([stringIndexer, encoder])

# 3b. Index the "churn" column into the "label" column used by the classifier
labelStringIndexer = StringIndexer(inputCol="churn", outputCol="label")
dataPreprocessingStagesList.append(labelStringIndexer)

In [None]:
# 4. Feature engineering
print('....Feature engineering')
# Numeric columns go in as-is; categorical columns go in through the
# "<column>classVec" outputs of the step 3a encoders.
encodedCategoricalColumns = [c + "classVec" for c in common_utils.CATEGORICAL_COLUMN_LIST]
assemblerInputs = common_utils.NUMERIC_COLUMN_LIST + encodedCategoricalColumns
featuresVectorAssembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)
featureEngineeringStageList = [featuresVectorAssembler]

In [None]:
# 5. Model training
print('....Model training')
# The classifier is the only stage here; step 7 tunes it via this list's first entry.
rfClassifier = RandomForestClassifier(featuresCol="features", labelCol="label")
modelTrainingStageList = [rfClassifier]

In [None]:
# 6. Create a model training pipeline for stages defined
print('....Instantiating pipeline model')
# Stage order: pre-processing -> feature assembly -> classifier
allPipelineStages = [
    *dataPreprocessingStagesList,
    *featureEngineeringStageList,
    *modelTrainingStageList,
]
pipeline = Pipeline(stages=allPipelineStages)

In [None]:
# 7. Hyperparameter tuning & cross validation
print('....Hyperparameter tuning & cross validation')
# The classifier whose params are tuned is the first (only) training stage.
tunedEstimator = modelTrainingStageList[0]
gridBuilder = ParamGridBuilder()
for tunedParam, candidateValues in (
    (tunedEstimator.maxDepth, MAX_DEPTH),
    (tunedEstimator.maxBins, MAX_BINS),
    (tunedEstimator.numTrees, N_TREES),
):
    gridBuilder = gridBuilder.addGrid(tunedParam, candidateValues)
parameterGrid = gridBuilder.build()

# metricName is not set, so the evaluator scores each fold with its default metric.
evaluator = BinaryClassificationEvaluator(labelCol="label")
crossValidatorPipeline = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=parameterGrid,
    evaluator=evaluator,
    numFolds=N_FOLDS,
)

In [None]:
# 8. Read training data
print('....Reading training dataset')
inputDF = spark.read.format('bigquery').load(bigQuerySourceTableFQN)

# Typecast some columns to the right datatype (column -> target type)
columnTypeCasts = {
    "partner": 'string',
    "dependents": 'string',
    "phone_service": 'string',
    "paperless_billing": 'string',
    "churn": 'string',
    "monthly_charges": 'float',
    "total_charges": 'float',
}
for castColumnNm, castTargetType in columnTypeCasts.items():
    inputDF = inputDF.withColumn(castColumnNm, inputDF[castColumnNm].cast(castTargetType))

In [None]:
# 9. Split to training and test datasets
print('....Splitting the dataset')
# Seeded split; the two resulting DataFrames are (train, test) in SPLIT_SPECS order.
splitDataFrames = inputDF.randomSplit(SPLIT_SPECS, seed=SPLIT_SEED)
trainDF, testDF = splitDataFrames

In [None]:
# 10. Fit the model; Takes tens of minutes, repartition as needed
# repartition() returns a new DataFrame and leaves trainDF untouched, so the
# result has to be rebound for the requested partitioning to take effect.
trainDF = trainDF.repartition(300)
# Fits the full pipeline for every grid combination on every fold.
pipelineModel = crossValidatorPipeline.fit(trainDF)

In [None]:
# 11. Persist model to GCS
# Overwrite mode: anything already saved at this URI is replaced.
modelWriter = pipelineModel.write().overwrite()
modelWriter.save(modelBucketUri)

In [None]:
# 12. Test the model with the test dataset
print('....Testing the model')
# Score the held-out split with the fitted cross-validator model
predictionsDF = pipelineModel.transform(testDF)
# Preview the first two scored rows
predictionsDF.show(2)

In [None]:
# 13. Persist model testing results to BigQuery
# Constant columns identifying the run that produced each row, in output order.
predictionLineageColumns = {
    "pipeline_id": pipelineID,
    "model_version": pipelineID,
    "pipeline_execution_dt": pipelineExecutionDt,
    "operation": operation,
}
predictionsWithPipelineIdDF = predictionsDF
for lineageColumnNm, lineageValue in predictionLineageColumns.items():
    predictionsWithPipelineIdDF = predictionsWithPipelineIdDF.withColumn(lineageColumnNm, lit(lineageValue))

# Overwrite mode replaces whatever the target table currently holds.
predictionsWriter = predictionsWithPipelineIdDF.write.format('bigquery').mode("overwrite")
predictionsWriter.option('table', bigQueryModelTestResultsTableFQN).save()

In [None]:
def fnCaptureModelMetrics(predictionsDF, labelColumn, operation):
    """
    Compute evaluation metrics for a DataFrame of model predictions.

    Args:
        predictionsDF: DataFrame of predictions; it is handed unchanged to the
            Spark ML evaluators, which are configured only with labelColumn
        labelColumn: name of the column holding the true label
        operation: prefix for every metric name, e.g. "train" or "test"
    Returns:
        dict mapping '<operation>_<metric>' to the metric value rounded to 5
        decimal places, in this order: area_roc, area_prc, accuracy, f1,
        precision, recall (precision and recall are the evaluator's
        weightedPrecision / weightedRecall)

    Anagha TODO: This function if called from common_utils fails; need to research why
    """
    # Instantiate evaluators
    bcEvaluator = BinaryClassificationEvaluator(labelCol=labelColumn)
    mcEvaluator = MulticlassClassificationEvaluator(labelCol=labelColumn)

    # (metric label, evaluator, evaluator metricName), in output order
    metricSpecs = [
        ('area_roc', bcEvaluator, 'areaUnderROC'),
        ('area_prc', bcEvaluator, 'areaUnderPR'),
        ('accuracy', mcEvaluator, 'accuracy'),
        ('f1', mcEvaluator, 'f1'),
        ('precision', mcEvaluator, 'weightedPrecision'),
        ('recall', mcEvaluator, 'weightedRecall'),
    ]

    # NOTE: only these six scalar metrics are returned. The per-row
    # true/score/prediction values are not part of the result, so they are
    # not fetched here.
    metricsDictionary = {
        f'{operation}_{metricLabel}': round(
            metricEvaluator.evaluate(predictionsDF, {metricEvaluator.metricName: sparkMetricNm}), 5
        )
        for metricLabel, metricEvaluator, sparkMetricNm in metricSpecs
    }

    return metricsDictionary
# }} End fnCaptureModelmetrics

In [None]:
# 14. Capture & display metrics
# Metrics are computed on the test-split predictions, hence the "test" prefix.
hyperParameterTunedModelMetrics = fnCaptureModelMetrics(
    predictionsDF, labelColumn="label", operation="test"
)
for metricNm, metricValue in hyperParameterTunedModelMetrics.items():
    print(f'{metricNm}: {metricValue}')

In [None]:
# 15. Persist metrics to BigQuery
# One row per metric: (metric_nm, metric_value)
metricsDF = spark.createDataFrame(hyperParameterTunedModelMetrics.items(), ["metric_nm", "metric_value"])

# Constant columns identifying the run that produced each row, in output order.
metricLineageColumns = {
    "pipeline_id": pipelineID,
    "model_version": pipelineID,
    "pipeline_execution_dt": pipelineExecutionDt,
    "operation": operation,
}
metricsWithPipelineIdDF = metricsDF
for lineageColumnNm, lineageValue in metricLineageColumns.items():
    metricsWithPipelineIdDF = metricsWithPipelineIdDF.withColumn(lineageColumnNm, lit(lineageValue))

metricsWithPipelineIdDF.show()
# Overwrite mode replaces whatever the target table currently holds.
metricsWriter = metricsWithPipelineIdDF.write.format('bigquery').mode("overwrite")
metricsWriter.option('table', bigQueryModelMetricsTableFQN).save()


In [None]:
# 16. Persist metrics to GCS
# The helper receives the bucket name (the netloc of the gs:// URI) and the
# object path within that bucket as separate arguments.
metricsBucketNm = urlparse(metricsBucketUri).netloc
blobName = f"{modelBaseNm}/{operation}/{modelVersion}/metrics.json"
common_utils.fnPersistMetrics(metricsBucketNm, hyperParameterTunedModelMetrics, blobName)
