# Model Development

### Objective: create a Classifier to predict Marketing Interaction Outcomes

- Train and validate initial model
- Create SparkML Pipeline and save it to Object Store

In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("ModelDevelopment")
    .config("spark.authenticate", "true")
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems", "s3a://demo-aws-1")
    # os.environ[...] raises KeyError when HADOOP_USER_NAME is not set.
    .config("spark.hadoop.yarn.resourcemanager.principal", os.environ["HADOOP_USER_NAME"])
    .getOrCreate()
)

#.master("local[*]")\
    
# **Note:**
# Our file isn't big, so running Spark in local mode is fine, but you can add the following config
# if you want to run Spark on the Kubernetes cluster:
#
# > .config("spark.yarn.access.hadoopFileSystems", os.environ['STORAGE'])\

#.config("spark.authenticate", "true") \
#    .config("spark.yarn.access.hadoopFileSystems", os.environ['STORAGE'])

# Report the version of the Spark runtime the session is attached to.
print("Spark Version: {}".format(spark.version))

Py4JError: org.apache.spark.api.python.PythonUtils.isEncryptionEnabled does not exist in the JVM

In [None]:
# Preview the first 10 rows of the source table.
spark.sql("SELECT * FROM DEFAULT.CUSTOMER_INTERACTIONS_CICD LIMIT 10").show()

In [None]:
# Load the full customer-interactions table as the historical dataset.
hist_DF = spark.sql("SELECT * FROM DEFAULT.CUSTOMER_INTERACTIONS_CICD")

In [None]:
# Inspect the (column name, type name) pairs; the type names drive the
# categorical/numeric column split further down.
hist_DF.dtypes

In [None]:
# Keep only the candidate input columns plus the CONVERSION target.
df = hist_DF.select(
    "RECENCY",
    "HISTORY",
    "USED_DISCOUNT",
    "USED_BOGO",
    "ZIP_CODE",
    "IS_REFERRAL",
    "CHANNEL",
    "OFFER",
    "SCORE",
    "CONVERSION",
)

In [None]:
# Rename the target column CONVERSION to lowercase "label", the column name
# the evaluators in this notebook are configured with (labelCol="label").
df = df.withColumnRenamed("CONVERSION","label")

In [None]:
# Split the input columns by Spark SQL type name.
# The target column "label" is excluded explicitly: if it is int-typed it
# would otherwise match the numeric filter, be assembled into the feature
# vector by make_pipeline, and leak the answer into the model.
# NOTE(review): startswith('in') only picks up type names beginning with
# "in" (e.g. int); bigint/double/float columns end up in neither list and
# are therefore not used as features -- confirm that is intended.
cat_cols = [name for name, dtype in df.dtypes
            if name != "label" and dtype.startswith('string')]
num_cols = [name for name, dtype in df.dtypes
            if name != "label" and dtype.startswith('in')]

#### Creating Pipeline
##### Notice the pipeline does not include the classifier. This is done on purpose so we can split it into two jobs.

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
def make_pipeline(df):
    """Build the feature-engineering pipeline and apply it to ``df``.

    Every column named in the module-level ``cat_cols`` is string-indexed and
    then one-hot encoded; the encoded vectors are assembled together with the
    module-level ``num_cols`` into a single ``features`` vector column.

    Args:
        df: DataFrame containing every column listed in ``cat_cols`` and
            ``num_cols``.

    Returns:
        A ``(out_df, pipeline)`` tuple: ``out_df`` is ``df`` transformed by
        the fitted pipeline, and ``pipeline`` is the unfitted ``Pipeline``
        definition (not the fitted model).
    """
    feature_stages = []
    encoded_names = []

    for cat_name in cat_cols:
        indexed_name = cat_name + '_StringIndex'
        encoded_name = cat_name + '_ClassVect'
        # Index first, then encode: the encoder reads the indexer's output.
        feature_stages.append(
            StringIndexer(inputCol=cat_name, outputCol=indexed_name)
        )
        feature_stages.append(
            OneHotEncoderEstimator(inputCols=[indexed_name], outputCols=[encoded_name])
        )
        encoded_names.append(encoded_name)

    # Encoded categorical vectors come first, followed by the numeric columns.
    feature_stages.append(
        VectorAssembler(inputCols=encoded_names + num_cols, outputCol="features")
    )

    pipeline = Pipeline(stages=feature_stages)
    out_df = pipeline.fit(df).transform(df)

    return out_df, pipeline

In [None]:
# Fit and apply the feature pipeline: out_df carries the assembled "features"
# column, pipeline is the unfitted Pipeline definition returned alongside it.
out_df, pipeline = make_pipeline(df)

In [None]:
# Random 80/20 train/test split with a fixed seed.
# NOTE(review): the feature pipeline was fitted on the full DataFrame before
# this split, so the indexers/encoders have already seen the test rows --
# confirm this is acceptable.
train, test = out_df.randomSplit([0.8, 0.2], seed=1)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Base estimator for the grid search, capped at 10 iterations; every other
# setting is left at the library default.
lr = LogisticRegression(maxIter=10)

In [None]:
# Hyperparameter grid: regularization strength only.
# Calling addGrid() twice with the same param keeps only the last list of
# values, so each param is listed exactly once here.
# NOTE(review): if a second hyperparameter (e.g. lr.elasticNetParam) was
# meant to be searched as well, add it as its own addGrid() call.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .build()

In [None]:
# 5-fold cross-validation of `lr` over `paramGrid`, scored by area under the
# ROC curve.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    numFolds=5,
    evaluator=BinaryClassificationEvaluator(metricName="areaUnderROC"),
)

In [None]:
# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

In [None]:
# Cross-validation metric (areaUnderROC here) averaged over the folds, one
# value per param map in paramGrid order.
print(cvModel.avgMetrics)

In [None]:
# Model fitted with the best-scoring param map; this is the object saved to
# the object store at the end of the notebook.
bestModel = cvModel.bestModel
print(bestModel)

#### Test Set Evaluation

In [None]:
#Evaluating model with the held out test set:
prediction = cvModel.transform(test)

In [None]:
# Narrow the scored test set to the true label and the predicted class.
predictionAndTarget = prediction.select("label", "prediction")

In [None]:
# Create both evaluators
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName='areaUnderROC')

In [None]:
# Get metrics
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluator.evaluate(predictionAndTarget)

In [None]:
from handyspark import *
import matplotlib.pyplot as plt

In [None]:
# Handy Spark Library 
# Creates instance of extended version of BinaryClassificationMetrics
# using a DataFrame and its probability and label columns, as the output
# from the classifier
bcm = BinaryClassificationMetrics(prediction, scoreCol='probability', labelCol='label')

In [None]:
# Now we can PLOT both ROC and PR curves!
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
bcm.plot_roc_curve(ax=axs[0])
bcm.plot_pr_curve(ax=axs[1])
plt.show()

In [None]:
# And get the confusion matrix for any threshold we want
cm = bcm.print_confusion_matrix(.50)

In [None]:
from confusion_matrix_pretty_print import pretty_plot_confusion_matrix

In [None]:
pretty_plot_confusion_matrix(cm, figsize=[5,4])
plt.show()

In [None]:
#Printing metrics 
#print(acc)
#print(f1)
#print(weightedPrecision)
#print(weightedRecall)
#print(auc)

#### Saving Pipeline and Logistic Regression Model to Object Store

In [None]:
import datetime
import os, time

In [None]:
# Timestamp (day, month, year, hour, minute, second) used to tag the saved
# artifact paths for this run.
run_time_suffix = datetime.datetime.now().strftime("%d%m%Y%H%M%S")

In [None]:
# Persist the best model and the (unfitted) feature pipeline under paths
# tagged with this run's timestamp; overwrite() replaces anything already
# stored at the same path.
# NOTE(review): the pipeline path has no underscore before the timestamp,
# unlike the model path -- confirm downstream readers expect that.
bestModel.write().overwrite().save("s3a://demo-aws-1/datalake/pdefusco/bestLR_{}".format(run_time_suffix))
pipeline.write().overwrite().save("s3a://demo-aws-1/datalake/pdefusco/pipeline{}".format(run_time_suffix))

In [None]:
# Print the path the best model was saved to.
print("s3a://demo-aws-1/datalake/pdefusco/bestLR_{}".format(run_time_suffix))

In [None]:
# Shut down the Spark session now that all artifacts have been written.
spark.stop()