In [1]:
from pyspark.sql import SparkSession
import sys
import re
import os
import warnings

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("myapp")
    .getOrCreate()
)

In [3]:
print(os.environ['SPARK_HOME'])
warnings.filterwarnings('ignore')

/opt/homebrew/Cellar/apache-spark/3.2.1/libexec


In [None]:
# Helper Method to create classification Report
def makeClassificationReport(metricsArray):
    """Print accuracy, precision, recall and F1 score for a 2x2 confusion matrix.

    Parameters
    ----------
    metricsArray : 2x2 indexable of numbers
        Read as ``[[TN, FN], [FP, TP]]``, i.e. ``metricsArray[0][1]`` is taken
        as the false negatives and ``metricsArray[1][0]`` as the false positives.
        NOTE(review): if this array comes from Spark's
        ``MulticlassMetrics.confusionMatrix()`` (rows = actual, columns =
        predicted), those two cells would be the other way round and precision
        and recall would be swapped -- confirm against the caller.

    Returns
    -------
    None
        The report is written to stdout only.

    A metric whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of raising
    ZeroDivisionError or printing nan.
    """
    def ratio(numerator, denominator):
        # Undefined metric (zero denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    TN = metricsArray[0][0]
    FN = metricsArray[0][1]
    FP = metricsArray[1][0]
    TP = metricsArray[1][1]

    Accuracy = ratio(TP + TN, TP + FN + TN + FP)
    Precision = ratio(TP, TP + FP)
    Recall = ratio(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall; 0.0 when both are zero.
    F1Score = ratio(2 * (Precision * Recall), Precision + Recall)

    print("Classification Report")
    print("Accuracy: ", Accuracy)
    print("Precision: ", Precision)
    print("Recall: ", Recall)
    print("F1- Score: ", F1Score)

## 1. READING DEMONSTRATION DATA 
- We are reading the cleaned dataset prepared in the previous Jupyter notebook.
- The data is stored on S3 at the following location: s3://brfss-big-data-project/HeartRiskData/


In [7]:
# READ LOCAL DATA FILE
# Comment out this cell when reading from S3 instead

# Use the first row of each file as the header and let Spark infer column types.
heartData = (
    spark.read
    .option("header", 'true')
    .option("inferSchema", 'true')
    .csv("../../../BRFSS/HeartRiskData/")
)

In [8]:
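# READ FROM S3 BUCKET
# Comment out this cell when reading locally.
# NOTE: the lines below use `sc`; if it is not already defined, set
# `sc = spark.sparkContext` before uncommenting them.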
# sc._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
# sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
# sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# heartData = spark.read.csv("s3a://brfss-big-data-project/HeartRiskData/", header = 'true',inferSchema='true')

In [9]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single "features" vector column.
numericCols = [
    'BMI', 'HighChol', 'CholCheck', 'FruitConsume', 'VegetableConsume',
    'Smoker', 'HeavyDrinker', 'Diabetes', 'Stroke', 'Healthcare',
    'NoDoctorDueToCost', 'PhysicalActivity', 'GeneralHealth',
    'PhysicalHealth', 'MentalHealth', 'DifficultyWalking', 'Gender',
    'Age', 'Education', 'Income',
]
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
df = assembler.transform(heartData)

# Keep the 10% slice as demonstration data, using the same seed as before.
# NOTE(review): presumably this reproduces the earlier split only if the data
# is read and partitioned the same way -- confirm.
modelData, demoData = df.randomSplit([0.9, 0.1], seed=2018)
print(f"Test Dataset Count: {demoData.count()}")

22/05/31 02:29:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 8) / 8]

Test Dataset Count: 52052


                                                                                

In [10]:
# Preview the first two rows of the demonstration data.
demoData.show(n=2)

+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|HeartDisease|State|  BMI|HighBP|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender|Age|Education|Income|            features|
+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|         0.0| 22.0|18.56|   0.0|     1.0|      1.0|         1.0|             1.0|   1.0|         0.0|     0.0|   0.0|       1.0|              0.0|             1.0|          4.0|

## 3. READING SAVED MODEL

In [22]:
from pyspark.ml.classification import RandomForestClassificationModel

In [23]:
from pyspark.ml.pipeline import PipelineModel
# Directory the saved random forest model is loaded from.
mPath = "../model/"

# Load the saved model from that directory.
persistedModel = RandomForestClassificationModel.load(mPath)



##  4. Predict on the Reserved Demonstration Data

In [28]:
# Run the loaded model over the demonstration rows.
predictionsDF = persistedModel.transform(demoData)

# Show the HeartDisease and prediction columns side by side for 10 rows.
(
    predictionsDF
    .select("HeartDisease", "prediction")
    .show(n=10)
)

+------------+----------+
|HeartDisease|prediction|
+------------+----------+
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       1.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       1.0|
+------------+----------+
only showing top 10 rows



## 5. FIND OUT A PERSON'S HEART RISK
- Enter parameters and predict using model