In [1]:
from pyspark.sql import SparkSession
import sys
import re
import os
import warnings

In [2]:
spark = SparkSession.builder.appName("myapp").getOrCreate()

In [3]:
print(os.environ['SPARK_HOME'])
warnings.filterwarnings('ignore')

/opt/homebrew/Cellar/apache-spark/3.2.1/libexec


In [4]:
# Helper Method to create classification Report
def makeClassificationReport(metricsArray):
    TN = metricsArray[0][0]
    FN = metricsArray[1][0]
    FP = metricsArray[0][1]
    TP = metricsArray[1][1]
    Accuracy = (TP+TN)/(TP+FN+TN+FP)
    Precision = TP/(TP+FP)
    Recall = TP / (TP + FN)
    F1Score = 2*(Precision * Recall)/(Precision + Recall)
    print("Classification Report")
    print("Accuracy: ", Accuracy)
    print("Precision: ",Precision)
    print("Recall: ",Recall)
    print("F1- Score: ", F1Score)

## 1. READING DEMONSTRATION DATA 
- We are reading the cleaned dataset set prepared in previous Juypter Notebook
- Data is stored on S3 in following location:  s3://brfss-big-data-project/HeartRiskData/


In [5]:
# READ LOCAL DATA FILE
# Comment if reading from S3

# heartData = spark.read.csv("../../../BRFSS/HeartRiskData/", header='true',inferSchema='true')

In [6]:
# READ FROM S3 BUCKET
#Comment if reading locally
sc._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

heartData = spark.read.csv("s3a://brfss-big-data-project/HeartRiskData/", header = 'true',inferSchema='true')

22/06/02 18:55:57 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [7]:
from pyspark.ml.feature import VectorAssembler

numericCols = ['BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income']
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
df = assembler.transform(heartData)

# We are using the 10% data that was previously reserved (same seed)
modelData, demoData = df.randomSplit([0.9, 0.1], seed = 2018)
print("Demonstration Dataset Count: " + str(demoData.count()))

22/06/02 18:56:14 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Demonstration Dataset Count: 52052


                                                                                

In [8]:
df.show(1)

+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|HeartDisease|State|  BMI|HighBP|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender|Age|Education|Income|            features|
+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|         0.0| 22.0|23.91|   0.0|     0.0|      1.0|         1.0|             1.0|   1.0|         0.0|     0.0|   0.0|       0.0|              0.0|             1.0|          5.0|

In [9]:
df.printSchema()

root
 |-- HeartDisease: double (nullable = true)
 |-- State: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- FruitConsume: double (nullable = true)
 |-- VegetableConsume: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- HeavyDrinker: double (nullable = true)
 |-- Diabetes: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- Healthcare: double (nullable = true)
 |-- NoDoctorDueToCost: double (nullable = true)
 |-- PhysicalActivity: double (nullable = true)
 |-- GeneralHealth: double (nullable = true)
 |-- PhysicalHealth: double (nullable = true)
 |-- MentalHealth: double (nullable = true)
 |-- DifficultyWalking: double (nullable = true)
 |-- Gender: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)
 |-- features: vector (nullable =

## 3. READING SAVED MODEL

In [10]:
from pyspark.ml.classification import RandomForestClassificationModel

In [11]:
# READ LOCAL MODEL
# Comment if Reading from S3
# mPath = "../model/"
# persistedModel = RandomForestClassificationModel.load(mPath)

In [12]:
# READ MODEL FROM S3
# Comment if reading locally stored model
mPath = "s3a://brfss-big-data-project/model/"
persistedModel = RandomForestClassificationModel.load(mPath)


                                                                                

##  4. MAKING PREDICTIONS ON UNSEEN DEMONSTRATION DATA

In [13]:
# predict
predictions = persistedModel.transform(demoData)
predictions.select("HeartDisease", "prediction").show(10)

[Stage 14:>                                                         (0 + 1) / 1]

+------------+----------+
|HeartDisease|prediction|
+------------+----------+
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       1.0|
+------------+----------+
only showing top 10 rows



                                                                                

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))



Accuracy = 0.7861604556281628
Test Error = 0.21383954437183716




In [15]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = predictions.select(['prediction','HeartDisease']).withColumn('HeartDisease', F.col('HeartDisease').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','HeartDisease'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print("Confusuion Matrix:")
print(metrics.confusionMatrix().toArray())

                                                                                

Confusuion Matrix:
[[34303. 12882.]
 [ 1076.  3791.]]


In [16]:
makeClassificationReport(metrics.confusionMatrix().toArray())

Classification Report
Accuracy:  0.7318450779989242
Precision:  0.22737359803274756
Recall:  0.7789192521060201
F1- Score:  0.35199628597957294


## 5. MAKING PREDICTIONS BASED ON AN INDIVIUAL's DATA

In [17]:
from pyspark.ml.feature import VectorAssembler
numericCols = ['BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income']

#### 5.1 Predicting on person with poor health and habits

In [18]:
poorHealthPerson = [{ "HeartDisease" : -1, "State" : 22 , "BMI" : 35, "HighBP" : 1, "HighChol" : 1, "CholCheck" :1, \
             "FruitConsume" : 0, "VegetableConsume" : 0 , "Smoker" : 1 , "HeavyDrinker" : 1, \
                    "Diabetes" : 1 , "Stroke" : 1 , "Healthcare": 0 , "NoDoctorDueToCost" : 1, \
                    "PhysicalActivity" : 0 , "GeneralHealth": 1 ,"PhysicalHealth" : 20 , "MentalHealth" : 5 ,\
                    "DifficultyWalking" : 1 , "Gender" : 1 ,"Age" : 10 , "Education" : 2 , "Income": 1 }]
             
dfPoorHealth = spark.createDataFrame(poorHealthPerson)

assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
poorHealthSample= assembler.transform(dfPoorHealth)

In [19]:
predictions = persistedModel.transform(poorHealthSample)
predictions.select("prediction").show()

+----------+
|prediction|
+----------+
|       1.0|
+----------+



#### 5.2 Predicting on person with good health and habits

In [20]:
goodHealthPerson = [{ "HeartDisease" : -1, "State" : 22 , "BMI" : 20, "HighBP" : 0, "HighChol" : 0, "CholCheck" :0, \
             "FruitConsume" : 1, "VegetableConsume" : 1 , "Smoker" : 0 , "HeavyDrinker" : 0, \
                    "Diabetes" : 0 , "Stroke" : 1 , "Healthcare": 1 , "NoDoctorDueToCost" : 0, \
                    "PhysicalActivity" : 1 , "GeneralHealth": 4 ,"PhysicalHealth" : 3 , "MentalHealth" : 1 ,\
                    "DifficultyWalking" : 0 , "Gender" : 0 ,"Age" : 10 , "Education" : 2 , "Income": 1 }]
             
dfGoodHealth = spark.createDataFrame(goodHealthPerson)
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
goodHealthSample= assembler.transform(dfGoodHealth)

In [21]:
predictions = persistedModel.transform(goodHealthSample)
predictions.select("prediction").show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+



#### 5.3 Enter your stats to predict

In [25]:
BMI = 21
HighBP = 0
HighChol = 0
CholCheck = 0
FruitConsume = 0
VegetableConsume = 1
Smoker = 0
HeavyDrinker = 0
Diabetes = 0 
Stroke = 0
Healthcare = 1
NoDoctorDueToCost = 0
PhysicalActivity = 0
GeneralHealth = 4
PhysicalHealth = 2
MentalHealth = 0
DifficultyWalking = 0
Gender = 0
Age = 3
Education = 4
Education = 4

In [26]:
yourData = [{ "HeartDisease" : -1, "State" : 22 , "BMI" : BMI, "HighBP" : HighBP, "HighChol" : HighChol, "CholCheck" :CholCheck, \
             "FruitConsume" : FruitConsume, "VegetableConsume" : VegetableConsume , "Smoker" : Smoker , "HeavyDrinker" : HeavyDrinker, \
                    "Diabetes" : Diabetes , "Stroke" : Stroke , "Healthcare": Healthcare , "NoDoctorDueToCost" : NoDoctorDueToCost, \
                    "PhysicalActivity" : PhysicalActivity , "GeneralHealth": GeneralHealth ,"PhysicalHealth" : PhysicalHealth , "MentalHealth" : MentalHealth ,\
                    "DifficultyWalking" : DifficultyWalking , "Gender" : Gender,"Age" : Age , "Education" : Education , "Income": Education }]
             
dfYourData = spark.createDataFrame(yourData)
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
yourSample= assembler.transform(dfYourData)

In [27]:
predictions = persistedModel.transform(yourSample)
print( "Prediction based on your data are as follows: ")
predictions.select("prediction").show()

Prediction based on your data are as follows: 
+----------+
|prediction|
+----------+
|       0.0|
+----------+



# ---- END ---