In [None]:
from pyspark.sql import SparkSession
import sys
import re
import os
import warnings

In [None]:
# Reuse the active Spark session if one exists, otherwise start one named "myapp".
spark = (
    SparkSession.builder
    .appName("myapp")
    .getOrCreate()
)

In [None]:
# Show which Spark installation is in use. The Spark session has already been
# created by this point, so a missing SPARK_HOME is reported instead of
# aborting the notebook run with a KeyError.
print(os.environ.get('SPARK_HOME', 'SPARK_HOME is not set'))
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Helper Method to create classification Report
def makeClassificationReport(metricsArray):
    """Print accuracy, precision, recall and F1 score for a 2x2 confusion matrix.

    Parameters
    ----------
    metricsArray : 2x2 indexable (nested list or array)
        Read as ``[[TN, FP], [FN, TP]]``: the row index is the actual class and
        the column index is the predicted class, with the negative class first.

    Returns
    -------
    None
        The report is written to stdout.

    Notes
    -----
    Any ratio whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of raising
    ZeroDivisionError or yielding NaN.
    """
    def _ratio(numerator, denominator):
        # Guard against empty classes / no positive predictions.
        return numerator / denominator if denominator else 0.0

    TN = metricsArray[0][0]
    FN = metricsArray[1][0]
    FP = metricsArray[0][1]
    TP = metricsArray[1][1]

    Accuracy = _ratio(TP + TN, TP + FN + TN + FP)
    Precision = _ratio(TP, TP + FP)
    Recall = _ratio(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall.
    F1Score = _ratio(2 * (Precision * Recall), Precision + Recall)

    print("Classification Report")
    print("Accuracy: ", Accuracy)
    print("Precision: ",Precision)
    print("Recall: ",Recall)
    print("F1- Score: ", F1Score)

## 1. READING DEMONSTRATION DATA 
- We read the cleaned dataset prepared in the previous Jupyter notebook.
- The data is stored on S3 at the following location: s3://brfss-big-data-project/HeartRiskData/


In [None]:
# READ LOCAL DATA FILE
# Comment if reading from S3

# heartData = spark.read.csv("../../../BRFSS/HeartRiskData/", header='true',inferSchema='true')

In [None]:
# READ FROM S3 BUCKET
#Comment if reading locally
# `sc` is not assigned anywhere else in this notebook, so take the SparkContext
# from the session created above; otherwise a fresh kernel fails with NameError.
sc = spark.sparkContext
hadoopConf = sc._jsc.hadoopConfiguration()

# Route s3a:// URLs through the S3A filesystem implementation.
hadoopConf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("com.amazonaws.services.s3.enableV4", "true")
# Anonymous credentials provider: presumably the bucket is publicly readable -- confirm.
hadoopConf.set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# Read every CSV under the prefix, taking column names from the header row
# and letting Spark infer the column types.
heartData = spark.read.csv("s3a://brfss-big-data-project/HeartRiskData/", header = 'true',inferSchema='true')

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single "features" vector column consumed by the model.
numericCols = [
    'BMI', 'HighChol', 'CholCheck', 'FruitConsume', 'VegetableConsume',
    'Smoker', 'HeavyDrinker', 'Diabetes', 'Stroke', 'Healthcare',
    'NoDoctorDueToCost', 'PhysicalActivity', 'GeneralHealth', 'PhysicalHealth',
    'MentalHealth', 'DifficultyWalking', 'Gender', 'Age', 'Education', 'Income',
]
assembler = VectorAssembler(outputCol="features", inputCols=numericCols)
df = assembler.transform(heartData)

# 90/10 split with a fixed seed; only the 10% part (demoData) is used below.
# Intended to reproduce the hold-out reserved earlier -- assumes the same seed
# and input data as the training notebook; TODO confirm.
modelData, demoData = df.randomSplit(weights=[0.9, 0.1], seed=2018)
print("Demonstration Dataset Count: " + str(demoData.count()))

In [None]:
# Display one assembled row (input columns plus the "features" vector) as a sanity check.
df.show(1)

In [None]:
# Print the column names and types of the assembled DataFrame.
df.printSchema()

## 3. READING SAVED MODEL

In [None]:
from pyspark.ml.classification import RandomForestClassificationModel

In [None]:
# READ LOCAL MODEL
# Comment if Reading from S3
# mPath = "../model/"
# persistedModel = RandomForestClassificationModel.load(mPath)

In [None]:
# READ MODEL FROM S3
# Comment if reading locally stored model
# S3 location the saved random-forest model is loaded from.
mPath = "s3a://brfss-big-data-project/model/"
# Load the saved model; every prediction in the rest of the notebook uses it.
persistedModel = RandomForestClassificationModel.load(mPath)


##  4. MAKING PREDICTIONS ON UNSEEN DEMONSTRATION DATA

In [None]:
# Score the demonstration set with the loaded model.
predictions = persistedModel.transform(demoData)
# Show the actual label next to the predicted one for the first 10 rows.
predictions.select("HeartDisease", "prediction").show(10)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is "f1", so
# without it the number printed below as "Accuracy" would be the F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="HeartDisease", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# MulticlassMetrics consumes an RDD of (prediction, label) pairs, so select the
# two columns in that order and cast the label to a floating-point type.
# No sort is needed: the confusion matrix does not depend on row order.
preds_and_labels = predictions.select(
    'prediction',
    F.col('HeartDisease').cast(FloatType()).alias('HeartDisease'),
)
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Per the MulticlassMetrics documentation, predicted classes are in columns,
# ordered by ascending class label.
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

In [None]:
# Print accuracy, precision, recall and F1 derived from the confusion matrix.
makeClassificationReport(metrics.confusionMatrix().toArray())

## 5. MAKING PREDICTIONS BASED ON AN INDIVIDUAL'S DATA

In [None]:
from pyspark.ml.feature import VectorAssembler
# Feature columns, in the order they are packed into the "features" vector.
# NOTE(review): the example records in this section also carry "HighBP", which
# is not listed here -- confirm this matches the features the saved model was
# trained on.
numericCols = [
    'BMI', 'HighChol', 'CholCheck', 'FruitConsume', 'VegetableConsume',
    'Smoker', 'HeavyDrinker', 'Diabetes', 'Stroke', 'Healthcare',
    'NoDoctorDueToCost', 'PhysicalActivity', 'GeneralHealth', 'PhysicalHealth',
    'MentalHealth', 'DifficultyWalking', 'Gender', 'Age', 'Education', 'Income',
]

#### 5.1 Predicting on person with poor health and habits

In [None]:
# One hand-written record for a person with poor health and habits.
# "HeartDisease" is -1, presumably a placeholder since only the prediction is shown.
# NOTE(review): GeneralHealth is 1 here but 4 in the good-health example; if the
# cleaned data keeps a 1 = best ... 5 = worst scale these look swapped -- confirm.
poorHealthPerson = [{
    "HeartDisease": -1,
    "State": 22,
    "BMI": 35,
    "HighBP": 1,
    "HighChol": 1,
    "CholCheck": 1,
    "FruitConsume": 0,
    "VegetableConsume": 0,
    "Smoker": 1,
    "HeavyDrinker": 1,
    "Diabetes": 1,
    "Stroke": 1,
    "Healthcare": 0,
    "NoDoctorDueToCost": 1,
    "PhysicalActivity": 0,
    "GeneralHealth": 1,
    "PhysicalHealth": 20,
    "MentalHealth": 5,
    "DifficultyWalking": 1,
    "Gender": 1,
    "Age": 10,
    "Education": 2,
    "Income": 1,
}]

dfPoorHealth = spark.createDataFrame(poorHealthPerson)

# Pack the model's input columns into the "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=numericCols)
poorHealthSample = assembler.transform(dfPoorHealth)

In [None]:
# Score the single poor-health record and show the predicted class.
# This rebinds `predictions`, so the name no longer refers to the demonstration-set results.
predictions = persistedModel.transform(poorHealthSample)
predictions.select("prediction").show()

#### 5.2 Predicting on person with good health and habits

In [None]:
# One hand-written record for a person with good health and habits.
# "HeartDisease" is -1, presumably a placeholder since only the prediction is shown.
# NOTE(review): "Stroke" is 1 in this good-health example -- possibly a
# copy-paste slip from the poor-health record; confirm the intended value.
goodHealthPerson = [{
    "HeartDisease": -1,
    "State": 22,
    "BMI": 20,
    "HighBP": 0,
    "HighChol": 0,
    "CholCheck": 0,
    "FruitConsume": 1,
    "VegetableConsume": 1,
    "Smoker": 0,
    "HeavyDrinker": 0,
    "Diabetes": 0,
    "Stroke": 1,
    "Healthcare": 1,
    "NoDoctorDueToCost": 0,
    "PhysicalActivity": 1,
    "GeneralHealth": 4,
    "PhysicalHealth": 3,
    "MentalHealth": 1,
    "DifficultyWalking": 0,
    "Gender": 0,
    "Age": 10,
    "Education": 2,
    "Income": 1,
}]

dfGoodHealth = spark.createDataFrame(goodHealthPerson)

# Pack the model's input columns into the "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=numericCols)
goodHealthSample = assembler.transform(dfGoodHealth)

In [None]:
# Score the single good-health record and show the predicted class.
# This rebinds `predictions`, replacing the previous result held under that name.
predictions = persistedModel.transform(goodHealthSample)
predictions.select("prediction").show()

#### 5.3 Enter your stats to predict

In [None]:
BMI = 21
HighBP = 0
HighChol = 0
CholCheck = 0
c = 0
VegetableConsume = 1
Smoker = 0
HeavyDrinker = 0
Diabetes = 0 
Stroke = 0
Healthcare = 1
NoDoctorDueToCost = 0
PhysicalActivity = 0
GeneralHealth = 4
PhysicalHealth = 2
MentalHealth = 0
DifficultyWalking = 0
Gender = 0
Age = 3
Education = 4
Education = 4

In [None]:
yourData = [{ "HeartDisease" : -1, "State" : 22 , "BMI" : BMI, "HighBP" : HighBP, "HighChol" : HighChol, "CholCheck" :CholCheck, \
             "FruitConsume" : CholCheck, "VegetableConsume" : VegetableConsume , "Smoker" : Smoker , "HeavyDrinker" : HeavyDrinker, \
                    "Diabetes" : Diabetes , "Stroke" : Stroke , "Healthcare": Healthcare , "NoDoctorDueToCost" : NoDoctorDueToCost, \
                    "PhysicalActivity" : PhysicalActivity , "GeneralHealth": GeneralHealth ,"PhysicalHealth" : PhysicalHealth , "MentalHealth" : MentalHealth ,\
                    "DifficultyWalking" : DifficultyWalking , "Gender" : Gender,"Age" : Age , "Education" : Education , "Income": Education }]
             
dfYourData = spark.createDataFrame(yourData)
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
yourSample= assembler.transform(dfYourData)

In [None]:
# Score the record built from the values entered above and show the predicted class.
# This rebinds `predictions`, replacing the previous result held under that name.
predictions = persistedModel.transform(yourSample)
print( "Prediction based on your data are as follows: ")
predictions.select("prediction").show()

# ---- END ---