In [1]:
from pyspark.sql import SparkSession
import sys
import re
import os
import warnings

In [2]:
spark = SparkSession.builder.appName("myapp").getOrCreate()

In [3]:
print(os.environ['SPARK_HOME'])
warnings.filterwarnings('ignore')

/opt/homebrew/Cellar/apache-spark/3.2.1/libexec


## 1. READING CLEANED DATA 
- We are reading the cleaned dataset set prepared in previous Juypter Notebook
- Data is stored on S3 in following location:  s3://brfss-big-data-project/HeartRiskData/


In [4]:
# READ LOCAL DATA FILE
# Comment if reading from S3

heartData = spark.read.csv("../../../BRFSS/HeartRiskData/", header='true',inferSchema='true')

[Stage 1:>                                                          (0 + 8) / 8]                                                                                

In [5]:
# READ FROM S3 BUCKET
#Comment if reading locally
# sc._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
# sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
# sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# heartData = spark.read.csv("s3a://brfss-big-data-project/HeartRiskData/", header = 'true',inferSchema='true')

In [6]:
heartData.printSchema()

root
 |-- HeartDisease: double (nullable = true)
 |-- State: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- FruitConsume: double (nullable = true)
 |-- VegetableConsume: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- HeavyDrinker: double (nullable = true)
 |-- Diabetes: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- Healthcare: double (nullable = true)
 |-- NoDoctorDueToCost: double (nullable = true)
 |-- PhysicalActivity: double (nullable = true)
 |-- GeneralHealth: double (nullable = true)
 |-- PhysicalHealth: double (nullable = true)
 |-- MentalHealth: double (nullable = true)
 |-- DifficultyWalking: double (nullable = true)
 |-- Gender: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)



In [7]:
heartData.show(1)

+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+
|HeartDisease|State|  BMI|HighBP|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender|Age|Education|Income|
+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+
|         0.0| 22.0|23.91|   0.0|     0.0|      1.0|         1.0|             1.0|   1.0|         0.0|     0.0|   0.0|       0.0|              0.0|             1.0|          5.0|           0.0|         2.0|              0.0|   0.0|4.0|      

In [8]:
heartData.select(['HeartDisease','BMI']).show(5)

+------------+-----+
|HeartDisease|  BMI|
+------------+-----+
|         0.0|23.91|
|         1.0|39.15|
|         0.0|33.36|
|         0.0|29.84|
|         0.0|24.82|
+------------+-----+
only showing top 5 rows



In [9]:
print("Dimensions of the Data Frame:")
print((heartData.count(), len(heartData.columns)))

Dimensions of the Data Frame:
(519171, 23)


Note: We can see that after cleaning we have 0.5 million data points across 22 features with the target "HeartDisease" that indicates if the person is either suffering from heart disease or has had a heart attack. 
0 = No 
1 = Yes

## 2. EXPLORATORY DATA ANALYSIS

In [10]:
counts = heartData.groupBy('HeartDisease').count().collect()
total_counts = heartData.count()

In [11]:
print("Percentage not having any heart issues" )
print(counts[0][1]/total_counts*100)
print("Percentage havingheart issues" )
print(counts[1][1]/total_counts*100)

Percentage not having any heart issues
90.79494039536107
Percentage havingheart issues
9.205059604638933


Note: The data is imbalanced only 9.2% has heart issues ... so we may need to either oversample or undersample when training our models. Additionally we will have to consider measures other than just accuracy to judge our model performance. Precision, Recall and F1 score will have to be considered. 

#### 2.1 Descriptive Analysis

In [12]:
heartData.describe(['BMI','HighChol','CholCheck','FruitConsume','VegetableConsume']).show()

+-------+-----------------+-------------------+-------------------+-------------------+-------------------+
|summary|              BMI|           HighChol|          CholCheck|       FruitConsume|   VegetableConsume|
+-------+-----------------+-------------------+-------------------+-------------------+-------------------+
|  count|           519171|             519171|             519171|             519171|             519171|
|   mean|28.56327531776717| 0.3915029922703695| 0.9600959991987226| 0.6461416373410688|  0.833698338312425|
| stddev|6.330915524634669|0.48808693711025664|0.19573386348664162|0.47816635414421166|0.37235156245084894|
|    min|             12.0|                0.0|                0.0|                0.0|                0.0|
|    max|             98.7|                1.0|                1.0|                1.0|                1.0|
+-------+-----------------+-------------------+-------------------+-------------------+-------------------+



In [13]:
heartData.describe(['Smoker','HeavyDrinker','Diabetes','Stroke']).show()

+-------+-------------------+-------------------+-------------------+-------------------+
|summary|             Smoker|       HeavyDrinker|           Diabetes|             Stroke|
+-------+-------------------+-------------------+-------------------+-------------------+
|  count|             519171|             519171|             519171|             519171|
|   mean|     0.431405066924|0.06193912988206198|0.17125571343545767|0.04223656560169963|
| stddev|0.49527286179548236|0.24104519490348594|0.37673262060037704|0.20112860573716723|
|    min|                0.0|                0.0|                0.0|                0.0|
|    max|                1.0|                1.0|                1.0|                1.0|
+-------+-------------------+-------------------+-------------------+-------------------+



In [14]:
heartData.describe(['Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth']).show()

+-------+-------------------+-------------------+------------------+------------------+
|summary|         Healthcare|  NoDoctorDueToCost|  PhysicalActivity|     GeneralHealth|
+-------+-------------------+-------------------+------------------+------------------+
|  count|             519171|             519171|            519171|            519171|
|   mean| 0.9390759499278658|0.09431189338387545|0.7490788198878597|3.4484206552369066|
| stddev|0.23919117959178865|0.29226242433231586|0.4335436581501136|1.0641187958099112|
|    min|                0.0|                0.0|               0.0|               1.0|
|    max|                1.0|                1.0|               1.0|               5.0|
+-------+-------------------+-------------------+------------------+------------------+



In [15]:
heartData.describe(['PhysicalHealth','MentalHealth','DifficultyWalking']).show()

+-------+-----------------+------------------+-------------------+
|summary|   PhysicalHealth|      MentalHealth|  DifficultyWalking|
+-------+-----------------+------------------+-------------------+
|  count|           519171|            519171|             519171|
|   mean|4.361489759636036|3.5516814305883804|0.16746120257102187|
| stddev|8.792808096235884| 7.733453215085699|0.37338748873597627|
|    min|              0.0|               0.0|                0.0|
|    max|             30.0|              30.0|                1.0|
+-------+-----------------+------------------+-------------------+



In [16]:
heartData.describe(['Gender','Age','Education','Income']).show()

+-------+-------------------+------------------+------------------+-----------------+
|summary|             Gender|               Age|         Education|           Income|
+-------+-------------------+------------------+------------------+-----------------+
|  count|             519171|            519171|            519171|           519171|
|   mean|0.46337911786290065|7.9175435453829275| 5.063150676751976|6.083748899688157|
| stddev| 0.4986575878758885| 3.241323105698738|0.9768962159188829|2.073786990025714|
|    min|                0.0|               1.0|               1.0|              1.0|
|    max|                1.0|              13.0|               6.0|              8.0|
+-------+-------------------+------------------+------------------+-----------------+



Notes: Most of the data is boolean or binned like for Age. The data is quite consistent . BMI max is at 98.7 which is  large but is a possible value so we will not remove such values.

#### 2.2 Correlation Analysis

In [17]:
from pyspark.ml.stat import Correlation

In [18]:
colNames = heartData.columns

In [19]:
# This cell takes a lot of time 
corrList = []
for col in colNames:
    pearsonCorr = heartData.corr('HeartDisease',col)
    corrList.append(("BMI - "+col,pearsonCorr))

In [20]:
corrList

[('BMI - HeartDisease', 1.0),
 ('BMI - State', 0.006444321393978534),
 ('BMI - BMI', 0.05058519245234513),
 ('BMI - HighBP', 0.2098579059479823),
 ('BMI - HighChol', 0.1859344957309097),
 ('BMI - CholCheck', 0.04574915829500349),
 ('BMI - FruitConsume', -0.012932002071759112),
 ('BMI - VegetableConsume', -0.023949296306889675),
 ('BMI - Smoker', 0.11680965215675526),
 ('BMI - HeavyDrinker', -0.03037897687609935),
 ('BMI - Diabetes', 0.17086995852514197),
 ('BMI - Stroke', 0.1990394390425505),
 ('BMI - Healthcare', 0.027174079890363554),
 ('BMI - NoDoctorDueToCost', 0.026098505344340245),
 ('BMI - PhysicalActivity', -0.08797288374111345),
 ('BMI - GeneralHealth', -0.2507726735558453),
 ('BMI - PhysicalHealth', 0.18450992906634142),
 ('BMI - MentalHealth', 0.059861227400692135),
 ('BMI - DifficultyWalking', 0.20788019512527584),
 ('BMI - Gender', 0.080903183631275),
 ('BMI - Age', 0.21882047127416504),
 ('BMI - Education', -0.08831463775581311),
 ('BMI - Income', -0.1298266025074565)]

## 3. SETUP DATA FOR CLASSIFICATION MODELS

In [21]:
# Helper Method to create classification Report
def makeClassificationReport(metricsArray):
    TN = metricsArray[0][0]
    FN = metricsArray[0][1]
    FP = metricsArray[1][0]
    TP = metricsArray[1][1]
    Accuracy = (TP+TN)/(TP+FN+TN+FP)
    Precision = TP/(TP+FP)
    Recall = TP / (TP + FN)
    F1Score = 2*(Precision * Recall)/(Precision + Recall)
    print("Classification Report")
    print("Accuracy: ", Accuracy)
    print("Precision: ",Precision)
    print("Recall: ",Recall)
    print("F1- Score: ", F1Score)

In [22]:
from pyspark.ml.feature import VectorAssembler

numericCols = ['BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income']
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
df = assembler.transform(heartData)
df.show(2)

+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|HeartDisease|State|  BMI|HighBP|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender|Age|Education|Income|            features|
+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+---+---------+------+--------------------+
|         0.0| 22.0|23.91|   0.0|     0.0|      1.0|         1.0|             1.0|   1.0|         0.0|     0.0|   0.0|       0.0|              0.0|             1.0|          5.0|

22/06/02 00:06:31 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [23]:
# We first reserve 10% of the data to use for demonstarating the model we have developed
# This will be completely unseen during training and selecting the model
modelData, demoData = df.randomSplit([0.9, 0.1], seed = 2018)
print("Training Dataset Count: " + str(modelData.count()))
print("Test Dataset Count: " + str(demoData.count()))

                                                                                

Training Dataset Count: 467119


[Stage 78:>                                                         (0 + 8) / 8]

Test Dataset Count: 52052


                                                                                

In [24]:
# We now do a train test split on the modelData only
train, test = modelData.randomSplit([0.7, 0.3], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

                                                                                

Training Dataset Count: 326889


[Stage 84:>                                                         (0 + 8) / 8]

Test Dataset Count: 140230


                                                                                

## 4. CLASSIFICATION MODEL 1 : NO OVERSAMPLING O UNDER SAMPLING 

#### 4.1 Random Forest Classifier

In [25]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'HeartDisease')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.select('BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income', \
               'HeartDisease', 'rawPrediction', 'prediction', 'probability').show(2)


                                                                                

+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|  BMI|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender| Age|Education|Income|HeartDisease|       rawPrediction|prediction|         probability|
+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|16.47|     0.0|      1.0|         1.0|             1.0|   0.0|         0.0|     0.0|   0.0|       1.0|              0

#### 4.2 Classification Metrics

In [26]:
predictions.select("HeartDisease", "prediction").show(5)

+------------+----------+
|HeartDisease|prediction|
+------------+----------+
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
+------------+----------+
only showing top 5 rows



In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

[Stage 106:>                                                        (0 + 8) / 8]

Accuracy = 0.8639300477858837
Test Error = 0.13606995221411633




In [28]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = predictions.select(['prediction','HeartDisease']).withColumn('HeartDisease', F.col('HeartDisease').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','HeartDisease'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

                                                                                

[[127301.      0.]
 [ 12929.      0.]]


In [29]:
makeClassificationReport(metrics.confusionMatrix().toArray())

Classification Report
Accuracy:  0.9078014690151893
Precision:  0.0
Recall:  nan
F1- Score:  nan


## 5. CLASSIFICATION MODEL WITH OVERSAMPLING

#### 5.1 Code to oversample the training data

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, explode, array, lit

In [31]:
major_train = train.filter(col("HeartDisease") == 0)
minor_train = train.filter(col("HeartDisease") == 1)

ratio = int(major_train.count()/minor_train.count())
print("ratio: {}".format(ratio))

[Stage 118:>                                                        (0 + 8) / 8]

ratio: 9




In [32]:
a = range(ratio)

# duplicate the minority rows
oversampled_train = minor_train.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')

In [33]:
# combine both oversampled minority rows and previous majority rows 
combined_train = major_train.unionAll(oversampled_train)
combined_train.show(2)

+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+--------------------+
|HeartDisease|State|  BMI|HighBP|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender| Age|Education|Income|            features|
+------------+-----+-----+------+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+--------------------+
|         0.0| 22.0|16.73|   0.0|     0.0|      1.0|         1.0|             1.0|   1.0|         0.0|     1.0|   0.0|       1.0|              0.0|             1.0|          1

In [34]:
# Counts after oversampling the training data
counts = combined_train.groupBy('HeartDisease').count().collect()
total_counts = combined_train.count()


print("Total Training Data size",combined_train.count())
print("Percentage not having any heart issues" )
print(counts[0][1]/total_counts*100)
print("Percentage havingheart issues" )
print(counts[1][1]/total_counts*100)

22/06/02 00:06:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:06:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:06:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:06:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:06:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Total Training Data size 566841
Percentage not having any heart issues
52.377121626699555
Percentage havingheart issues
47.622878373300445




#### 5.2 Random Forest Classifier

In [35]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'HeartDisease')
rfModel = rf.fit(combined_train)
predictions = rfModel.transform(test)
predictions.select('BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income', \
               'HeartDisease', 'rawPrediction', 'prediction', 'probability').show(2)


[Stage 148:>                                                        (0 + 1) / 1]

+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|  BMI|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender| Age|Education|Income|HeartDisease|       rawPrediction|prediction|         probability|
+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|16.47|     0.0|      1.0|         1.0|             1.0|   0.0|         0.0|     0.0|   0.0|       1.0|              0

                                                                                

#### 5.3 Classification Metrics

In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

[Stage 149:>                                                        (0 + 8) / 8]

Accuracy = 0.7946650039795793
Test Error = 0.2053349960204207




In [37]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = predictions.select(['prediction','HeartDisease']).withColumn('HeartDisease', F.col('HeartDisease').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','HeartDisease'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print("Confusuion Matrix:")
print(metrics.confusionMatrix().toArray())

                                                                                

Confusuion Matrix:
[[94241. 33060.]
 [ 3015.  9914.]]


In [38]:
makeClassificationReport(metrics.confusionMatrix().toArray())

Classification Report
Accuracy:  0.7427440633245382
Precision:  0.7668033103875009
Recall:  0.230697631125797
F1- Score:  0.35468579503783343


## 6. CLASSIFICATION MODEL WITH UNDERSAMPLING

#### 6.1 Code to undersample the training data

In [39]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, explode, array, lit

In [40]:
major_train = train.filter(col("HeartDisease") == 0)
minor_train = train.filter(col("HeartDisease") == 1)

ratio = int(major_train.count()/minor_train.count())
print("ratio: {}".format(ratio))



ratio: 9


                                                                                

In [41]:
sampled_majority_df = major_train.sample(False, 1/ratio)
combined_train = sampled_majority_df.unionAll(minor_train)

In [42]:
#Counts after undersampling the training data
counts = combined_train.groupBy('HeartDisease').count().collect()
total_counts = combined_train.count()

print("Total Training dataset Size", combined_train.count())
print("Percentage not having any heart issues" )
print(counts[0][1]/total_counts*100)
print("Percentage havingheart issues" )
print(counts[1][1]/total_counts*100)

22/06/02 00:07:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:07:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:07:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:07:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:07:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/06/02 00:07:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Total Training dataset Size 63050
Percentage not having any heart issues
52.42823156225218
Percentage havingheart issues
47.57176843774782




#### 6.2 Random Forest Classifier

In [43]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'HeartDisease')
rfModel = rf.fit(combined_train)
predictions = rfModel.transform(test)
predictions.select('BMI','HighChol','CholCheck','FruitConsume','VegetableConsume','Smoker','HeavyDrinker', \
               'Diabetes','Stroke','Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth', \
               'PhysicalHealth','MentalHealth','DifficultyWalking','Gender','Age','Education','Income', \
               'HeartDisease', 'rawPrediction', 'prediction', 'probability').show(2)


                                                                                

+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|  BMI|HighChol|CholCheck|FruitConsume|VegetableConsume|Smoker|HeavyDrinker|Diabetes|Stroke|Healthcare|NoDoctorDueToCost|PhysicalActivity|GeneralHealth|PhysicalHealth|MentalHealth|DifficultyWalking|Gender| Age|Education|Income|HeartDisease|       rawPrediction|prediction|         probability|
+-----+--------+---------+------------+----------------+------+------------+--------+------+----------+-----------------+----------------+-------------+--------------+------------+-----------------+------+----+---------+------+------------+--------------------+----------+--------------------+
|16.47|     0.0|      1.0|         1.0|             1.0|   0.0|         0.0|     0.0|   0.0|       1.0|              0

#### 6.3 Classification Metrics

In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.786706714861117
Test Error = 0.21329328513888302




In [45]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = predictions.select(['prediction','HeartDisease']).withColumn('HeartDisease', F.col('HeartDisease').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','HeartDisease'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print("Confusuion Matrix:")
print(metrics.confusionMatrix().toArray())

[Stage 193:>                                                        (0 + 8) / 8]                                                                                

Confusuion Matrix:
[[92461. 34840.]
 [ 2771. 10158.]]


In [46]:
makeClassificationReport(metrics.confusionMatrix().toArray())

Classification Report
Accuracy:  0.7317906296798118
Precision:  0.7856756129631062
Recall:  0.22574336637183876
F1- Score:  0.3507172820964317


In [47]:
# save best model to specified path
# Comment after running once
# mPath =  "../model/"
#rfModel.write().overwrite().save(mPath)

### Note: We are choosing the Model with Undersampled since it has slightly better metrics. 