In [1]:
from pyspark.sql import SparkSession
import sys
import re
import os

In [2]:
spark = SparkSession.builder.appName("myapp").getOrCreate()

In [3]:
print(os.environ['SPARK_HOME'])

/opt/homebrew/Cellar/apache-spark/3.2.1/libexec


## 1. READING CLEANED DATA 
- We are reading the cleaned dataset set prepared in previous Juypter Notebook
- Data is stored on S3 in following location:  s3://brfss-big-data-project/HeartRiskData/


In [4]:
# READ LOCAL DATA FILE
# Comment if reading from S3

heartData = spark.read.csv("../../../BRFSS/HeartRiskData/", header='true',inferSchema='true')

                                                                                

In [8]:
# READ FROM S3 BUCKET
#Comment if reading locally
# sc._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
# sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
# sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# heartData = spark.read.csv("s3a://brfss-big-data-project/HeartRiskData/", header = 'true',inferSchema='true')

                                                                                

In [10]:
heartData.printSchema()

root
 |-- HeartDisease: double (nullable = true)
 |-- State: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- FruitConsume: double (nullable = true)
 |-- VegetableConsume: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- HeavyDrinker: double (nullable = true)
 |-- Diabetes: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- Healthcare: double (nullable = true)
 |-- NoDoctorDueToCost: double (nullable = true)
 |-- PhysicalActivity: double (nullable = true)
 |-- GeneralHealth: double (nullable = true)
 |-- PhysicalHealth: double (nullable = true)
 |-- MentalHealth: double (nullable = true)
 |-- DifficultyWalking: double (nullable = true)
 |-- Gender: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)



In [12]:
heartData.select(['HeartDisease','BMI']).show(5)

+------------+-----+
|HeartDisease|  BMI|
+------------+-----+
|         0.0|23.91|
|         1.0|39.15|
|         0.0|33.36|
|         0.0|29.84|
|         0.0|24.82|
+------------+-----+
only showing top 5 rows



In [13]:
print("Dimensions of the Data Frame:")
print((heartData.count(), len(heartData.columns)))

Dimensions of the Data Frame:




(519171, 23)


                                                                                

Note: We can see that after cleaning we have 0.5 million data points across 22 features with the target "HeartDisease" that indicates if the person is either suffering from heart disease or has had a heart attack. 
0 = No 
1 = Yes

## 2. EXPLORATORY DATA ANALYSIS

In [27]:
counts = heartData.groupBy('HeartDisease').count().collect()
total_counts = heartData.count()

                                                                                

In [29]:
print("Percentage not having any heart issues" )
print(counts[0][1]/total_counts*100)
print("Percentage havingheart issues" )
print(counts[1][1]/total_counts*100)

Percentage not having any heart issues
90.79494039536107
Percentage havingheart issues
9.205059604638933


Note: The data is imbalanced only 9.2% has heart issues ... so we may need to either oversample or undersample when training our models. Additionally we will have to consider measures other than just accuracy to judge our model performance. Precision, Recall and F1 score will have to be considered. 

#### 2.1 Descriptive Analysis

In [34]:
heartData.describe(['BMI','HighChol','CholCheck','FruitConsume','VegetableConsume']).show()



+-------+-----------------+-------------------+-------------------+-------------------+-------------------+
|summary|              BMI|           HighChol|          CholCheck|       FruitConsume|   VegetableConsume|
+-------+-----------------+-------------------+-------------------+-------------------+-------------------+
|  count|           519171|             519171|             519171|             519171|             519171|
|   mean|28.56327531776717| 0.3915029922703695| 0.9600959991987226| 0.6461416373410688|  0.833698338312425|
| stddev|6.330915524634669|0.48808693711025664|0.19573386348664162|0.47816635414421166|0.37235156245084894|
|    min|             12.0|                0.0|                0.0|                0.0|                0.0|
|    max|             98.7|                1.0|                1.0|                1.0|                1.0|
+-------+-----------------+-------------------+-------------------+-------------------+-------------------+



                                                                                

In [35]:
heartData.describe(['Smoker','HeavyDrinker','Diabetes','Stroke']).show()



+-------+-------------------+-------------------+-------------------+-------------------+
|summary|             Smoker|       HeavyDrinker|           Diabetes|             Stroke|
+-------+-------------------+-------------------+-------------------+-------------------+
|  count|             519171|             519171|             519171|             519171|
|   mean|     0.431405066924|0.06193912988206198|0.17125571343545767|0.04223656560169963|
| stddev|0.49527286179548236|0.24104519490348594|0.37673262060037704|0.20112860573716723|
|    min|                0.0|                0.0|                0.0|                0.0|
|    max|                1.0|                1.0|                1.0|                1.0|
+-------+-------------------+-------------------+-------------------+-------------------+



                                                                                

In [36]:
heartData.describe(['Healthcare','NoDoctorDueToCost','PhysicalActivity','GeneralHealth']).show()



+-------+-------------------+-------------------+------------------+------------------+
|summary|         Healthcare|  NoDoctorDueToCost|  PhysicalActivity|     GeneralHealth|
+-------+-------------------+-------------------+------------------+------------------+
|  count|             519171|             519171|            519171|            519171|
|   mean| 0.9390759499278658|0.09431189338387545|0.7490788198878597|3.4484206552369066|
| stddev|0.23919117959178865|0.29226242433231586|0.4335436581501136|1.0641187958099112|
|    min|                0.0|                0.0|               0.0|               1.0|
|    max|                1.0|                1.0|               1.0|               5.0|
+-------+-------------------+-------------------+------------------+------------------+





In [37]:
heartData.describe(['PhysicalHealth','MentalHealth','DifficultyWalking']).show()



+-------+-----------------+------------------+-------------------+
|summary|   PhysicalHealth|      MentalHealth|  DifficultyWalking|
+-------+-----------------+------------------+-------------------+
|  count|           519171|            519171|             519171|
|   mean|4.361489759636036|3.5516814305883804|0.16746120257102187|
| stddev|8.792808096235884| 7.733453215085699|0.37338748873597627|
|    min|              0.0|               0.0|                0.0|
|    max|             30.0|              30.0|                1.0|
+-------+-----------------+------------------+-------------------+



                                                                                

In [38]:
heartData.describe(['Gender','Age','Education','Income']).show()



+-------+-------------------+------------------+------------------+-----------------+
|summary|             Gender|               Age|         Education|           Income|
+-------+-------------------+------------------+------------------+-----------------+
|  count|             519171|            519171|            519171|           519171|
|   mean|0.46337911786290065|7.9175435453829275| 5.063150676751976|6.083748899688157|
| stddev| 0.4986575878758885| 3.241323105698738|0.9768962159188829|2.073786990025714|
|    min|                0.0|               1.0|               1.0|              1.0|
|    max|                1.0|              13.0|               6.0|              8.0|
+-------+-------------------+------------------+------------------+-----------------+



                                                                                

Notes: Most of the data is boolean or binned like for Age. The data is quite consistent . BMI max is at 98.7 which is  large but is a possible value so we will not remove such values.

#### 2.2 Correlation Analysis

In [40]:
from pyspark.ml.stat import Correlation

In [49]:
colNames = heartData.columns

In [58]:
# This cell takes a lot of time 
corrList = []
for col in colNames:
    pearsonCorr = heartData.corr('HeartDisease',col)
    corrList.append(("BMI - "+col,pearsonCorr))

                                                                                

In [59]:
corrList

[('BMI - HeartDisease', 1.0),
 ('BMI - State', 0.006444321393978534),
 ('BMI - BMI', 0.05058519245234513),
 ('BMI - HighBP', 0.2098579059479823),
 ('BMI - HighChol', 0.1859344957309097),
 ('BMI - CholCheck', 0.04574915829500349),
 ('BMI - FruitConsume', -0.012932002071759112),
 ('BMI - VegetableConsume', -0.023949296306889675),
 ('BMI - Smoker', 0.11680965215675526),
 ('BMI - HeavyDrinker', -0.03037897687609935),
 ('BMI - Diabetes', 0.17086995852514197),
 ('BMI - Stroke', 0.1990394390425505),
 ('BMI - Healthcare', 0.027174079890363554),
 ('BMI - NoDoctorDueToCost', 0.026098505344340245),
 ('BMI - PhysicalActivity', -0.08797288374111345),
 ('BMI - GeneralHealth', -0.2507726735558453),
 ('BMI - PhysicalHealth', 0.18450992906634142),
 ('BMI - MentalHealth', 0.059861227400692135),
 ('BMI - DifficultyWalking', 0.20788019512527584),
 ('BMI - Gender', 0.080903183631275),
 ('BMI - Age', 0.21882047127416504),
 ('BMI - Education', -0.08831463775581311),
 ('BMI - Income', -0.1298266025074565)]

## 3. CLASSIFICATION MODEL