In [1]:
from pyspark import SparkContext
import sys
import re
import os

In [2]:
spark = SparkSession(sc)

In [3]:
print(os.environ['SPARK_HOME'])

/opt/homebrew/Cellar/apache-spark/3.2.1/libexec


## 1. READING DATA INTO SPARK FRAMES
- We are reading two datasets 
    - BRFSS 2019 survey
    - BRFSS 2017 survey
- Not using the 2018 survey since it has significant differences in the key features that we extracting for the ML classification
- Related Work: https://www.kaggle.com/alexteboul/diabetes-health-indicators-dataset-notebook
    - The related work uses variuos lifestyle indicator habits, prior chronic disease indicator from the BRFSS survey to identify Diabetes risk for 2015 BRFSS survey
    - We will use similar indicators to see if we can predict risk of Heart Attack and Heart Disease.We will be using two yeas of data (2019 and 2017)

### Dataset Links 
- Original Links on CDC Website
    - https://www.cdc.gov/brfss/annual_data/annual_2017.html
    - https://www.cdc.gov/brfss/annual_data/annual_2019.html
- We have uploaded to S3 for easy access the locations are
    - s3a://brfss-big-data-project/BRFSS_2017.csv
    - s3a://brfss-big-data-project/BRFSS_2019.csv
    

In [4]:
# READ LOCAL DATA FILE
# Comment if reading from S3
df2019 = spark.read.csv("../../../BRFSS/CSV_version/BRFSS_2019.csv", header='true',inferSchema='true')
df2017 = spark.read.csv("../../../BRFSS/CSV_version/BRFSS_2017.csv", header='true',inferSchema='true')

                                                                                

In [5]:
# READ FROM S3 BUCKET
# Comment if reading locally
# sc._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
# sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
# sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# df2019 = spark.read.csv("s3a://brfss-big-data-project/BRFSS_2019.csv", header = 'true',inferSchema='true')
# df2017 = spark.read.csv("s3a://brfss-big-data-project/BRFSS_2017.csv", header = 'true',inferSchema='true')

In [6]:
df2019.printSchema()

root
 |-- _STATE: double (nullable = true)
 |-- FMONTH: double (nullable = true)
 |-- IDATE: integer (nullable = true)
 |-- IMONTH: integer (nullable = true)
 |-- IDAY: integer (nullable = true)
 |-- IYEAR: integer (nullable = true)
 |-- DISPCODE: double (nullable = true)
 |-- SEQNO: integer (nullable = true)
 |-- _PSU: double (nullable = true)
 |-- CTELENM1: double (nullable = true)
 |-- PVTRESD1: double (nullable = true)
 |-- COLGHOUS: double (nullable = true)
 |-- STATERE1: double (nullable = true)
 |-- CELPHONE: double (nullable = true)
 |-- LADULT1: double (nullable = true)
 |-- COLGSEX: double (nullable = true)
 |-- NUMADULT: double (nullable = true)
 |-- LANDSEX: double (nullable = true)
 |-- NUMMEN: double (nullable = true)
 |-- NUMWOMEN: double (nullable = true)
 |-- RESPSLCT: double (nullable = true)
 |-- SAFETIME: double (nullable = true)
 |-- CTELNUM1: double (nullable = true)
 |-- CELLFON5: double (nullable = true)
 |-- CADULT1: double (nullable = true)
 |-- CELLSEX: doubl

In [7]:
df2017.select(['_STATE','_VEGLT1A']).show(5)

+------+--------+
|_STATE|_VEGLT1A|
+------+--------+
|   1.0|     1.0|
|   1.0|     1.0|
|   1.0|     2.0|
|   1.0|     9.0|
|   1.0|     2.0|
+------+--------+
only showing top 5 rows



In [8]:
print("Dimensions of the Data Frame:")
print((df2017.count(), len(df2017.columns)))

Dimensions of the Data Frame:
(450016, 358)


Note: We can see that the complete brfss dataset has about 0.4 million records and 358 columns in 2017 survey. The 2019 survey is similar in dimension

We are only interested in the columns related to the prediction of heart attacks and heart disease in individuals. The current column names are based on a code book that the CDC maintains. We will selected relavent columns using the code book to map key indicators.

- Link to code book: https://www.cdc.gov/brfss/annual_data/annual_2019.html
- Link to code book: https://www.cdc.gov/brfss/annual_data/annual_2017.html

## 2. SELECTING THE COLUMNS OF INTEREST

In [9]:
# Selecting coloumns containing indicators for Heart Disease

In [10]:
cols_to_select2017 = ["_MICHD", # target variable if person has had Heart Attack aka Myocardial Infraction
                  "_STATE","_BMI5", # State person belongs to, Body Mass Index 
                 "_RFHYPE5","TOLDHI2","CHOLCHK1", # BP and cholestrol
                 "_FRTLT1A","_VEGLT1A", "SMOKE100","_RFDRHV5", # Food (Fruit and vegetable consumption), alcohol and smoking
                 "DIABETE3","CVDSTRK3", # chronic diseases = Diabetes, Stroke
                 "HLTHPLN1","MEDCOST", # Insurance and medical access
                 "_TOTINDA","GENHLTH","PHYSHLTH","MENTHLTH","DIFFWALK", #Fitness and activity 
                 "SEX","_AGEG5YR","EDUCA","INCOME2"] # demographic Data = Gender, Age bracket, Education level, Income bracket

In [11]:
cols_to_select2019 = ["_MICHD", # target variable if person has had Heart Attack aka Myocardial Infraction
                  "_STATE","_BMI5", # State person belongs to, Body Mass Index 
                 "_RFHYPE5","TOLDHI2","CHOLCHK2", # BP and cholestrol
                 "_FRTLT1A","_VEGLT1A", "SMOKE100","_RFDRHV7", # Food (Fruit and vegetable consumption), alcohol and smoking
                 "DIABETE4","CVDSTRK3", # chronic diseases = Diabetes, Stroke
                 "HLTHPLN1","MEDCOST", # Insurance and medical access
                 "_TOTINDA","GENHLTH","PHYSHLTH","MENTHLTH","DIFFWALK", #Fitness and activity 
                 "SEXVAR","_AGEG5YR","EDUCA","INCOME2"] # demographic Data = Gender, Age bracket, Education level, Income bracket

In [12]:
heartDisease2019 = df2019.select(cols_to_select2019)
heartDisease2019.show(5)

+------+------+------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+------+--------+-----+-------+
|_MICHD|_STATE| _BMI5|_RFHYPE5|TOLDHI2|CHOLCHK2|_FRTLT1A|_VEGLT1A|SMOKE100|_RFDRHV7|DIABETE4|CVDSTRK3|HLTHPLN1|MEDCOST|_TOTINDA|GENHLTH|PHYSHLTH|MENTHLTH|DIFFWALK|SEXVAR|_AGEG5YR|EDUCA|INCOME2|
+------+------+------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+------+--------+-----+-------+
|   2.0|   1.0|2817.0|     2.0|    1.0|     2.0|     1.0|     1.0|     1.0|     1.0|     3.0|     2.0|     1.0|    2.0|     2.0|    3.0|    15.0|    88.0|     1.0|   2.0|    13.0|  3.0|    3.0|
|   2.0|   1.0|1854.0|     1.0|    2.0|     2.0|     1.0|     1.0|     2.0|     1.0|     3.0|     2.0|     1.0|    2.0|     1.0|    4.0|    10.0|    88.0|     2.0|   2.0|    11.0|  5.0|    5.0|
|   2.0|   1.0|3162.0|     2.0

In [13]:
heartDisease2017 = df2017.select(cols_to_select2017)
heartDisease2017.show(5)

+------+------+------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+---+--------+-----+-------+
|_MICHD|_STATE| _BMI5|_RFHYPE5|TOLDHI2|CHOLCHK1|_FRTLT1A|_VEGLT1A|SMOKE100|_RFDRHV5|DIABETE3|CVDSTRK3|HLTHPLN1|MEDCOST|_TOTINDA|GENHLTH|PHYSHLTH|MENTHLTH|DIFFWALK|SEX|_AGEG5YR|EDUCA|INCOME2|
+------+------+------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+---+--------+-----+-------+
|   2.0|   1.0|2696.0|     2.0|    1.0|     2.0|     1.0|     1.0|     2.0|     1.0|     1.0|     2.0|     1.0|    2.0|     1.0|    2.0|    88.0|    88.0|     1.0|2.0|    11.0|  6.0|    6.0|
|   2.0|   1.0|2943.0|     2.0|    2.0|     2.0|     1.0|     1.0|     2.0|     1.0|     3.0|     2.0|     1.0|    2.0|     1.0|    2.0|    88.0|    88.0|     2.0|1.0|    10.0|  6.0|    8.0|
|   2.0|   1.0|2504.0|     1.0|    1.0|     2

## 3. REMOVE ROWS THAT HAVE NULL VALUES

In [14]:
print("Dimensions of the Data Frame:")
print((heartDisease2019.count(), len(heartDisease2019.columns)))

Dimensions of the Data Frame:
(418268, 23)


In [15]:
data2017=heartDisease2017.na.drop()
data2019=heartDisease2019.na.drop()

In [16]:
print("Dimensions of the Data Frame:")
print((data2019.count(), len(data2019.columns)))

Dimensions of the Data Frame:


[Stage 13:>                                                         (0 + 8) / 8]

(351867, 23)


                                                                                

## 4. MAPPING THE FEATURES APPROPRIATELY
- We will use our understanding of the code book to correctly map values 
- Survey of writes 7 or 9 to indicate the person did not know or refused to answer.
- We need to clean this values and map them so that the values are meaningful

In [17]:
data2019.printSchema()

root
 |-- _MICHD: double (nullable = true)
 |-- _STATE: double (nullable = true)
 |-- _BMI5: double (nullable = true)
 |-- _RFHYPE5: double (nullable = true)
 |-- TOLDHI2: double (nullable = true)
 |-- CHOLCHK2: double (nullable = true)
 |-- _FRTLT1A: double (nullable = true)
 |-- _VEGLT1A: double (nullable = true)
 |-- SMOKE100: double (nullable = true)
 |-- _RFDRHV7: double (nullable = true)
 |-- DIABETE4: double (nullable = true)
 |-- CVDSTRK3: double (nullable = true)
 |-- HLTHPLN1: double (nullable = true)
 |-- MEDCOST: double (nullable = true)
 |-- _TOTINDA: double (nullable = true)
 |-- GENHLTH: double (nullable = true)
 |-- PHYSHLTH: double (nullable = true)
 |-- MENTHLTH: double (nullable = true)
 |-- DIFFWALK: double (nullable = true)
 |-- SEXVAR: double (nullable = true)
 |-- _AGEG5YR: double (nullable = true)
 |-- EDUCA: double (nullable = true)
 |-- INCOME2: double (nullable = true)



In [18]:
print(type(data2019))

<class 'pyspark.sql.dataframe.DataFrame'>


In [19]:
# imports for pyspark SQL
from pyspark.sql.functions import when

#### 4.1 _MICHD
- 1: Yes has Heart Issues -->  1 
- 2: No has no heart issues --> 0
- Remove all 7 (dont knows)
- Remove all 9 (refused)

In [20]:
data2019 = data2019.filter((data2019["_MICHD"] == 2)| (data2019["_MICHD"] == 1))
data2019 = data2019.withColumn("_MICHD", when(data2019._MICHD == 2,0).otherwise(data2019._MICHD))

data2017 = data2017.filter((data2017["_MICHD"] == 2)| (data2017["_MICHD"] == 1))
data2017 = data2017.withColumn("_MICHD", when(data2017._MICHD == 2,0).otherwise(data2017._MICHD))

#### 4.3 _BMI5
- these are BMI * 100. So for example a BMI of 4018 is really 40.18
- 777 and 999 indicate did not answer or refused (filter these out)

In [21]:
data2019 = data2019.filter(data2019["_BMI5"] != 777)
data2019 = data2019.filter(data2019["_BMI5"] != 999)
data2019 = data2019.withColumn('_BMI5', data2019["_BMI5"]/100)

data2017 = data2017.filter(data2017["_BMI5"] != 777)
data2017 = data2017.filter(data2017["_BMI5"] != 999)
data2017 = data2017.withColumn('_BMI5', data2017["_BMI5"]/100)

#### 4.4 _RFHYPE5
- Change 1 to 0 so it represents No high blood pressure and 2 to 1 so it represents high blood pressure

In [22]:
data2019 = data2019.filter((data2019["_RFHYPE5"] == 2)| (data2019["_RFHYPE5"] == 1))
data2019 = data2019.withColumn("_RFHYPE5", when(data2019._RFHYPE5 == 1,0).when(data2019._RFHYPE5 == 2,1).otherwise(data2019._RFHYPE5))

data2017 = data2017.filter((data2017["_RFHYPE5"] == 2)| (data2017["_RFHYPE5"] == 1))
data2017 = data2017.withColumn("_RFHYPE5", when(data2017._RFHYPE5 == 1,0).when(data2017._RFHYPE5 == 2,1).otherwise(data2017._RFHYPE5))


In [25]:
data2017.show(5)

+------+------+-----+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+---+--------+-----+-------+
|_MICHD|_STATE|_BMI5|_RFHYPE5|TOLDHI2|CHOLCHK1|_FRTLT1A|_VEGLT1A|SMOKE100|_RFDRHV5|DIABETE3|CVDSTRK3|HLTHPLN1|MEDCOST|_TOTINDA|GENHLTH|PHYSHLTH|MENTHLTH|DIFFWALK|SEX|_AGEG5YR|EDUCA|INCOME2|
+------+------+-----+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+---+--------+-----+-------+
|   0.0|   1.0|26.96|     1.0|    1.0|     2.0|     1.0|     1.0|     2.0|     1.0|     1.0|     2.0|     1.0|    2.0|     1.0|    2.0|    88.0|    88.0|     1.0|2.0|    11.0|  6.0|    6.0|
|   0.0|   1.0|29.43|     1.0|    2.0|     2.0|     1.0|     1.0|     2.0|     1.0|     3.0|     2.0|     1.0|    2.0|     1.0|    2.0|    88.0|    88.0|     2.0|1.0|    10.0|  6.0|    8.0|
|   0.0|   1.0|25.04|     0.0|    1.0|     2.0|   

In [24]:

check.show(2)

NameError: name 'check' is not defined