#**Data Exploration**

In this script, features within the intitial cvd dataset will be visualised to help gain further understanding of the dataset being worked with.

In [1]:
import os

# Find the latest version of spark 4.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.1'
spark_version = 'spark-3.4.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done


In [2]:
# Import dependencies
import requests
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pyspark.sql.functions as F

from pyspark.sql.functions import floor
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql.functions import floor

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
# Retrieves in the CSV data from Google Sheets
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"
response = requests.get(url)

# Write the CSV data to a local file
with open("cvd.csv", "wb") as f:
    f.write(response.content)

# Read the local CSV file using Spark
cvd_df = spark.read.csv("cvd.csv", header=True, sep=",", inferSchema=True)

# Show DataFrame
cvd_df.show()

+--------------+--------------------+--------+-------------+-----------+------------+----------+--------+---------+------+------------+---------+---------+-----+---------------+-------------------+-----------------+----------------------------+-----------------------+
|general_health|             checkup|exercise|heart_disease|skin_cancer|other_cancer|depression|diabetes|arthritis|   sex|age_category|height_cm|weight_kg|  bmi|smoking_history|alcohol_consumption|fruit_consumption|green_vegetables_consumption|friedpotato_consumption|
+--------------+--------------------+--------+-------------+-----------+------------+----------+--------+---------+------+------------+---------+---------+-----+---------------+-------------------+-----------------+----------------------------+-----------------------+
|          Poor|Within the past 2...|      No|           No|         No|          No|        No|      No|      Yes|Female|       70-74|      150|    32.66|14.54|            Yes|                

In [4]:
# Check data types
cvd_df.describe()

DataFrame[summary: string, general_health: string, checkup: string, exercise: string, heart_disease: string, skin_cancer: string, other_cancer: string, depression: string, diabetes: string, arthritis: string, sex: string, age_category: string, height_cm: string, weight_kg: string, bmi: string, smoking_history: string, alcohol_consumption: string, fruit_consumption: string, green_vegetables_consumption: string, friedpotato_consumption: string]

Any numeric columns will be converted to their respective data type as all values from the CSV were read in as a string.

In [5]:
# Convert data types
cvd_df = cvd_df.withColumn("height_cm", col("height_cm").cast(IntegerType()))
cvd_df = cvd_df.withColumn("weight_kg", col("weight_kg").cast(FloatType()))
cvd_df = cvd_df.withColumn("bmi", col("bmi").cast(FloatType()))
cvd_df = cvd_df.withColumn("alcohol_consumption", col("alcohol_consumption").cast(IntegerType()))
cvd_df = cvd_df.withColumn("fruit_consumption", col("fruit_consumption").cast(IntegerType()))
cvd_df = cvd_df.withColumn("green_vegetables_consumption", col("green_vegetables_consumption").cast(IntegerType()))
cvd_df = cvd_df.withColumn("friedpotato_consumption", col("friedpotato_consumption").cast(IntegerType()))

In [6]:
# Create temporary view
cvd_df.createOrReplaceTempView('cvd')

### **Heart Disease vs Age**

In [7]:
# Spark SQL query to calculate Heart Disease Count by Age
query = """
SELECT
  age_category,
  COUNT(*) AS total_count,
  SUM(
    CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
    ) AS heart_disease_count
FROM
  cvd
GROUP BY
  age_category
ORDER BY
  age_category
"""

# Execute the SQL query and store the result in a Spark DataFrame
age_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
age_pandas_df = age_df.toPandas()

# Add a calculated column which computes the percentage of the age group with heart disease
age_pandas_df['heart_disease_percentage'] = ((age_pandas_df['heart_disease_count'] / age_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
age_pandas_df

Unnamed: 0,age_category,total_count,heart_disease_count,heart_disease_percentage
0,18-24,18474,93,0.5
1,25-29,15196,113,0.74
2,30-34,17963,193,1.07
3,35-39,19913,258,1.3
4,40-44,20857,412,1.98
5,45-49,20295,651,3.21
6,50-54,24259,1118,4.61
7,55-59,27134,1913,7.05
8,60-64,31268,2893,9.25
9,65-69,32321,3691,11.42


In [8]:
# Create an interactive bar chart to visualise the total count of each age group in the dataset using Plotly
fig = px.bar(
             age_pandas_df,
             x='age_category',
             y='total_count',
             title='<b>Total Count by Age Category</b>',
             labels={'age_category': 'Age Category', 'total_count': 'Total Count'},
             color='total_count',
             color_continuous_scale='Reds',
             range_color=[min(age_pandas_df['total_count']), max(age_pandas_df['total_count'])],
             template='simple_white'
            )

# Customise bar chart appearance
fig.update_layout(
    title_font=dict(color='black', size=28),
    coloraxis_colorbar=dict(title_font=dict(color='black'))
)

# Show the bar chart
fig.show()

There are more 65-69 year olds in our dataset compared to any other age group, this represents an aging population. However, there is still a respectable distibution but it should just be noted that this dataset is slightly left skewed.

In [9]:
# Create an interactive bar chart to visualise the heart disease count of each age group in the dataset using Plotly
fig = px.bar(
             age_pandas_df,
             x='age_category',
             y='heart_disease_count',
             title='<b>Heart Disease Count by Age Category</b>',
             labels={'age_category': 'Age Category', 'heart_disease_count': 'Heart Disease Count'},
             color='heart_disease_count',
             color_continuous_scale='Reds',
             range_color=[min(age_pandas_df['heart_disease_count']), max(age_pandas_df['heart_disease_count'])],
             template='simple_white'
            )

# Customise bar chart appearance
fig.update_layout(
                  title_font=dict(color='black', size=28),
                  coloraxis_colorbar=dict(title_font=dict(color='black'))
                 )

# Show the bar chart
fig.show()

Although there were more 65-69 year olds in the dataset, those who were 80+ were most likely to have a heart disease. There is also a drop in the number of 75-79 year olds in our dataset who have a heart disease in comparison with neighboring age categories but this could be down to the number 75-79 year olds in our data and therefore, the heart disease prevelance in each age group would help provide further insight.

In [10]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Age using Plotly
fig = px.line(
              age_pandas_df,
              x='age_category',
              y='heart_disease_percentage',
              title='<b>Heart Disease Prevalence by Age</b>',
              labels={'age_category': 'Age Category', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
              markers=True,
              template='simple_white',
              )

# Customise the line chart appearance
fig.update_traces(line=dict(color='#8D021F'))
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the line chart
fig.show()

The older an individual is, the proportion of their age group that has a heart disease increases. This trend increases more exponentially after the age of 40.

### **Heart Disease vs Sex**

In [11]:
# Spark SQL query to calculate heart disease breakdown by sex
query = """
SELECT
  sex,
  COUNT(*) AS total_count,
  SUM(
    CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
    ) AS heart_disease_count
FROM
  cvd
GROUP BY
  sex
ORDER BY
  sex
"""

# Execute the SQL query and store the result in a Spark DataFrame
sex_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
sex_pandas_df = sex_df.toPandas()

# Add a calculated column which computes the percentage of the sex group with heart disease
sex_pandas_df['heart_disease_percentage'] = ((sex_pandas_df['heart_disease_count'] / sex_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
sex_pandas_df

Unnamed: 0,sex,total_count,heart_disease_count,heart_disease_percentage
0,Female,153867,9492,6.17
1,Male,145445,14589,10.03


In [12]:
# Create an interactive bar chart to visualise the total count of each sex in the dataset using Plotly
fig = px.bar(
            sex_pandas_df,
            x='sex',
            y='total_count',
            title='<b>Total Count by Sex</b>',
            labels={'sex': 'Sex', 'total_count': 'Total Count'},
            color='sex',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white'
            )

# Customise chart appearance
fig.update_layout(
    title_font=dict(color='black', size=28),
)

# Show the bar chart
fig.show()

There were marginally more females in our dataset than males.

In [13]:
# Create an interactive bar chart to visualise the heart disease count of each sex in the dataset using Plotly
fig = px.pie(
             sex_pandas_df,
             names='sex',
             values='heart_disease_count',
             title='<b>Heart Disease Breakdown by Sex</b>',
             color_discrete_sequence=['#8D021F', '#CD5C5C'],
             template='simple_white'
            )

# Customise the chart appearance
fig.update_traces(textinfo='percent+label', pull=[0.1, 0], hole=0.3)
fig.update_layout(title_font=dict(size=28))

# Show the pie chart
fig.show()

Of all individuals who have heart disease in our dataset, males are 1.5x more likely to be at risk.

In [14]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Sex using Plotly
fig = px.bar(
            sex_pandas_df,
            x='sex',
            y='heart_disease_percentage',
            title='<b>Heart Disease Prevalence by Sex</b>',
            labels={'sex': 'Sex', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
            color='sex',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white',
            )

# Customise the appearance of the bar chart
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the bar chart
fig.show()


A higher proportion of the males sex group had a heart disease, this is in line with the prior charts due to the distribution between the sex groups being fairly even.

### **Heart Disease vs General Health over Lifetime**

In [15]:
# Spark SQL query to calculate Heart Disease Count by General Health
query = """
SELECT
  general_health,
  COUNT(*) AS total_count,
  SUM(
    CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
    ) AS heart_disease_count
FROM
  cvd
GROUP BY
  general_health
ORDER BY
  general_health
"""

# Execute the SQL query and store the result in a Spark DataFrame
health_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
health_pandas_df = health_df.toPandas()

# Add a calculated column which computes the percentage of the age group with heart disease
health_pandas_df['heart_disease_percentage'] = ((health_pandas_df['heart_disease_count'] / health_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
health_pandas_df

Unnamed: 0,general_health,total_count,heart_disease_count,heart_disease_percentage
0,Excellent,55002,1084,1.97
1,Fair,34228,6532,19.08
2,Good,91775,8338,9.09
3,Poor,10858,3451,31.78
4,Very Good,107449,4676,4.35


In [16]:
# Create an interactive bar chart to visualise the total count of each general health group in the dataset using Plotly
fig = px.bar(
             health_pandas_df,
             x='general_health',
             y='total_count',
             title='<b>Total Count by General Health over Lifetime</b>',
             labels={'general_health': 'General Health', 'total_count': 'Total Count'},
             color='total_count',
             color_continuous_scale='Reds',
             range_color=[min( health_pandas_df['total_count']), max( health_pandas_df['total_count'])],
             template='simple_white',
             category_orders={"general_health": ["Poor", "Fair", "Good", "Very Good", "Excellent"]}
            )

# Customise bar chart appearance
fig.update_layout(
    title_font=dict(color='black', size=28),
    coloraxis_colorbar=dict(title_font=dict(color='black'))
)

# Show the bar chart
fig.show()

Most individuals in our dataset had a very good health over their life time whist very few has a poor overall health.

In [17]:
# Create an interactive bar chart to visualise the heart disease count of each general health group in the dataset using Plotly
fig = px.bar(
             health_pandas_df,
             x='general_health',
             y='heart_disease_count',
             title='<b>Heart Disease Count by General Health</b>',
             labels={'general_health': 'General Health', 'heart_disease_count': 'Heart Disease Count'},
             color='heart_disease_count',
             color_continuous_scale='Reds',
             range_color=[min(age_pandas_df['heart_disease_count']), max(age_pandas_df['heart_disease_count'])],
             category_orders={"general_health": ["Poor", "Fair", "Good", "Very Good", "Excellent"]},
             template='simple_white'
            )

# Customise bar chart appearance
fig.update_layout(
                  title_font=dict(color='black', size=28),
                  coloraxis_colorbar=dict(title_font=dict(color='black'))
                 )

# Show the bar chart
fig.show()

Most individuals in our dataset who had heart disease had a good or fair health over the course of their lifetime. The number of cases of individuals with a poor health not having as many heart disease could be due to their sample size within our dataset which is evident in the prior general health breakdown within our dataset.

In [18]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Sex using Plotly
fig = px.bar(
            health_pandas_df,
            x='general_health',
            y='heart_disease_percentage',
            title='<b>Heart Disease Prevalence by General Health over Lifetime</b>',
            labels={'general_health': 'General Health', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
            color='general_health',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            category_orders={"general_health": ["Poor", "Fair", "Good", "Very Good", "Excellent"]},
            template='simple_white',
            )

# Customise the appearance of the bar chart
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the bar chart
fig.show()

The prevalence has concluded that the more depleated an individuals health has been over their life time, the more likely they are to have a heart disease.

### **Heart Disease vs BMI**

In [19]:
# Convert BMI column in the Spark cvd DataFrame to a Pandas DataFrame
bmi_df_pandas = cvd_df.select("bmi").toPandas()

# Create an interactive box plot that shows the distribution of BMI using Plotly
fig = px.box(bmi_df_pandas, y="bmi", template="simple_white", color_discrete_sequence=['#FF0000'])

# Customise the box plot appearance
fig.update_layout(
    title={'text': "<b>BMI Distribution Box Plot</b>", 'font': {'size': 28}}
)

# Show the plot
fig.show()

Using the median as a measure of central tendancy, the average BMI within our dataset was 27.44. This is the best way to describe the average BMI in this case because there are quite a few outliers which are shown above [Q3+(1.5xIQR)], this would disturb the mean.

In [20]:
# Spark SQL query to calculate Heart Disease Count by BMI intervals of two
query = """
SELECT
  CONCAT(FLOOR(bmi / 2) * 2, '-', (FLOOR(bmi / 2) * 2) + 2) AS bmi_interval,
  COUNT(*) AS total_count,
  SUM(
     CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
     ) AS heart_disease_count
FROM
  cvd
GROUP BY
  bmi_interval
ORDER BY
  bmi_interval
"""

# Execute the SQL query and store the result in a Spark DataFrame
bmi_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
bmi_pandas_df = bmi_df.toPandas()

# Add a calculated column which computes the percentage of the bmi interval with heart disease
bmi_pandas_df['heart_disease_percentage'] = ((bmi_pandas_df['heart_disease_count'] / bmi_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
bmi_pandas_df.head()

Unnamed: 0,bmi_interval,total_count,heart_disease_count,heart_disease_percentage
0,12-14,94,8,8.51
1,14-16,443,40,9.03
2,16-18,2443,191,7.82
3,18-20,9807,547,5.58
4,20-22,23179,1217,5.25


In [21]:
# Create an interactive bar chart to visualise the heart disease count of each defined bmi interval using Plotly
fig = px.bar(
             bmi_pandas_df,
             x='bmi_interval',
             y='heart_disease_count',
             title='<b>Heart Disease Count by BMI Interval</b>',
             labels={'bmi_interval': 'BMI Interval', 'heart_disease_count': 'Heart Disease Count'},
             color='heart_disease_count',
             color_continuous_scale='reds',
             template='simple_white'
            )

# Customise the bar chart appearance
fig.update_layout(
                  xaxis_title='BMI Interval',
                  yaxis_title='Heart Disease Count',
                  xaxis={'categoryorder': 'array', 'categoryarray': sorted(bmi_pandas_df['bmi_interval'])},
                  title_font=dict(color='black', size=28),
                  )

# Show the bar chart
fig.show()

The distribution of bmi for those with a heart disease follows a normal distribution.

In [22]:
# Create an interactive line chart to visualise Heart Disease Prevalence by BMI Interval using Plotly
fig = px.line(
              bmi_pandas_df,
              x='bmi_interval',
              y='heart_disease_percentage',
              title='<b>Heart Disease Prevalence by BMI Interval</b>',
              labels={'bmi_interval': 'BMI Interval', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
              markers=True,
              template='simple_white',
              )

# Customise the line chart appearance
fig.update_traces(line=dict(color='#8D021F'))
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the line chart
fig.show()

Out of all BMI intervals, a BMI of 14-16 seems to imply that a individual of this BMI is at risk of heart disease. Additionally, after a BMI of 26-28 the people having a heart disease within their respective bmi interval seems to steadily increase.

There are big trend line patterns beyond a BMI of 56-58 which should be be observed but also, these BMI's are very rare as displayed from the number of indiiduals in our dataset whom have these BMI's.

### **Heart Disease vs Exercise**

In [23]:
# Spark SQL query to calculate heart disease breakdown by gender
query = """
SELECT
  exercise,
  COUNT(*) AS total_count,
  SUM(
    CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
    ) AS heart_disease_count
FROM
  cvd
GROUP BY
  exercise
ORDER BY
  exercise
"""

# Execute the SQL query and store the result in a Spark DataFrame
exercise_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
exercise_pandas_df = exercise_df.toPandas()

# Add a calculated column which computes the percentage of the sex group with heart disease
exercise_pandas_df['heart_disease_percentage'] = ((exercise_pandas_df['heart_disease_count'] / exercise_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
exercise_pandas_df

Unnamed: 0,exercise,total_count,heart_disease_count,heart_disease_percentage
0,No,66900,8689,12.99
1,Yes,232412,15392,6.62


In [24]:
# Create an interactive bar chart to visualise how many individuals in our dataset exercise regularly using Plotly
fig = px.bar(
            exercise_pandas_df,
            x='exercise',
            y='total_count',
            title='<b>Total Count by Exercise</b>',
            labels={'exercise': 'Exercise', 'total_count': 'Total Count'},
            color='exercise',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white'
            )

# Customise chart appearance
fig.update_layout(
    title_font=dict(color='black', size=28),
)

# Show the bar chart
fig.show()

Most individuals in the dataset have exercised regularly in their lifetime.

In [25]:
# Create an interactive bar chart to visualise the heart disease count by exercise in the dataset using Plotly
fig = px.pie(
             exercise_pandas_df,
             names='exercise',
             values='heart_disease_count',
             title='<b>Heart Disease Breakdown by Exercise</b>',
             color_discrete_sequence=['#8D021F', '#CD5C5C'],
             template='simple_white'
            )

# Customise the chart appearance
fig.update_traces(textinfo='percent+label', pull=[0.1, 0], hole=0.3)
fig.update_layout(title_font=dict(size=28))

# Show the pie chart
fig.show()

Nearly double the amount of heart disease cases in our dataset exercised regularly in their lifetime.

In [26]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Exercise using Plotly
fig = px.bar(
            exercise_pandas_df,
            x='exercise',
            y='heart_disease_percentage',
            title='<b>Heart Disease Prevalence by Exercise</b>',
            labels={'exercise': 'Exercise', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
            color='exercise',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white',
            )

# Customise the appearance of the bar chart
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the bar chart
fig.show()

The results conclude that if you did not exercise, you were more likely to have a heart disease.

### **Heart Disease vs Smoking History**

In [27]:
# Spark SQL query to calculate heart disease breakdown by smoking history
query = """
SELECT
  smoking_history,
  COUNT(*) AS total_count,
  SUM(
    CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
    ) AS heart_disease_count
FROM
  cvd
GROUP BY
  smoking_history
ORDER BY
  smoking_history
"""

# Execute the SQL query and store the result in a Spark DataFrame
smoking_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
smoking_pandas_df = smoking_df.toPandas()

# Add a calculated column which computes the percentage of the smoking history group with heart disease
smoking_pandas_df['heart_disease_percentage'] = ((smoking_pandas_df['heart_disease_count'] / smoking_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
smoking_pandas_df

Unnamed: 0,smoking_history,total_count,heart_disease_count,heart_disease_percentage
0,No,178101,10006,5.62
1,Yes,121211,14075,11.61


In [28]:
# Create an interactive bar chart to visualise the smoking history population in our dataset using Plotly
fig = px.bar(
            smoking_pandas_df,
            x='smoking_history',
            y='total_count',
            title='<b>Total Count by Smoking History</b>',
            labels={'smoking_history': 'Smoking History', 'total_count': 'Total Count'},
            color='smoking_history',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white'
            )

# Customise chart appearance
fig.update_layout(
    title_font=dict(color='black', size=28),
)

# Show the bar chart
fig.show()

There are more people with no smoking history in our dataset than people who have had a smoking history.

In [29]:
# Create an interactive bar chart to visualise the heart disease count by smoking history in the dataset using Plotly
fig = px.pie(
             smoking_pandas_df,
             names='smoking_history',
             values='heart_disease_count',
             title='<b>Heart Disease Breakdown by Smoking History</b>',
             color_discrete_sequence=['#8D021F', '#CD5C5C'],
             template='simple_white'
            )

# Customise the chart appearance
fig.update_traces(textinfo='percent+label', pull=[0.1, 0], hole=0.3)
fig.update_layout(title_font=dict(size=28))

# Show the pie chart
fig.show()

More individuals who had a smoking history had a heart disease within our dataset.


In [30]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Smoking History using Plotly
fig = px.bar(
            smoking_pandas_df,
            x='smoking_history',
            y='heart_disease_percentage',
            title='<b>Heart Disease Prevalence by Smoking History</b>',
            labels={'smoking_history': 'Smoking History', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
            color='smoking_history',
            color_discrete_sequence=['#CD5C5C', '#8D021F'],
            template='simple_white',
            )

# Customise the appearance of the bar chart
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the bar chart
fig.show()

The prevelance of having a heart disease is much more likely in the instance where a individual has had a smoking history.

### **Heart Disease vs Alcohol Consumption**

In [31]:
# Convert Alchohol Consumption column in the Spark cvd DataFrame to a Pandas DataFrame
alcohol_df_pandas = cvd_df.select("alcohol_consumption").toPandas()

# Create an interactive box plot that shows the distribution of alchohol consumption using Plotly
fig = px.box(alcohol_df_pandas, y="alcohol_consumption", template="simple_white", color_discrete_sequence=['#FF0000'])

# Customise the box plot appearance
fig.update_layout(
    title={'text': "<b>Alcohol Consumption Distribution Box Plot</b>", 'font': {'size': 28}}
)

# Show the plot
fig.show()

On average, there were many more individuals in the dataset who consumed less alchohol.

In [32]:
# Spark SQL query to calculate Heart Disease Count by Alcohol Consumption intervals of two
query = """
SELECT
  CONCAT(FLOOR(alcohol_consumption / 2) * 2, ' to ', (FLOOR(alcohol_consumption / 2) * 2) + 2) AS alcohol_consumption_interval,
  COUNT(*) AS total_count,
  SUM(
     CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
  ) AS heart_disease_count
FROM
  cvd
GROUP BY
  alcohol_consumption_interval
ORDER BY
  alcohol_consumption_interval
"""

# Execute the SQL query and store the result in a Spark DataFrame
alcohol_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
alcohol_pandas_df = alcohol_df.toPandas()

# Add a calculated column which computes the percentage of the alcohol consumption interval with heart disease
alcohol_pandas_df['heart_disease_percentage'] = ((alcohol_pandas_df['heart_disease_count'] / alcohol_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
alcohol_pandas_df.head()

Unnamed: 0,alcohol_consumption_interval,total_count,heart_disease_count,heart_disease_percentage
0,0 to 2,159404,16015,10.05
1,10 to 12,7751,373,4.81
2,12 to 14,8688,438,5.04
3,14 to 16,6522,303,4.65
4,16 to 18,3601,181,5.03


In [33]:
# Create an interactive bar chart to visualise the heart disease count of each defined alcohol consumption interval using Plotly
fig = px.bar(
             alcohol_pandas_df,
             x='alcohol_consumption_interval',
             y='heart_disease_count',
             title='<b>Heart Disease Count by Alcohol Consumption Interval</b>',
             labels={'alcohol_consumption_interval': 'Alcohol Comsumption Interval', 'heart_disease_count': 'Heart Disease Count'},
             color='heart_disease_count',
             color_continuous_scale='reds',
             template='simple_white'
            )

# Customise the bar chart appearance
fig.update_layout(
                  xaxis_title='Alcohol Consumption Interval',
                  yaxis_title='Heart Disease Count',
                  xaxis={'categoryorder': 'array', 'categoryarray': sorted(alcohol_pandas_df['alcohol_consumption_interval'])},
                  title_font=dict(color='black', size=28),
                  )

# Show the bar chart
fig.show()

Most individuals who had a heart disease in our dataset, drank no alchohol.

In [34]:
# Create an interactive line chart to visualise Heart Disease Prevalence by BMI Interval using Plotly
fig = px.line(
              alcohol_pandas_df,
              x='alcohol_consumption_interval',
              y='heart_disease_percentage',
              title='<b>Heart Disease Prevalence by Alcohol Consumption Interval</b>',
              labels={'alcohol_consumption_interval': 'Alcohol Consumption Interval', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
              markers=True,
              template='simple_white',
              )

# Customise the line chart appearance
fig.update_traces(line=dict(color='#8D021F'))
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the line chart
fig.show()

### **Heart Disease vs Fruit Consumption**

In [35]:
# Convert Fruit Consumption column in the Spark cvd DataFrame to a Pandas DataFrame
fruit_df_pandas = cvd_df.select("fruit_consumption").toPandas()

# Create an interactive box plot that shows the distribution of fruit consumption using Plotly
fig = px.box(fruit_df_pandas, y="fruit_consumption", template="simple_white", color_discrete_sequence=['#FF0000'])

# Customise the box plot appearance
fig.update_layout(
    title={'text': "<b>Fruit Consumption Distribution Box Plot</b>", 'font': {'size': 28}}
)

# Show the plot
fig.show()

The average fruit consumption in the dataset is 30.

In [36]:
# Spark SQL query to calculate Heart Disease Count by Fruit Consumption intervals of two
query = """
SELECT
  CONCAT(FLOOR(fruit_consumption / 2) * 2, ' to ', (FLOOR(fruit_consumption / 2) * 2) + 2) AS fruit_consumption_interval,
  COUNT(*) AS total_count,
  SUM(
     CASE WHEN heart_disease = 'Yes' THEN 1 ELSE 0 END
     ) AS heart_disease_count
FROM
  cvd
GROUP BY
  fruit_consumption_interval
ORDER BY
  fruit_consumption_interval
"""

# Execute the SQL query and store the result in a Spark DataFrame
fruit_df = spark.sql(query)

# Convert the Spark DataFrame to a Pandas DataFrame for plotting
fruit_pandas_df = fruit_df.toPandas()

# Add a calculated column which computes the percentage of the bmi interval with heart disease
fruit_pandas_df['heart_disease_percentage'] = ((fruit_pandas_df['heart_disease_count'] / fruit_pandas_df['total_count']) * 100).round(2)

# Display the dataframe
fruit_pandas_df.head()

Unnamed: 0,fruit_consumption_interval,total_count,heart_disease_count,heart_disease_percentage
0,0 to 2,11309,1157,10.23
1,10 to 12,4348,418,9.61
2,100 to 102,14,1,7.14
3,104 to 106,1,0,0.0
4,108 to 110,1,0,0.0


In [37]:
# Create an interactive bar chart to visualise the heart disease count of each defined fruit consumption interval using Plotly
fig = px.bar(
             fruit_pandas_df,
             x='fruit_consumption_interval',
             y='heart_disease_count',
             title='<b>Heart Disease Count by Fruit Consumption Interval</b>',
             labels={'fruit_consumption_interval': 'Fruit Consumption Interval', 'heart_disease_count': 'Heart Disease Count'},
             color='heart_disease_count',
             color_continuous_scale='reds',
             template='simple_white'
            )

# Customise the bar chart appearance
fig.update_layout(
                  xaxis_title='Fruit Consumption Interval',
                  yaxis_title='Heart Disease Count',
                  xaxis={'categoryorder': 'array', 'categoryarray': sorted(fruit_pandas_df['fruit_consumption_interval'])},
                  title_font=dict(color='black', size=28),
                  )

# Show the bar chart
fig.show()

Indiviuals who ate 30 to 32 fruits monthly had the highest frequency of heart disease cases in our dataset.

In [38]:
# Create an interactive line chart to visualise Heart Disease Prevalence by Fruit Consumption Interval using Plotly
fig = px.line(
              fruit_pandas_df,
              x='fruit_consumption_interval',
              y='heart_disease_percentage',
              title='<b>Heart Disease Prevalence by Fruit Consumption Interval</b>',
              labels={'fruit_consumption_interval': 'Fruit Consumption Interval', 'heart_disease_percentage': 'Heart Disease Prevalence (%)'},
              markers=True,
              template='simple_white',
              )

# Customise the line chart appearance
fig.update_traces(line=dict(color='#8D021F'))
fig.update_layout(title_font=dict(size=28), xaxis_title_font=dict(size=16), yaxis_title_font=dict(size=16), xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12))

# Show the line chart
fig.show()

Of all fruit consumption interval, a fruit consumption between 52-54 were at most risk of having a heart disease.