In [32]:
# Import dependencies

import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

In [33]:
# Import CSV File
# The path uses a forward slash: pandas/Python accept it on Windows, macOS and
# Linux alike, whereas a backslash separator is only understood on Windows and
# "\h" inside a normal string literal is an invalid escape sequence that newer
# Python versions warn about.

heart_disease_df = pd.read_csv('Resources/heart_2020_cleaned.csv')

# Drop unneeded columns

heart_disease_df = heart_disease_df.drop(columns=['Stroke','PhysicalHealth','MentalHealth','DiffWalking',
                                                  'Race','Diabetic','GenHealth','Asthma','KidneyDisease',
                                                  'SkinCancer'])

# Rename remaining columns for clarity (CamelCase names become spaced words)

renamed_df = heart_disease_df.rename(columns={'HeartDisease':'Heart Disease', 'AlcoholDrinking':'Alcohol Drinking', 
                                              'AgeCategory':'Age Category', 'PhysicalActivity':'Physical Activity', 
                                              'SleepTime':'Sleep Time'})

In [34]:
# Preview the first five rows of the renamed DataFrame

renamed_df.head(n=5)

Unnamed: 0,Heart Disease,BMI,Smoking,Alcohol Drinking,Sex,Age Category,Physical Activity,Sleep Time
0,No,16.6,Yes,No,Female,55-59,Yes,5.0
1,No,20.34,No,No,Female,80 or older,Yes,7.0
2,No,26.58,Yes,No,Male,65-69,Yes,8.0
3,No,24.21,No,No,Female,75-79,No,6.0
4,No,23.71,No,No,Female,40-44,Yes,8.0


In [35]:
# Report the largest BMI value, then the smallest, one per line

print(renamed_df["BMI"].max(), renamed_df["BMI"].min(), sep="\n")

94.85
12.02


In [36]:
# Create bins in which to place values based upon BMI value.
# The edges are meant for pd.cut's default right-closed intervals, e.g.
# (0, 18.49], (18.49, 24.99], ...; they sit just below each category threshold
# so that a value such as 18.50 falls into the next category up.
# NOTE(review): this assumes BMI is recorded to two decimal places - confirm.
# The final edge is open-ended (infinity) rather than the previously hard-coded
# 95, so a BMI above the currently observed maximum still lands in the top
# category instead of silently becoming NaN.

bmi_bins=[0, 18.49, 24.99, 29.99, 34.99, 39.99, np.inf]

In [37]:
# One label per BMI bin, ordered from the lowest bin to the highest.
# Category names and their BMI ranges follow the CDC's definitions:
# https://www.cdc.gov/obesity/basics/adult-defining.html

bmi_labels = [
    'Underweight (less than 18.5)',
    'Healthy Weight (18.5 to 24.9)',
    'Overweight (25 to 29.9)',
    'Class 1 Obese (30 to 34.99)',
    'Class 2 Obese (35 to 39.99)',
    'Class 3 Severely Obese (40+)',
]

In [38]:
# Assign every row to a BMI bin and store the matching label in a new
# 'BMI Category' column, then preview the result

renamed_df['BMI Category'] = pd.cut(
    x=renamed_df["BMI"],
    bins=bmi_bins,
    labels=bmi_labels,
)
renamed_df.head()

Unnamed: 0,Heart Disease,BMI,Smoking,Alcohol Drinking,Sex,Age Category,Physical Activity,Sleep Time,BMI Category
0,No,16.6,Yes,No,Female,55-59,Yes,5.0,Underweight (less than 18.5)
1,No,20.34,No,No,Female,80 or older,Yes,7.0,Healthy Weight (18.5 to 24.9)
2,No,26.58,Yes,No,Male,65-69,Yes,8.0,Overweight (25 to 29.9)
3,No,24.21,No,No,Female,75-79,No,6.0,Healthy Weight (18.5 to 24.9)
4,No,23.71,No,No,Female,40-44,Yes,8.0,Healthy Weight (18.5 to 24.9)


In [49]:
# Count how many values are in each BMI category.
# 'BMI Category' is a categorical column (it is produced by pd.cut), so
# `observed=False` is passed explicitly: every category is listed even if it
# has no rows, and the result no longer depends on the groupby default for
# categorical keys, which recent pandas versions warn is going to change.
bmi_group = renamed_df.groupby('BMI Category', observed=False)['BMI Category'].count()
bmi_group

BMI Category
Underweight (less than 18.5)       5110
Healthy Weight (18.5 to 24.9)     97331
Overweight (25 to 29.9)          114512
Class 1 Obese (30 to 34.99)       61345
Class 2 Obese (35 to 39.99)       25112
Class 3 Severely Obese (40+)      16385
Name: BMI Category, dtype: int64