In [1]:
import pandas as pd
pd.set_option('display.max_columns', 30)
import numpy as np
import random

### Reading Data

In [4]:
# Load the raw dataset; the relative path is resolved against the notebook's working directory
df = pd.read_csv('heart_2020_cleaned.csv')

In [5]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [6]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [6]:
# List the distinct values of every column, one blank line between columns
for col, values in df.items():
    print(f'{col}: {values.unique()}', end='\n\n')

HeartDisease: ['No' 'Yes']

BMI: [16.6  20.34 26.58 ... 62.42 51.46 46.56]

Smoking: ['Yes' 'No']

AlcoholDrinking: ['No' 'Yes']

Stroke: ['No' 'Yes']

PhysicalHealth: [ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.]

MentalHealth: [30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.]

DiffWalking: ['No' 'Yes']

Sex: ['Female' 'Male']

AgeCategory: ['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']

Race: ['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']

Diabetic: ['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)']

PhysicalActivity: ['Yes' 'No']

GenHealth: ['Very good' 'Fair' 'Good' 'Poor' 'Excellent']

SleepTime: [ 5.  7.  8.  6. 12.  4.  9. 10. 15.  3.  2.  1. 16. 18. 14. 20. 11. 13.
 17. 24. 19. 21. 22. 23.]

Asthma: ['Yes' 'No']


### Data Transformation

In [7]:
# Number of entries in the AgeCategory column
len(df['AgeCategory'])

319795

In [8]:
# Number of entries in the GenHealth column
len(df['GenHealth'])

319795

In [9]:
# Ordinal encoding for the GenHealth labels: a higher number means better self-reported health.
# NOTE(review): the same mapping is declared again in the next cell, next to the function that uses it.
gen_health_conversion = {"Excellent":5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1}
gen_health_conversion

{'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1}

In [10]:
# Ordinal encoding for the GenHealth labels: a higher number means better self-reported health
gen_health_conversion = {"Excellent": 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1}

def convert_age_and_gen_health(row):
    '''
    Converts the AgeCategory string of a row into a random integer age inside that range, and encodes GenHealth.

    AgeCategory is expected to be either '<min>-<max>' (e.g. '55-59') or '80 or older', which is treated as 80-95.
    The age is drawn uniformly (both bounds inclusive) from NumPy's global random state, so seed it with
    np.random.seed for reproducible results.
    Encoding used for GenHealth is 'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1;
    a GenHealth value that is not in the mapping is kept as is.

    Returns a modified copy of the row; the row passed in is left untouched.
    '''
    age_range = row['AgeCategory']
    if age_range == '80 or older':
        # Open-ended category: cap it at 95
        min_age, max_age = 80, 95
    else:
        # Split on the dash instead of slicing fixed positions, so bounds of any width parse correctly
        lower, upper = age_range.split('-')
        min_age, max_age = int(lower), int(upper)

    # DataFrame.apply does not support functions that mutate the object they receive, so edit a copy
    converted = row.copy()
    # randint excludes the upper bound, hence the + 1
    converted['AgeCategory'] = np.random.randint(min_age, max_age + 1)
    converted['GenHealth'] = gen_health_conversion.get(row['GenHealth'], row['GenHealth'])

    return converted

In [11]:
def transform_data(df):
    '''
    Encodes every categorical column of the given DataFrame as numbers and returns the result as a new DataFrame.

    - AgeCategory: replaced by a random integer age drawn uniformly from the category's range
      ('80 or older' is treated as 80-95, both bounds inclusive)
    - GenHealth: 'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1
    - HeartDisease (the label) and all Yes/No features: 'No': 0, 'Yes': 1
    - Sex: 'Male': 0, 'Female': 1
    - Diabetic and Race: one-hot encoded into uint8 indicator columns

    Values that are missing from a mapping are left unchanged. The input DataFrame is not modified.
    Ages come from NumPy's global random state, so call np.random.seed beforehand for reproducible output.
    '''
    yes_no_codes = {'No': 0, 'Yes': 1}
    sex_codes = {'Male': 0, 'Female': 1}
    gen_health_codes = {'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1}
    # HeartDisease is the label, the remaining columns are Yes/No features
    yes_no_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'Asthma',
                      'DiffWalking', 'KidneyDisease', 'SkinCancer', 'PhysicalActivity']

    def encode(column, codes):
        # Mapping-based encoding gives an integer column without relying on Series.replace
        # downcasting object columns; values without a code pass through unchanged
        return column.map(lambda value: codes.get(value, value))

    # Work on a copy so the caller's DataFrame is left untouched
    df = df.copy()

    # Converting Age: parse each distinct category once, then draw all ages in a single vectorized call
    age_bounds = {}
    for label in df['AgeCategory'].unique():
        if label == '80 or older':
            # Open-ended category: cap it at 95
            age_bounds[label] = (80, 95)
        else:
            lower, upper = label.split('-')
            age_bounds[label] = (int(lower), int(upper))
    lows = np.array([age_bounds[label][0] for label in df['AgeCategory']], dtype=np.int64)
    highs = np.array([age_bounds[label][1] for label in df['AgeCategory']], dtype=np.int64)
    # randint excludes the upper bound, hence the + 1
    df['AgeCategory'] = np.random.randint(lows, highs + 1, size=len(lows), dtype=np.int64)

    # Converting GenHealth
    df['GenHealth'] = encode(df['GenHealth'], gen_health_codes)

    # Label and Feature Encoding
    for column in yes_no_columns:
        df[column] = encode(df[column], yes_no_codes)
    df['Sex'] = encode(df['Sex'], sex_codes)

    # One-hot encoding; the explicit dtype keeps 0/1 integer indicators on every pandas version
    # (pandas >= 2.0 would otherwise produce boolean columns)
    df = pd.get_dummies(df, columns=['Diabetic', 'Race'], prefix=['Diabetic', 'Race'], dtype=np.uint8)

    return df

In [12]:
# Ages are sampled at random inside each AgeCategory range; seed NumPy's global RNG first
# so that re-running the notebook reproduces the same processed dataset
np.random.seed(42)
df = transform_data(df)

In [13]:
# AgeCategory now holds a concrete integer age, so give the column a matching name
df = df.rename(columns={"AgeCategory": "Age"})

In [14]:
# Preview the first rows after encoding and renaming
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Age,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.6,1,0,0,3.0,30.0,0,1,59,1,4,5.0,1,0,1,0,0,1,0,0,0,0,0,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,88,1,4,7.0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,0,67,1,2,8.0,1,0,0,0,0,1,0,0,0,0,0,0,1
3,0,24.21,0,0,0,0.0,0.0,0,1,77,0,3,6.0,0,0,1,1,0,0,0,0,0,0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,1,42,1,4,8.0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [15]:
# Check the dtype of every column after the transformation
df.dtypes

HeartDisease                             int64
BMI                                    float64
Smoking                                  int64
AlcoholDrinking                          int64
Stroke                                   int64
PhysicalHealth                         float64
MentalHealth                           float64
DiffWalking                              int64
Sex                                      int64
Age                                      int64
PhysicalActivity                         int64
GenHealth                                int64
SleepTime                              float64
Asthma                                   int64
KidneyDisease                            int64
SkinCancer                               int64
Diabetic_No                              uint8
Diabetic_No, borderline diabetes         uint8
Diabetic_Yes                             uint8
Diabetic_Yes (during pregnancy)          uint8
Race_American Indian/Alaskan Native      uint8
Race_Asian   

### Data Normalization

In [16]:
# Column names, non-null counts and dtypes of the transformed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 26 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   HeartDisease                         319795 non-null  int64  
 1   BMI                                  319795 non-null  float64
 2   Smoking                              319795 non-null  int64  
 3   AlcoholDrinking                      319795 non-null  int64  
 4   Stroke                               319795 non-null  int64  
 5   PhysicalHealth                       319795 non-null  float64
 6   MentalHealth                         319795 non-null  float64
 7   DiffWalking                          319795 non-null  int64  
 8   Sex                                  319795 non-null  int64  
 9   Age                                  319795 non-null  int64  
 10  PhysicalActivity                     319795 non-null  int64  
 11  GenHealth    

In [17]:
# Summary statistics per column before normalization
df.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Age,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,0.412477,0.068097,0.03774,3.37171,3.898366,0.13887,0.524727,54.922822,0.775362,3.595028,7.097075,0.134061,0.036833,0.093244,0.843206,0.021204,0.127588,0.008002,0.016267,0.025229,0.07173,0.085824,0.034172,0.766779
std,0.279766,6.3561,0.492281,0.251912,0.190567,7.95085,7.955235,0.345812,0.499389,18.720205,0.417344,1.042918,1.436007,0.340718,0.188352,0.290775,0.363607,0.144065,0.333631,0.089095,0.126499,0.156819,0.258041,0.280104,0.181671,0.422883
min,0.0,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,1.0,3.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,27.34,0.0,0.0,0.0,0.0,0.0,0.0,1.0,57.0,1.0,4.0,7.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,31.42,1.0,0.0,0.0,2.0,3.0,0.0,1.0,69.0,1.0,4.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,94.85,1.0,1.0,1.0,30.0,30.0,1.0,1.0,95.0,1.0,5.0,24.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Max values of each column used for normalization
# Column-wise maxima of the transformed frame; these are the divisors used for normalization
max_vector = df.max()

In [19]:
# Plain float array of the maxima, in column order. np.asarray accepts both the Series from the
# previous cell and an already converted array, so re-running this cell does not fail
max_vector = np.asarray(max_vector, dtype=float)

In [20]:
# Display the divisors, one entry per column
max_vector

array([ 1.  , 94.85,  1.  ,  1.  ,  1.  , 30.  , 30.  ,  1.  ,  1.  ,
       95.  ,  1.  ,  5.  , 24.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ])

In [21]:
def apply_normalization(row, max_values=None):
    '''
    Normalizes the given row by dividing every feature by its maximum, so that non-negative features end up between 0-1.

    row: the values of one record, in column order.
    max_values: optional sequence with the maximum of each column, matched to the row by position.
        When omitted, the module-level max_vector is used.

    A maximum of 0 is replaced by 1 before dividing, so such a feature is returned unchanged
    instead of turning into NaN (0 / 0) or inf.
    '''
    divisors = max_vector if max_values is None else max_values
    divisors = np.asarray(divisors, dtype=float)
    # Guard against division by zero for columns whose maximum is 0
    safe_divisors = np.where(divisors == 0, 1.0, divisors)
    return row / safe_divisors

In [22]:
# Scale the frame row by row (axis=1 hands each row to apply_normalization) and rebind df to the result
df = df.apply(apply_normalization, axis = 1)

In [23]:
# Summary statistics after normalization; every column maximum should now be 1
df.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Age,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,0.298634,0.412477,0.068097,0.03774,0.11239,0.129946,0.13887,0.524727,0.578135,0.775362,0.719006,0.295711,0.134061,0.036833,0.093244,0.843206,0.021204,0.127588,0.008002,0.016267,0.025229,0.07173,0.085824,0.034172,0.766779
std,0.279766,0.067012,0.492281,0.251912,0.190567,0.265028,0.265175,0.345812,0.499389,0.197055,0.417344,0.208584,0.059834,0.340718,0.188352,0.290775,0.363607,0.144065,0.333631,0.089095,0.126499,0.156819,0.258041,0.280104,0.181671,0.422883
min,0.0,0.126726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189474,0.0,0.2,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.253347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.421053,1.0,0.6,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.288245,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.6,1.0,0.8,0.291667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.33126,1.0,0.0,0.0,0.066667,0.1,0.0,1.0,0.726316,1.0,0.8,0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# Persist the normalized frame next to the notebook.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default; confirm that
# downstream readers expect it (e.g. read with index_col=0), otherwise pass index=False here.
df.to_csv('Processed_Heart_Data.csv')