In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [4]:
def remove_outliers(data):
  """Drop every row that holds an IQR outlier in at least one numeric column.

  A value counts as an outlier when it lies more than 1.5 * IQR below the
  first quartile or above the third quartile of its own column. Non-numeric
  columns are ignored while searching for outliers (quantiles are undefined
  for them) but are kept in the returned frame.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to filter. It is not modified.

  Returns
  -------
  pandas.DataFrame
      The rows of ``data`` that contain no outlier in any numeric column.

  Notes
  -----
  Prints, per numeric column, how many values fall outside the fences.
  """
  numeric = data.select_dtypes(include='number')
  q1 = numeric.quantile(0.25)
  q3 = numeric.quantile(0.75)
  iqr = q3 - q1
  lower_fence = q1 - 1.5 * iqr
  upper_fence = q3 + 1.5 * iqr
  # Build the element-wise mask once and reuse it for the count and the filter.
  is_outlier = (numeric < lower_fence) | (numeric > upper_fence)
  print(is_outlier.sum())
  return data[~is_outlier.any(axis=1)]
def label_encoder(data, categorial_col):
  """Return a copy of ``data`` with the given columns label-encoded.

  Each listed column is fitted independently with scikit-learn's
  ``LabelEncoder``, so its distinct values are replaced by the integer codes
  ``0 .. n_classes - 1`` in sorted order of the original values.

  Parameters
  ----------
  data : pandas.DataFrame
      Source frame. It is not modified.
  categorial_col : iterable of str
      Names of the columns to encode.

  Returns
  -------
  pandas.DataFrame
      Copy of ``data`` whose encoded columns are stored as ``uint8`` when the
      column has at most 256 classes, otherwise as the smallest unsigned
      integer dtype that can hold every code.
  """
  encode_data = data.copy()
  encoder = LabelEncoder()
  for column in categorial_col:
    # fit_transform refits the encoder, so one instance can serve all columns.
    codes = encoder.fit_transform(data[[column]].values.ravel())
    n_classes = len(encoder.classes_)
    # A blind cast to uint8 would wrap around for codes above 255.
    code_dtype = 'uint8' if n_classes <= 256 else np.min_scalar_type(n_classes - 1)
    encode_data[column] = codes.astype(code_dtype)
  return encode_data
def label_encoder_bp(data):
    """Return a copy of ``data`` with ``bp_category`` encoded as ordinal codes.

    The blood-pressure labels are mapped by increasing severity:
    "Normal" -> 0, "Elevated" -> 1, "Hypertension Stage 1" -> 2,
    "Hypertension Stage 2" -> 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``bp_category`` column holding the labels above.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``bp_category`` replaced by its code. Any value
        that is not a key of the mapping (including codes produced by an
        earlier call) becomes NaN.
    """
    bp_category_mapping = {
        "Normal": 0,
        "Elevated": 1,
        "Hypertension Stage 1": 2,
        "Hypertension Stage 2": 3
    }
    encode_data = data.copy()
    encode_data['bp_category'] = encode_data['bp_category'].map(bp_category_mapping)
    return encode_data

In [7]:
# Load the preprocessed cardiovascular dataset. The path is relative to the
# directory the notebook kernel was started in.
data = pd.read_csv('Downloads/cardio_data_processed.csv')
# Preview the first five rows.
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category,bp_category_encoded
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,Hypertension Stage 1,Hypertension Stage 1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,Hypertension Stage 2,Hypertension Stage 2
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,Hypertension Stage 1,Hypertension Stage 1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,Hypertension Stage 2,Hypertension Stage 2
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal,Normal


In [8]:
# Summary statistics of the numeric columns of the raw data.
data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi
count,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0,68205.0
mean,49972.410498,19462.667737,1.348625,164.372861,74.100688,126.434924,81.263925,1.363243,1.225174,0.087662,0.053134,0.803548,0.493688,52.823635,27.510513
std,28852.13829,2468.381854,0.476539,8.176756,14.288862,15.961685,9.143985,0.67808,0.571288,0.282805,0.224302,0.397317,0.499964,6.769909,6.026497
min,0.0,10798.0,1.0,55.0,11.0,90.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0,29.0,3.471784
25%,24991.0,17656.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0,23.875115
50%,50008.0,19700.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,53.0,26.346494
75%,74878.0,21323.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,58.0,30.116213
max,99999.0,23713.0,2.0,250.0,200.0,180.0,120.0,3.0,3.0,1.0,1.0,1.0,1.0,64.0,298.666667


# Preliminary Feature Selection
Firstly, we noticed that our dataset contains many variables that could relate to our question. Using all of these variables would not be feasible. Hence, we decided to select a subset of the variables and work with this subset.

The variables were chosen carefully. Since our question concerns the lifestyle of middle-aged people and how it affects their cardiovascular health, the variables we have chosen are:

Variables Relating to Lifestyle: smoke, alco, active, cardio

Variables Relating to Cardiovascular Health: cholesterol, gluc, bmi, bp_category

In [9]:
# Columns that are not used in the lifestyle / cardiovascular-health analysis.
columns = ['id', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'age', 'bp_category_encoded']
# Rebind instead of dropping in place and tolerate already-removed columns,
# so re-running this cell does not raise a KeyError.
data = data.drop(columns=columns, errors='ignore')

# Keep only participants older than this many years.
MIN_AGE_YEARS = 45
# .copy() makes data_filtered an independent frame, so later column
# assignments cannot trigger chained-assignment warnings against `data`.
data_filtered = data[data['age_years'].astype(int) > MIN_AGE_YEARS].copy()

In [10]:
# Preview the first ten rows that passed the age filter.
data_filtered.head(n=10)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category
0,1,1,0,0,1,0,50,21.96712,Hypertension Stage 1
1,3,1,0,0,1,1,55,34.927679,Hypertension Stage 2
2,3,1,0,0,0,1,51,23.507805,Hypertension Stage 1
3,1,1,0,0,1,1,48,28.710479,Hypertension Stage 2
4,1,1,0,0,0,0,47,23.011177,Normal
5,2,2,0,0,0,0,60,29.384676,Hypertension Stage 1
6,3,1,0,0,1,0,60,37.729725,Hypertension Stage 1
7,3,3,0,0,1,1,61,29.983588,Hypertension Stage 1
8,1,1,0,0,1,0,48,28.440955,Normal
9,1,1,0,0,0,0,54,25.28257,Normal


In [11]:
# Lifestyle-related subset of the age-filtered data.
life_vars_df = data_filtered.loc[:, ['smoke', 'alco', 'active', 'cardio']]

# Cardiovascular-health-related subset of the age-filtered data.
health_vars_df = data_filtered.loc[:, ['cholesterol', 'gluc', 'bmi', 'bp_category']]

In [12]:
# Column dtypes and non-null counts of the lifestyle subset.
life_vars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56121 entries, 0 to 68204
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   smoke   56121 non-null  int64
 1   alco    56121 non-null  int64
 2   active  56121 non-null  int64
 3   cardio  56121 non-null  int64
dtypes: int64(4)
memory usage: 2.1 MB


In [13]:
# Column dtypes and non-null counts of the health subset.
health_vars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56121 entries, 0 to 68204
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cholesterol  56121 non-null  int64  
 1   gluc         56121 non-null  int64  
 2   bmi          56121 non-null  float64
 3   bp_category  56121 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 2.1+ MB


In [16]:
# Columns whose values are re-coded as consecutive integers starting at 0
# (e.g. cholesterol 1/2/3 becomes 0/1/2).
cat_encode = ['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
# label_encoder returns an encoded copy; life_vars_df and health_vars_df were
# sliced earlier and therefore keep the original, unencoded values.
data_filtered = label_encoder(data_filtered, cat_encode)

In [17]:
# Replace the bp_category labels with ordinal codes 0-3.
# NOTE: this rebinds data_filtered, so running the cell a second time feeds
# the numeric codes back into the string-keyed mapping and turns the whole
# column into NaN. Re-run from the age-filter cell instead.
data_filtered = label_encoder_bp(data_filtered)

In [19]:
# Display the fully encoded frame (pandas truncates the middle rows).
data_filtered

Unnamed: 0,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category
0,0,0,0,0,1,0,50,21.967120,2
1,2,0,0,0,1,1,55,34.927679,3
2,2,0,0,0,0,1,51,23.507805,2
3,0,0,0,0,1,1,48,28.710479,3
4,0,0,0,0,0,0,47,23.011177,0
...,...,...,...,...,...,...,...,...,...
68200,0,0,1,0,1,0,52,26.927438,2
68201,1,1,0,0,1,1,61,50.472681,3
68202,2,0,0,1,0,1,52,31.353579,3
68203,0,1,0,0,0,1,61,27.099251,2


- 0 is Normal blood pressure
- 1 is Elevated blood pressure
- 2 is Hypertension Stage 1
- 3 is Hypertension Stage 2

We realised that there are a few outliers in the BMI distribution. We decided to restrict the BMI to the range 10 ≤ BMI < 40. BMIs of 40 and above are considered to be super-morbidly obese (SMO) and BMIs below 10 are considered to be extremely underweight; we figured these groups would mostly consist of people who were so unhealthy that they were not only outliers in society but also too extreme as data points, and would undoubtedly skew our results.

In [20]:
# Row labels whose BMI lies outside the accepted [10, 40) range.
bmi_values = data_filtered['bmi']
index = data_filtered.index[(bmi_values >= 40) | (bmi_values < 10)]
# Remove those rows; data_filtered itself is left untouched.
df_adj = data_filtered.drop(index=index)

In [21]:
# Plain-text preview of the first rows after the BMI filter, followed by the
# (rows, columns) shape as the cell's displayed result.
print(df_adj.head())
df_adj.shape

   cholesterol  gluc  smoke  alco  active  cardio  age_years        bmi  \
0            0     0      0     0       1       0         50  21.967120   
1            2     0      0     0       1       1         55  34.927679   
2            2     0      0     0       0       1         51  23.507805   
3            0     0      0     0       1       1         48  28.710479   
4            0     0      0     0       0       0         47  23.011177   

   bp_category  
0            2  
1            3  
2            2  
3            3  
4            0  


(54584, 9)

In [22]:
# Summary statistics after the BMI filter; the bmi min/max confirm the range.
df_adj.describe()

Unnamed: 0,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category
count,54584.0,54584.0,54584.0,54584.0,54584.0,54584.0,54584.0,54584.0,54584.0
mean,0.388502,0.238275,0.083871,0.050949,0.802763,0.52814,55.130166,27.179119,1.966748
std,0.699526,0.588829,0.277196,0.219896,0.397917,0.499212,4.949767,4.451838,0.869987
min,0.0,0.0,0.0,0.0,0.0,0.0,46.0,10.726644,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,51.0,23.875115,2.0
50%,0.0,0.0,0.0,0.0,1.0,1.0,55.0,26.395803,2.0
75%,1.0,0.0,0.0,0.0,1.0,1.0,59.0,30.043262,2.0
max,2.0,2.0,1.0,1.0,1.0,1.0,64.0,39.965649,3.0


In [23]:
# Rich preview of the first five rows of the cleaned frame.
df_adj.head()

Unnamed: 0,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category
0,0,0,0,0,1,0,50,21.96712,2
1,2,0,0,0,1,1,55,34.927679,3
2,2,0,0,0,0,1,51,23.507805,2
3,0,0,0,0,1,1,48,28.710479,3
4,0,0,0,0,0,0,47,23.011177,0


In [24]:
# Check each column for missing values (True would mean at least one NaN).
df_adj.isnull().any()

cholesterol    False
gluc           False
smoke          False
alco           False
active         False
cardio         False
age_years      False
bmi            False
bp_category    False
dtype: bool