In [45]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing

In [46]:
# load data into a Pandas DataFrame
df = pd.read_csv('cardio_train.csv', sep=';') 
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [47]:
# code in this cell taken from eda.ipynb
# remove outliers in ap_hi and ap_lo
cleaned_df = df[df['ap_hi'] < 250]  
cleaned_df = cleaned_df[cleaned_df['ap_lo'] < 250] 

print(cleaned_df.shape)

(69007, 13)


In [48]:
cleaned_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [49]:
# code in this cell taken from eda.ipynb
cont_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
ord_cols = ['cholesterol', 'gluc']

for col in ord_cols:
    dummies = pd.get_dummies(cleaned_df[col])
    dummies.columns = ['{0}_{1}'.format(col, ind) for ind in dummies.columns]
    cleaned_df = pd.concat([cleaned_df, dummies], axis=1)
    
cleaned_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,1,0,0,1,0,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,0,0,1,1,0,0
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,0,0,1,1,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,1,0,0,1,0,0
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,1,0,0,1,0,0


In [50]:
cleaned_df['gender'] -= 1  # convert gender values to be either 1 or 0
cleaned_df['age'] //= 356  # convert age values to be in years
cleaned_df = cleaned_df.drop(labels=['cholesterol', 'gluc', 'id'], axis=1)
cleaned_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,51,1,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,56,0,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0
2,52,0,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0
3,49,1,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0
4,49,0,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0


In [51]:
# calculate bmi and add it to the data
cleaned_df['bmi'] = cleaned_df['weight']/((cleaned_df['height']/100)**2)
cleaned_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,bmi
0,51,1,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0,21.96712
1,56,0,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0,34.927679
2,52,0,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0,23.507805
3,49,1,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0,28.710479
4,49,0,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0,23.011177


In [52]:
# create categories for blood pressure based on value of ap_hi
labels = [0, 1, 2]  # 0:normal, 1:elevated bp, 2:hypertension
cleaned_df['bp'] = pd.cut(cleaned_df['ap_hi'], [0, 120, 130, 250], right=False, labels=labels)
cleaned_df.head(10)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,bmi,bp
0,51,1,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0,21.96712,0
1,56,0,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0,34.927679,2
2,52,0,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0,23.507805,2
3,49,1,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0,28.710479,2
4,49,0,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0,23.011177,0
5,61,0,151,67.0,120,80,0,0,0,0,0,1,0,0,1,0,29.384676,1
6,62,0,157,93.0,130,80,0,0,1,0,0,0,1,1,0,0,37.729725,2
7,63,1,178,95.0,130,90,0,0,1,1,0,0,1,0,0,1,29.983588,2
8,49,0,158,71.0,110,70,0,0,1,0,1,0,0,1,0,0,28.440955,0
9,55,0,164,68.0,110,60,0,0,0,0,1,0,0,1,0,0,25.28257,0


In [53]:
# save un-normalized data
cleaned_df.to_csv('preprocessed_data.csv', index=False)

In [54]:
# normalize continuous variables
min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(cleaned_df[cont_cols])
df_normalized = pd.DataFrame(df_scaled)
df_normalized.columns = cont_cols
cols = ['gender', 'smoke', 'alco', 'active', 'cardio', 'cholesterol_1', 
        'cholesterol_2', 'cholesterol_3', 'gluc_1', 'gluc_2', 'bmi', 'bp']
df_normalized = pd.concat([df_normalized, cleaned_df[cols]], axis=1)
df_normalized.head(10)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,bmi,bp
0,0.583333,0.579487,0.269841,0.666667,0.576923,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,21.96712,0
1,0.722222,0.517949,0.391534,0.74359,0.615385,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,34.927679,2
2,0.611111,0.564103,0.280423,0.717949,0.538462,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,23.507805,2
3,0.527778,0.584615,0.375661,0.769231,0.653846,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,28.710479,2
4,0.527778,0.517949,0.238095,0.641026,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,23.011177,0
5,0.861111,0.492308,0.296296,0.692308,0.576923,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,29.384676,1
6,0.888889,0.523077,0.433862,0.717949,0.576923,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,37.729725,2
7,0.916667,0.630769,0.444444,0.717949,0.615385,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,29.983588,2
8,0.527778,0.528205,0.31746,0.666667,0.538462,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,28.440955,0
9,0.694444,0.558974,0.301587,0.666667,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,25.28257,0


In [55]:
# save normalized data
df_normalized.to_csv('normalized_data.csv', index=False)