In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np

In [58]:
# Read the semicolon-separated raw data set, then show it as the cell output.
cardio = pd.read_csv("utils/cardio.csv", sep=";")
cardio

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [59]:
# Shape first, then the per-column dtypes, each on its own line.
print(cardio.shape, cardio.dtypes, sep="\n")

(70000, 13)
id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object


In [60]:
# Age arrives as a number of days; this helper expresses it in years.
def age_years(age):
    """Convert an age given in days to years.

    Divides by 365 and returns the (fractional) result; no rounding is
    done here.
    """
    return age / 365


# Convert age from days to years and round up to the next whole year.
# age_years and np.ceil are applied to the whole column at once instead of
# element by element through .apply; the resulting values are the same.
# NOTE: "age" is overwritten in place, so running this cell a second time
# would divide by 365 again -- reload the CSV before re-running it.
cardio["age"] = np.ceil(age_years(cardio["age"])).astype(int)

In [61]:
# The coded qualitative columns are labels, not quantities: store them as strings.
qualitative_columns = [
    "gender",
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
    "cardio",
]
cardio[qualitative_columns] = cardio[qualitative_columns].astype(str)

In [62]:
# Replace the abbreviated column names with descriptive ones (in place).
descriptive_names = {
    "ap_hi": "systolic",
    "ap_lo": "diastolic",
    "gluc": "glucose",
    "alco": "alcohol_intake",
    "active": "physical_activity",
    "cardio": "cv_disease",
}
cardio.rename(columns=descriptive_names, inplace=True)

In [63]:
# Swap the string codes for readable values, one mapping per column
# (gender: 1 = Female, 2 = Male; level columns: 1/2/3; yes/no columns: 1/0).
level_labels = {"1": "Normal", "2": "High", "3": "Extremely High"}
yes_no = {"1": True, "0": False}
value_labels = {
    "gender": {"1": "Female", "2": "Male"},
    "cholesterol": level_labels,
    "glucose": level_labels,
    "smoke": yes_no,
    "alcohol_intake": yes_no,
    "physical_activity": yes_no,
    "cv_disease": yes_no,
}
for column, labels in value_labels.items():
    # replace() returns a new Series; values without a mapping are left as they are.
    cardio[column] = cardio[column].replace(labels)

In [64]:
# Cast the four yes/no columns to a real bool dtype.
# astype(bool) turns every non-empty string (and NaN) into True, so a code
# that was not mapped to True/False beforehand -- for example "2" -- would
# silently become True. Check for such leftovers first so that bad input
# fails loudly instead of corrupting the column.
for flag_column in ["smoke", "alcohol_intake", "physical_activity", "cv_disease"]:
    leftovers = cardio.loc[~cardio[flag_column].isin([True, False]), flag_column]
    if not leftovers.empty:
        raise ValueError(
            f"{flag_column!r} has values that are not True/False: "
            f"{sorted(leftovers.astype(str).unique())}"
        )
    cardio[flag_column] = cardio[flag_column].astype(bool)

In [65]:
# New "bmi" column: weight / height^2 * 10000, kept to one decimal place.
# (The 10000 factor presumably converts height from cm to m -- confirm the
# units against the data dictionary.)
cardio["bmi"] = (
    cardio["weight"].div(cardio["height"].pow(2)).mul(10000).round(1)
)

In [66]:
# Store the ordinal columns as ordered categoricals, with the levels ranked
# in the order they are listed (Normal < High < Extremely High).
ordinal_levels = {
    "cholesterol": ["Normal", "High", "Extremely High"],
    "glucose": ["Normal", "High", "Extremely High"],
}
for column, levels in ordinal_levels.items():
    cardio[column] = cardio[column].astype(
        pd.CategoricalDtype(categories=levels, ordered=True)
    )

In [67]:
# Show the frame after cleaning: renamed columns, readable labels and the new bmi column.
cardio

Unnamed: 0,id,age,gender,height,weight,systolic,diastolic,cholesterol,glucose,smoke,alcohol_intake,physical_activity,cv_disease,bmi
0,0,51,Male,168,62.0,110,80,Normal,Normal,False,False,True,False,22.0
1,1,56,Female,156,85.0,140,90,Extremely High,Normal,False,False,True,True,34.9
2,2,52,Female,165,64.0,130,70,Extremely High,Normal,False,False,False,True,23.5
3,3,49,Male,169,82.0,150,100,Normal,Normal,False,False,True,True,28.7
4,4,48,Female,156,56.0,100,60,Normal,Normal,False,False,False,False,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,Male,168,76.0,120,80,Normal,Normal,True,False,True,False,26.9
69996,99995,62,Female,158,126.0,140,90,High,High,False,False,True,True,50.5
69997,99996,53,Male,183,105.0,180,90,Extremely High,Normal,False,True,False,True,31.4
69998,99998,62,Female,163,72.0,135,80,Normal,High,False,False,False,True,27.1


In [68]:
# Re-check the shape and the per-column dtypes after the transformations.
print(cardio.shape, cardio.dtypes, sep="\n")

(70000, 14)
id                      int64
age                     int32
gender                 object
height                  int64
weight                float64
systolic                int64
diastolic               int64
cholesterol          category
glucose              category
smoke                    bool
alcohol_intake           bool
physical_activity        bool
cv_disease               bool
bmi                   float64
dtype: object


In [70]:
# Write the cleaned frame to "cardio_final.csv" in the working directory,
# leaving out the row index.
# NOTE(review): CSV is plain text, so the ordered categorical dtype will
# presumably need to be re-applied after reading the file back -- confirm.
cardio.to_csv("cardio_final.csv", index=False)