# 0.0 IMPORTS

## 0.1 Imports

In [5]:
import pandas as pd
import numpy as np
import csv

## 0.2 Helper Functions

## 0.3 Loading Data

In [6]:
# Load the semicolon-separated cardio training file. low_memory=False makes
# pandas parse the file in one pass instead of in internal chunks.
df_cardio_raw = pd.read_csv(
    'data/cardio_train.csv',
    sep=';',
    low_memory=False,
)

# 1.0 DATA DESCRIPTION

In [46]:
# Work on a copy so that later edits to df1 do not modify df_cardio_raw.
df1 = df_cardio_raw.copy()

In [47]:
# Preview the first rows of the working copy.
df1.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


## 1.1 Features Description

There are 3 types of input features:

**Objective:** factual information;

**Examination:** results of medical examination;

**Subjective:** information given by the patient.


### Features:

**- Age** | Objective Feature | age | int (days) |

**- Height** | Objective Feature | height | int (cm) |

**- Weight** | Objective Feature | weight | float (kg) |

**- Gender** | Objective Feature | gender | categorical code |

**- Systolic blood pressure** | Examination Feature | ap_hi | int |

**- Diastolic blood pressure** | Examination Feature | ap_lo | int |

**- Cholesterol** | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |

**- Glucose** | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |

**- Smoking** | Subjective Feature | smoke | binary |

**- Alcohol intake** | Subjective Feature | alco | binary |

**- Physical activity** | Subjective Feature | active | binary |

**- Presence or absence of cardiovascular disease** | Target Variable | cardio | binary |


All of the dataset values were collected at the moment of medical examination.

## 1.2 Rename Columns

In [48]:
# Spell out the two abbreviated column names in a single rename call.
# Reassigning the result (instead of inplace=True) keeps the cell free of
# in-place mutation; re-running it is harmless because rename ignores keys
# that are no longer present.
df1 = df1.rename(columns={'gluc': 'glucose', 'alco': 'alcohol'})

df1.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,glucose,smoke,alcohol,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


## 1.3 Data Dimensions

In [49]:
# Unpack the (rows, columns) tuple once and report each dimension.
n_rows, n_cols = df1.shape
print('Number of rows: {}'.format(n_rows))
print('Number of cols: {}'.format(n_cols))

Number of rows: 70000
Number of cols: 13


## 1.4 Data Types

In [50]:
# Show the dtype of every column as loaded from the CSV.
df1.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
glucose          int64
smoke            int64
alcohol          int64
active           int64
cardio           int64
dtype: object

## 1.5 Check NA

In [51]:
# Count the missing values in each column.
df1.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
glucose        0
smoke          0
alcohol        0
active         0
cardio         0
dtype: int64

## 1.6 Change Types

We convert the age from days to years so that it is easier to interpret and to check for outliers. For future calculations, however, we keep the original column in days, because rounding the converted values may lose information.

In [52]:
# Store weight as an integer; astype(int) drops any fractional part.
df1['weight'] = df1['weight'].astype(int)

# Add age in years (2 decimals) next to the original `age` column in days.
days_per_year = 365
df1['age_years'] = df1['age'].div(days_per_year).round(2)
df1.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,glucose,smoke,alcohol,active,cardio,age_years
0,0,18393,2,168,62,110,80,1,1,0,0,1,0,50.39
1,1,20228,1,156,85,140,90,3,1,0,0,1,1,55.42
2,2,18857,1,165,64,130,70,3,1,0,0,0,1,51.66
3,3,17623,2,169,82,150,100,1,1,0,0,1,1,48.28
4,4,17474,1,156,56,100,60,1,1,0,0,0,0,47.87


In [53]:
# Re-check the dtypes now that weight has been cast and age_years added.
df1.dtypes

id               int64
age              int64
gender           int64
height           int64
weight           int32
ap_hi            int64
ap_lo            int64
cholesterol      int64
glucose          int64
smoke            int64
alcohol          int64
active           int64
cardio           int64
age_years      float64
dtype: object

**Data types :** int64 (12), int32 (1), float64 (1).

## 1.7 Descriptive Statistics

In [54]:
# Select every numeric column. 'number' matches all integer and float widths,
# so the selection does not depend on whether astype(int) produced int32 or
# int64 on the current platform.
num_attributes = df1.select_dtypes(include='number')

### 1.7.1 Numerical Attributes

In [55]:
# Summary table with one row per numeric attribute.
# Each statistic is computed with the vectorized DataFrame reduction rather
# than by passing a Python builtin or NumPy function to apply().
col_min = num_attributes.min()
col_max = num_attributes.max()

m = pd.DataFrame({
    # Dispersion - min, max, range
    'min': col_min,
    'max': col_max,
    'range': col_max - col_min,
    # Central tendency - mean, median
    'mean': num_attributes.mean(),
    'median': num_attributes.median(),
    # ddof=0 keeps the population standard deviation that np.std computes;
    # the pandas default (ddof=1) would give the sample standard deviation.
    'std': num_attributes.std(ddof=0),
    # Shape of the distribution - skew, kurtosis
    'skew': num_attributes.skew(),
    'kurtosis': num_attributes.kurtosis(),
})

# Move the attribute names out of the index into a regular column.
m = m.rename_axis('attributes').reset_index()
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,0.0,99999.0,99999.0,49972.4199,50001.5,28851.096242,-0.001278,-1.198374
1,age,10798.0,23713.0,12915.0,19468.865814,19703.0,2467.234044,-0.307055,-0.823447
2,gender,1.0,2.0,1.0,1.349571,1.0,0.476835,0.63096,-1.601935
3,height,55.0,250.0,195.0,164.359229,165.0,8.210068,-0.642187,7.943653
4,weight,10.0,200.0,190.0,74.204329,72.0,14.39585,1.01203,2.586746
5,ap_hi,-150.0,16020.0,16170.0,128.817286,120.0,154.010319,85.296214,7580.074738
6,ap_lo,-70.0,11000.0,11070.0,96.630414,80.0,188.471184,32.114083,1425.914585
7,cholesterol,1.0,3.0,2.0,1.366871,1.0,0.680245,1.587123,0.993901
8,glucose,1.0,3.0,2.0,1.226457,1.0,0.572266,2.397462,4.294377
9,smoke,0.0,1.0,1.0,0.088129,0.0,0.283482,2.905867,6.44425


In the data we can see possible outliers:


- Minimum **height** = 55.00 and minimum **weight** = 10.00, even though the minimum **age_years** is 29.58;

- Maximum blood pressure **ap_hi** = 16020.00 and **ap_lo** = 11000.00.

## 1.8 Outliers