# Exploratory Data Analysis

In [71]:
# Imports...

import pandas as pd 
import numpy as np
from matplotlib import pyplot
import plotly.graph_objects as go
import plotly.express as px

## 1. Data intake and preliminary inspection

In [72]:
# Read the heart-disease CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../Datos/heart_2020_cleaned.csv")

In [73]:
# Preview the first ten rows of the raw data.
df.head(n=10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No


In [74]:
# Number of missing values per column (isna is the canonical name of the isnull alias).
df.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [75]:
# Data type of each column.
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [76]:
# Keep only the object-dtype columns; they are handled as the categorical variables below.
categorical = df.select_dtypes(include=["object"])

In [77]:
# First five rows of the categorical subset (five is head()'s default).
categorical.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer
0,No,Yes,No,No,No,Female,55-59,White,Yes,Yes,Very good,Yes,No,Yes
1,No,No,No,Yes,No,Female,80 or older,White,No,Yes,Very good,No,No,No
2,No,Yes,No,No,No,Male,65-69,White,Yes,Yes,Fair,Yes,No,No
3,No,No,No,No,No,Female,75-79,White,No,No,Good,No,No,Yes
4,No,No,No,No,Yes,Female,40-44,White,No,Yes,Very good,No,No,No


In [78]:
# The target variable is heavily imbalanced.
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated (pandas >= 2.1).
categorical["HeartDisease"].value_counts()
    

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

In [79]:
# Print the frequency table of every categorical column under a small text header.
# Series.value_counts() replaces the top-level pd.value_counts(), deprecated in pandas >= 2.1.
for col in categorical.columns:
    print("\n" + str(col))
    print("================")
    print(categorical[col].value_counts())


HeartDisease
No     292422
Yes     27373
Name: HeartDisease, dtype: int64

Smoking
No     187887
Yes    131908
Name: Smoking, dtype: int64

AlcoholDrinking
No     298018
Yes     21777
Name: AlcoholDrinking, dtype: int64

Stroke
No     307726
Yes     12069
Name: Stroke, dtype: int64

DiffWalking
No     275385
Yes     44410
Name: DiffWalking, dtype: int64

Sex
Female    167805
Male      151990
Name: Sex, dtype: int64

AgeCategory
65-69          34151
60-64          33686
70-74          31065
55-59          29757
50-54          25382
80 or older    24153
45-49          21791
75-79          21482
18-24          21064
40-44          21006
35-39          20550
30-34          18753
25-29          16955
Name: AgeCategory, dtype: int64

Race
White                             245212
Hispanic                           27446
Black                              22939
Other                              10928
Asian                               8068
American Indian/Alaskan Native      5202
Name: Race

## 2. Basic visualizations

### 2.1 Distribution of clients by ethnicity

In [80]:
# Clients per ethnic group; the defaults are spelled out: sorted, most frequent first.
level_count = df["Race"].value_counts(sort=True, ascending=False)
level_count

White                             245212
Hispanic                           27446
Black                              22939
Other                              10928
Asian                               8068
American Indian/Alaskan Native      5202
Name: Race, dtype: int64

In [81]:
# Horizontal bar chart: number of clients per ethnic group.

# Reorder so the chart is displayed from largest to smallest.
# Reassign instead of sorting in place, so the Series object displayed earlier is not mutated.
level_count = level_count.sort_values(ascending=True)

# One colour per bar: listed largest-first, then reversed to follow the ascending order above.
colors = ["red", "gold", "mediumseagreen", "lightblue", "indigo", "black"][::-1]

data = [
    go.Bar(
        y=level_count.index,
        x=level_count,
        name="Clients race",
        marker_color=colors,
        width=0.65,  # a scalar applies the same width to every bar
        orientation="h",
    )
]

layout = go.Layout(
    title="Distribution of clients according to race",
    yaxis_title="Ethnic group",
    xaxis_title="Number of clients",
)

fig = go.Figure(data=data, layout=layout)

fig.show()

### 2.2 Distribution of clients by sex

In [90]:
# Clients per sex; the defaults are spelled out: sorted, most frequent first.
level_count = df["Sex"].value_counts(sort=True, ascending=False)
level_count

Female    167805
Male      151990
Name: Sex, dtype: int64

In [91]:
# Horizontal bar chart: number of clients per sex.

# Reorder so the chart is displayed from largest to smallest.
# Reassign instead of sorting in place, so the Series object displayed earlier is not mutated.
level_count = level_count.sort_values(ascending=True)

# The six-colour palette is longer than the number of bars here. Trim it to one colour
# per bar; taking the leading entries of the reversed palette keeps the colours the
# bars were already drawn with.
palette = ["red", "gold", "mediumseagreen", "lightblue", "indigo", "black"]
colors = palette[::-1][: len(level_count)]

data = [
    go.Bar(
        y=level_count.index,
        x=level_count,
        name="Clients Sex",
        marker_color=colors,
        width=0.65,  # a scalar applies the same width to every bar
        orientation="h",
    )
]

layout = go.Layout(
    title="Distribution of clients according to sex",
    yaxis_title="Sex",
    xaxis_title="Number of clients",
)

fig = go.Figure(data=data, layout=layout)

fig.show()

### 2.3 Incidence of diabetes among clients

In [84]:
# Clients per diabetes status; the defaults are spelled out: sorted, most frequent first.
level_count = df["Diabetic"].value_counts(sort=True, ascending=False)
level_count

No                         269653
Yes                         40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: Diabetic, dtype: int64

In [85]:
# Horizontal bar chart: number of clients per diabetes status.

# Reorder so the chart is displayed from largest to smallest.
# Reassign instead of sorting in place, so the Series object displayed earlier is not mutated.
level_count = level_count.sort_values(ascending=True)

# One colour per bar: listed largest-first, then reversed to follow the ascending order above.
colors = ["lightblue", "seagreen", "mediumseagreen", "lightgreen"][::-1]

data = [
    go.Bar(
        y=level_count.index,
        x=level_count,
        # Trace name, title and axis label describe the "Diabetic" column plotted here;
        # they previously carried the labels of the race chart.
        name="Clients diabetes status",
        marker_color=colors,
        width=0.65,  # a scalar applies the same width to every bar
        orientation="h",
    )
]

layout = go.Layout(
    title="Distribution of clients according to diabetes status",
    yaxis_title="Diabetes status",
    xaxis_title="Number of clients",
)

fig = go.Figure(data=data, layout=layout)

fig.show()

## 3. Encoding of categorical variables

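We will use **one-hot** encoding, dropping the first level of each variable (`drop_first=True`), so a variable with *k* categories becomes *k* − 1 indicator columns.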

In [86]:
# One-hot encode the categorical columns, dropping the first level of each variable.
# dtype is pinned to uint8 (the default up to pandas 1.x) so the indicators stay 0/1
# on pandas >= 2.0, where the default dtype became bool.
dummies = pd.get_dummies(categorical, drop_first=True, dtype=np.uint8)

In [87]:
# First five rows of the encoded frame (five is head()'s default).
dummies.head()

Unnamed: 0,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,AgeCategory_25-29,AgeCategory_30-34,AgeCategory_35-39,AgeCategory_40-44,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,0,1,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,1,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,1,0,1,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0


In [88]:
# Names of the indicator columns produced by the encoding.
dummies.columns

Index(['HeartDisease_Yes', 'Smoking_Yes', 'AlcoholDrinking_Yes', 'Stroke_Yes',
       'DiffWalking_Yes', 'Sex_Male', 'AgeCategory_25-29', 'AgeCategory_30-34',
       'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
       'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
       'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
       'AgeCategory_80 or older', 'Race_Asian', 'Race_Black', 'Race_Hispanic',
       'Race_Other', 'Race_White', 'Diabetic_No, borderline diabetes',
       'Diabetic_Yes', 'Diabetic_Yes (during pregnancy)',
       'PhysicalActivity_Yes', 'GenHealth_Fair', 'GenHealth_Good',
       'GenHealth_Poor', 'GenHealth_Very good', 'Asthma_Yes',
       'KidneyDisease_Yes', 'SkinCancer_Yes'],
      dtype='object')