# Análisis Exploratorio de Datos

In [1]:
# Importar librerías necesarias
import pandas as pd
from analisis_univariante_morado import clean_to_float, dataset_summary, plot_univariate
from analisis_multivariante_morado import multivariate_summary_by_label, plot_numeric_by_label, plot_categorical_by_label ,plot_correlation_matrix

# Load the training set from the CSV that sits next to the notebook
train_csv_path = './datasets/train.csv'
df = pd.read_csv(train_csv_path)

# Last expression of the cell: Jupyter renders the first rows as a table
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,-9.0,-9.0,7.0,2
2,63.0,1.0,4.0,140.0,0.0,?,2.0,149.0,0.0,2.0,1.0,?,?,2
3,52.0,0.0,2.0,140.0,-9.0,0.0,0.0,140.0,0.0,0.0,-9.0,-9.0,-9.0,0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3


## Limpieza básica de los datos

In [2]:
# Clean the data: run the project helper clean_to_float on the raw frame and
# bind the result to a new name (the name `df` is not rebound here).
# NOTE(review): the preview of `df` shows '?' and -9.0 entries; presumably
# clean_to_float turns those into NaN and casts every column (including
# `label`) to float — the dtypes and null counts printed further down are
# consistent with that, but confirm in analisis_univariante_morado.
df_clean = clean_to_float(df)

## Resumen y análisis univariante

In [3]:
# Build the dataset summary once, then print every section under its banner
summary = dataset_summary(df_clean)

# (banner, key in `summary`) pairs in display order; each banner after the
# first begins with a newline so consecutive sections are separated by a
# blank line in the output
summary_sections = (
    ("----- Shape -----", "shape"),
    ("\n----- Tipos de datos -----", "dtypes"),
    ("\n----- Valores nulos -----", "missing_values"),
    ("\n----- Estadísticos -----", "basic_stats"),
)

for banner, section_key in summary_sections:
    print(banner)
    print(summary[section_key])

----- Shape -----
(732, 14)

----- Tipos de datos -----
age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
label       float64
dtype: object

----- Valores nulos -----
age           0
sex           0
cp            0
trestbps     47
chol         21
fbs          66
restecg       0
thalach      44
exang        44
oldpeak      59
slope       247
ca          479
thal        379
label         0
dtype: int64

----- Estadísticos -----
          count        mean         std   min    25%    50%     75%    max
age       732.0   53.364754    9.306868  28.0   47.0   54.0   60.00   76.0
sex       732.0    0.789617    0.407859   0.0    1.0    1.0    1.00    1.0
cp        732.0    3.250000    0.923363   1.0    3.0    4.0    4.00    4.0
trestbps  685.0  131.975182   19.203305   0.0  120.0  130.0 

In [4]:
# Plot the univariate analysis: a single call to the project helper
# plot_univariate on the cleaned frame; its result is not stored.
plot_univariate(df_clean)

## Análisis multivariante

In [5]:
# Per-label summary of the cleaned frame, with "label" as the target column
summary_mv = multivariate_summary_by_label(df_clean, label_col="label")

# Print the class counts first, then the per-label means
for table_key in ("counts_by_label", "mean_by_label"):
    print(summary_mv[table_key])

label
0.0    327
1.0    156
2.0    108
3.0    107
4.0     34
dtype: int64
             age    trestbps        chol     thalach   oldpeak
label                                                         
0.0    50.461774  130.376206  229.552716  149.607717  0.410714
1.0    54.320513  128.891892  168.225806  134.601351  0.914286
2.0    56.314815  135.290000  175.257143  128.019802  1.394949
3.0    57.532710  136.557895  177.542857  121.718750  1.595745
4.0    54.411765  138.000000  218.181818  124.093750  2.225000


In [6]:
# Box plots of the continuous variables against the label.
# The column name "label" is passed as label_col — presumably the column the
# helper groups by; confirm in analisis_multivariante_morado.
plot_numeric_by_label(df_clean, label_col="label")

In [7]:
# Encoded categorical variables against the label.
# The column name "label" is passed as label_col — presumably the column the
# helper groups by; confirm in analisis_multivariante_morado.
plot_categorical_by_label(df_clean, label_col="label")

In [None]:
# Correlation matrix: the helper's return value is kept so it can be indexed
corr = plot_correlation_matrix(df_clean)

# Take the "label" column of the result and list it from the largest value
# down to the smallest
label_corr = corr["label"]
print(label_corr.sort_values(ascending=False))

label       1.000000
ca          0.501912
oldpeak     0.496292
thal        0.468602
exang       0.427005
cp          0.397885
slope       0.351605
sex         0.270454
age         0.264917
trestbps    0.138043
fbs         0.134370
restecg     0.049529
chol       -0.149165
thalach    -0.407338
Name: label, dtype: float64
