In [1]:
# Bloque robusto de importación y configuración de rutas
import sys
import os
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)
import pandas as pd
import joblib
import preprocessing

In [3]:
# Carga de los tres datasets
# Dataset 1
ruta1 = '../data/dataset1_Medicaldataset.csv'
df1 = pd.read_csv(ruta1)
# Dataset 2
ruta2 = '../data/dataset2_heart.csv'
df2 = pd.read_csv(ruta2)
# Dataset 3
ruta3 = '../data/dataset3_heart_failure.csv'
df3 = pd.read_csv(ruta3)


# Preprocesamiento de los tres datasets

Este notebook realiza la limpieza, transformación y selección de atributos para los tres datasets clínicos utilizados en la predicción de riesgo de infarto.

In [4]:
# Limpieza de datos
# Dataset 1
clean1 = preprocessing.clean_data(df1)
# Dataset 2
clean2 = preprocessing.clean_data(df2)
# Dataset 3
clean3 = preprocessing.clean_data(df3)

# Mostrar tamaños después de limpieza
print('Tamaño limpio Dataset 1:', clean1.shape)
print('Tamaño limpio Dataset 2:', clean2.shape)
print('Tamaño limpio Dataset 3:', clean3.shape)

Tamaño limpio Dataset 1: (39, 9)
Tamaño limpio Dataset 2: (39, 14)
Tamaño limpio Dataset 3: (39, 13)


In [5]:
# Winsorización y log-transform según el artículo
# Dataset 1: aplicar winsorización y log a Heart rate, CK-MB, Diastolic blood pressure, Blood sugar, Troponin
cols1 = ['Heart rate', 'CK-MB', 'Diastolic blood pressure', 'Blood sugar', 'Troponin']
wins1 = preprocessing.winsorize_columns(clean1, cols1)
log1 = preprocessing.log_transform_columns(wins1, cols1)

# Dataset 2: log-transform a trestbps y chol
cols2 = ['trestbps', 'chol']
log2 = preprocessing.log_transform_columns(clean2, cols2)

# Dataset 3: log-transform a time, ejection_fraction, serum_creatinine, serum_sodium, age
cols3 = ['time', 'ejection_fraction', 'serum_creatinine', 'serum_sodium', 'age']
log3 = preprocessing.log_transform_columns(clean3, cols3)


In [6]:
# Selección de atributos (ANOVA F-score)
from sklearn.model_selection import train_test_split

# Dataset 1
X1 = log1.drop(columns=['Result'])
y1 = (log1['Result'] == 'positive').astype(int)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X1_sel, features1 = preprocessing.select_features_anova(X1_train, y1_train, k=4)

# Dataset 2
X2 = log2.drop(columns=['target'])
y2 = (log2['target'] > 0.5).astype(int)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
X2_sel, features2 = preprocessing.select_features_anova(X2_train, y2_train, k=5)

# Dataset 3
X3 = log3.drop(columns=['DEATH_EVENT'])
y3 = log3['DEATH_EVENT']
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)
X3_sel, features3 = preprocessing.select_features_anova(X3_train, y3_train, k=5)

print('Atributos seleccionados Dataset 1:', features1)
print('Atributos seleccionados Dataset 2:', features2)
print('Atributos seleccionados Dataset 3:', features3)

Atributos seleccionados Dataset 1: Index(['Gender', 'Diastolic blood pressure', 'CK-MB', 'Troponin'], dtype='object')
Atributos seleccionados Dataset 2: Index(['sex', 'trestbps', 'exang', 'oldpeak', 'slope'], dtype='object')
Atributos seleccionados Dataset 3: Index(['age', 'platelets', 'sex', 'smoking', 'time'], dtype='object')


In [7]:
# Normalización
X1_train_norm, X1_test_norm, scaler1 = preprocessing.normalize_data(X1_train[features1], X1_test[features1])
X2_train_norm, X2_test_norm, scaler2 = preprocessing.normalize_data(X2_train[features2], X2_test[features2])
X3_train_norm, X3_test_norm, scaler3 = preprocessing.normalize_data(X3_train[features3], X3_test[features3])

# Guardar conjuntos procesados y scalers para siguientes etapas
import joblib

joblib.dump((X1_train_norm, y1_train, X1_test_norm, y1_test), '../data/dataset1_processed.joblib')
joblib.dump((X2_train_norm, y2_train, X2_test_norm, y2_test), '../data/dataset2_processed.joblib')
joblib.dump((X3_train_norm, y3_train, X3_test_norm, y3_test), '../data/dataset3_processed.joblib')
joblib.dump((scaler1, scaler2, scaler3), '../data/scalers.joblib')

print('Preprocesamiento finalizado y datos guardados.')

Preprocesamiento finalizado y datos guardados.
