In [6]:
import pandas as pd
from utils import print_column_with_nan
from sklearn.model_selection import train_test_split

DATASET_PATH = '../data-new/data-without-troponina.csv'
df = pd.read_csv(DATASET_PATH)

Vamos a crear un dataset sin feature selection para entrenar algunas redes neuronales pequeñas. Aplicaremos un split en los datos para obtener sets de train, validacion y test. Luego haremos una imputacion en los datos y finalmente un escalado de las variables. Vamos a usar el mismo codigo que en los notebooks anteriores.

In [1]:
# Asumptions
NUMERICAL_FEATURES = ['EDAD','PESO Kg', 'ALTURA cm', 'CREAT', 'GLUCEMIA INGR', 'GB', 'TAS INGRESO', 'FC INGRESO']
NUMERICAL_CONTINOUS = NUMERICAL_FEATURES

CATEGORICAL_MULTI_CLASS_FEATURES = ['KILLIP Ingreso', 'FUNCION VENTRICULAR IZQ', 'DIAGNOSTICO', 'NUMERO VASOS', 'Peor KILLIP']
CATEGORICAL_BINARY_FEATURES = ['NUMERO VASOS', 'SEX0', 'HTA', 'DBT', 'DLP', 'TABAQ', 'ANTEC IAM / Angina inestable', 'ANTEC BY PASS', 'ANTEC ATC', 'ANTEC ACE', 'ANTEC INS RENAL', 'ACV TIA PREV', 'EPOC', 'CLAUD ITTE', 'BB PREV', ' IECA/AT2  PREV', 'B CA PREV', 'AAS PREV', 'HIPOGLUCEMIANTES', 'DIURETICOS', 'INSULINA', 'ECG INFRA ST', 'ECG SUPRA ST', 'ECG INV T', 'BRI', 'BRD', 'ECG MCP', 'ECG FA', 'tropst', 'TnT Ultrasensible', 'Elevacion troponina T', 'AAS', 'BB', 'IECA', 'AT2', 'Clopi - prasu - tica', 'HEP sc o iv', 'IIb IIIa', 'B CA', 'ESTATINAS', 'HIPOGLUC METFORM', 'HIPOGL SULFAN', 'HIPOGL GLITAZ', 'INSULINA', 'INS + HIPOGL', 'INOTROPICOS', 'PRUEBA FUNC.', 'PRUEBA FUNC DE ALTO RIESGO', 'CCG', 'TRONCO', 'ATC PRIMARIA', 'ATC INTRAHOSP', 'TROMBOL', 'CRM', 'IAM HOSP(SI INTERNO POR ANGINA) O REIAM', 'APIAM', 'ANGINA REFRACT o RECURR', ' ACV/TIA', 'SANGRADO MAYOR*', 'I RENAL AGUDA']

CATEGORY_FEATURES = CATEGORICAL_BINARY_FEATURES + CATEGORICAL_MULTI_CLASS_FEATURES

TARGET = "MUERTE HOSP"

In [5]:
len(CATEGORY_FEATURES) + len(NUMERICAL_FEATURES)

73

## Data split

Vamos a hacer el split en train, validation y test. El porcentaje sera del 70% para train, 15% para validation y 15% para test. Esto nos deja con suficientes datos para entrenar y validar nuestros modelos.

El resultado de este split va a ser 6 archivos. Dos archivos para train y test con X e y. Dos archivos para validation y test con X e y.

In [7]:
X = df.drop(columns=TARGET)
y = df[[TARGET]]

# First split: separate test set (15%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, 
    test_size=0.15,
    random_state=42,
    stratify=y
)

# Second split: from remaining 85%, get 70/15 ratio
# To get 70% total from the 85% remaining, we need test_size = 15/85 ≈ 0.176
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=(15/85),  # This ensures we get 15% of total from the remaining 85%
    random_state=42,
    stratify=y_temp
)

In [9]:
X.shape

(1818, 72)

In [4]:
# Verify the proportions
total_samples = len(X)
print(f"Total samples: {total_samples}")
print(f"Training samples: {len(X_train)} ({len(X_train)/total_samples:.1%})")
print(f"Validation samples: {len(X_val)} ({len(X_val)/total_samples:.1%})")
print(f"Test samples: {len(X_test)} ({len(X_test)/total_samples:.1%})")

Total samples: 1818
Training samples: 1272 (70.0%)
Validation samples: 273 (15.0%)
Test samples: 273 (15.0%)


In [5]:
# Ver duplicados dentro de cada conjunto
print(f"Duplicados en X_train: {X_train.duplicated().sum()}")
print(f"Duplicados en X_val: {X_val.duplicated().sum()}")
print(f"Duplicados en X_test: {X_test.duplicated().sum()}")


Duplicados en X_train: 0
Duplicados en X_val: 0
Duplicados en X_test: 0


In [6]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(1272, 72)
(273, 72)
(273, 72)
(1272, 1)
(273, 1)
(273, 1)


In [7]:
# Remove duplicates between X_train and X_val, and drop from y_val accordingly
mask_val = ~X_val.isin(X_train.to_dict(orient='list')).all(axis=1)
X_val = X_val[mask_val]
y_val = y_val[mask_val]

# Remove duplicates between X_train and X_test, and drop from y_test accordingly
mask_test_train = ~X_test.isin(X_train.to_dict(orient='list')).all(axis=1)
X_test = X_test[mask_test_train]
y_test = y_test[mask_test_train]

# Remove duplicates between X_val and X_test, and drop from y_test accordingly
mask_test_val = ~X_test.isin(X_val.to_dict(orient='list')).all(axis=1)
X_test = X_test[mask_test_val]
y_test = y_test[mask_test_val]

# Ensure no internal duplicates in each set
X_train, y_train = X_train.drop_duplicates(), y_train.loc[X_train.index]
X_val, y_val = X_val.drop_duplicates(), y_val.loc[X_val.index]
X_test, y_test = X_test.drop_duplicates(), y_test.loc[X_test.index]

# Print final dataset shapes
print(f"New X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"New X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"New X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


New X_train shape: (1272, 72), y_train shape: (1272, 1)
New X_val shape: (123, 72), y_val shape: (123, 1)
New X_test shape: (105, 72), y_test shape: (105, 1)


## Missing values imputation

In [8]:
print_column_with_nan(df)

EDAD NaNs: 0 - 0.0%
SEX0 NaNs: 0 - 0.0%
HTA NaNs: 0 - 0.0%
DBT NaNs: 0 - 0.0%
DLP NaNs: 0 - 0.0%
TABAQ NaNs: 0 - 0.0%
ANTEC IAM / Angina inestable NaNs: 0 - 0.0%
ANTEC BY PASS NaNs: 0 - 0.0%
ANTEC ATC NaNs: 0 - 0.0%
ANTEC ACE NaNs: 0 - 0.0%
ANTEC INS RENAL NaNs: 0 - 0.0%
ACV TIA PREV NaNs: 0 - 0.0%
EPOC NaNs: 0 - 0.0%
CLAUD ITTE NaNs: 0 - 0.0%
BB PREV NaNs: 0 - 0.0%
 IECA/AT2  PREV NaNs: 0 - 0.0%
B CA PREV NaNs: 0 - 0.0%
AAS PREV NaNs: 0 - 0.0%
HIPOGLUCEMIANTES NaNs: 0 - 0.0%
DIURETICOS NaNs: 0 - 0.0%
INSULINA NaNs: 0 - 0.0%
PESO Kg NaNs: 93 - 5.115511551155116%
ALTURA cm NaNs: 120 - 6.600660066006601%
ECG INFRA ST NaNs: 0 - 0.0%
ECG SUPRA ST NaNs: 0 - 0.0%
ECG INV T NaNs: 0 - 0.0%
BRI NaNs: 1 - 0.05500550055005501%
BRD NaNs: 0 - 0.0%
ECG MCP NaNs: 0 - 0.0%
ECG FA NaNs: 1 - 0.05500550055005501%
tropst NaNs: 2 - 0.11001100110011001%
TnT Ultrasensible NaNs: 0 - 0.0%
Elevacion troponina T NaNs: 0 - 0.0%
CREAT NaNs: 10 - 0.5500550055005501%
GLUCEMIA INGR NaNs: 83 - 4.565456545654565%
GB Na

In [9]:
print_column_with_nan(df, 2)

PESO Kg NaNs: 93 - 5.115511551155116%
ALTURA cm NaNs: 120 - 6.600660066006601%
GLUCEMIA INGR NaNs: 83 - 4.565456545654565%
GB NaNs: 66 - 3.6303630363036303%


Vamos a hacer una imputación de los datos faltantes. Para las variables numericas, usamos medias. Para las categoricas, usamos moda.

Elegimos este método ya que es el más sencillo y no afecta la distribución de las variables.

Hacemos un fit de los datos de entrenamiento para obtener las medias y modas de las variables. Luego usamos el fit para imputar los datos faltantes de train, val y test.

In [10]:
from sklearn.impute import SimpleImputer

median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

median_imputer.fit(X_train[NUMERICAL_FEATURES])
mode_imputer.fit(X_train[CATEGORY_FEATURES])

X_train_imputed = X_train.copy()
X_val_imputed = X_val.copy()
X_test_imputed = X_test.copy()

X_train_imputed[NUMERICAL_FEATURES] = median_imputer.transform(X_train[NUMERICAL_FEATURES])
X_train_imputed[CATEGORY_FEATURES] = mode_imputer.transform(X_train[CATEGORY_FEATURES])

X_val_imputed[NUMERICAL_FEATURES] = median_imputer.transform(X_val[NUMERICAL_FEATURES])
X_val_imputed[CATEGORY_FEATURES] = mode_imputer.transform(X_val[CATEGORY_FEATURES])

X_test_imputed[NUMERICAL_FEATURES] = median_imputer.transform(X_test[NUMERICAL_FEATURES])
X_test_imputed[CATEGORY_FEATURES] = mode_imputer.transform(X_test[CATEGORY_FEATURES])

In [14]:
X_train_imputed = X_train_imputed.drop(columns=['INSULINA.1'])
X_val_imputed = X_val_imputed.drop(columns=['INSULINA.1'])
X_test_imputed = X_test_imputed.drop(columns=['INSULINA.1'])


## Variable Scaling

Realizamos un escalado de las variables numéricas

In [15]:
X_train_scaled = X_train_imputed.copy()
X_val_scaled = X_val_imputed.copy()
X_test_scaled = X_test_imputed.copy()

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled[NUMERICAL_FEATURES] = scaler.fit_transform(X_train_imputed[NUMERICAL_FEATURES])
X_val_scaled[NUMERICAL_FEATURES] = scaler.transform(X_val_imputed[NUMERICAL_FEATURES])
X_test_scaled[NUMERICAL_FEATURES] = scaler.transform(X_test_imputed[NUMERICAL_FEATURES])

## Save datasets

Finalmelte guardamos los datasets en otra carpeta

In [17]:
X_train_scaled.to_csv('../data-new/train/neural_network/X_train_scaled-no_selection.csv', index=False)
X_val_scaled.to_csv('../data-new/val/neural_network/X_val_scaled.csv-no_selection', index=False)
X_test_scaled.to_csv('../data-new/test/neural_network/X_test_scaled-no_selection.csv', index=False)

In [18]:
y_train.to_csv('../data-new/train/neural_network/y_train.csv', index=False)
y_val.to_csv('../data-new/val/neural_network/y_val.csv', index=False)
y_test.to_csv('../data-new/test/neural_network/y_test.csv', index=False)