In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import print_column_with_nan, clean_value, get_columns_with_nan

In [10]:
# Load the full dataset (judging by the file name, already reduced to the
# selected features). The path is relative to the notebook's working directory.
df = pd.read_csv('../data-new/data-selected-features.csv')

In [1]:
# Predictor column names. The three groups below partition this list exactly.
FEATURES = [
    'I RENAL AGUDA',
    'NUMERO VASOS',
    'SANGRADO MAYOR*',
    'FUNCION VENTRICULAR IZQ',
    'CRM',
    'CREAT',
    'KILLIP Ingreso',
    'GB',
    'Peor KILLIP',
    'INOTROPICOS',
    'TAS INGRESO',
    'GLUCEMIA INGR',
]

# Columns treated as numerical.
NUMERICAL_FEATURES = [
    'CREAT',
    'TAS INGRESO',
    'GB',
    'GLUCEMIA INGR',
]

# Columns treated as categorical with more than two classes.
CATEGORICAL_MULTI_CLASS_FEATURES = [
    'NUMERO VASOS',
    'FUNCION VENTRICULAR IZQ',
    'KILLIP Ingreso',
    'Peor KILLIP',
]

# Columns treated as binary categorical.
CATEGORICAL_BINARY_FEATURES = [
    'I RENAL AGUDA',
    'SANGRADO MAYOR*',
    'CRM',
    'INOTROPICOS',
]

# Every categorical column: binary ones first, then multi-class.
CATEGORY_FEATURES = [
    *CATEGORICAL_BINARY_FEATURES,
    *CATEGORICAL_MULTI_CLASS_FEATURES,
]

# Name of the label column.
TARGET = "MUERTE HOSP"

In [12]:
# Report missing values per column of the full dataset. Per the printed output,
# the helper (defined in utils) lists each column's NaN count and percentage.
print_column_with_nan(df)

I RENAL AGUDA NaNs: 2 - 0.11001100110011001%
NUMERO VASOS NaNs: 2 - 0.11001100110011001%
SANGRADO MAYOR* NaNs: 4 - 0.22002200220022003%
FUNCION VENTRICULAR IZQ NaNs: 8 - 0.44004400440044006%
CRM NaNs: 3 - 0.16501650165016502%
CREAT NaNs: 10 - 0.5500550055005501%
KILLIP Ingreso NaNs: 0 - 0.0%
GB NaNs: 66 - 3.6303630363036303%
Peor KILLIP NaNs: 0 - 0.0%
INOTROPICOS NaNs: 18 - 0.9900990099009901%
TAS INGRESO NaNs: 12 - 0.6600660066006601%
GLUCEMIA INGR NaNs: 83 - 4.565456545654565%
MUERTE HOSP NaNs: 0 - 0.0%


In [13]:
# Same report, with a second argument of 2. Judging by the output, it appears to
# act as a minimum missing-percentage threshold: only GB and GLUCEMIA INGR, both
# above 2%, are listed. TODO confirm against utils.print_column_with_nan.
print_column_with_nan(df, 2)

GB NaNs: 66 - 3.6303630363036303%
GLUCEMIA INGR NaNs: 83 - 4.565456545654565%


Vamos a hacer una imputación de los datos faltantes. Para las variables numéricas, usamos la mediana. Para las categóricas, usamos la moda.

Elegimos este método ya que es el más sencillo y, como el porcentaje de valores faltantes es bajo (menos del 5% en todas las variables), altera muy poco la distribución de las variables.

Hacemos un fit sobre los datos de entrenamiento para obtener las medianas y modas de las variables. Luego usamos ese mismo fit para imputar los datos faltantes de train, val y test, de modo que no se filtre información de val ni de test hacia los valores imputados.

In [14]:
# Read the features (X) and labels (y) of each pre-split partition from disk.
# Order of reads: train, then validation, then test.
X_train, y_train = (
    pd.read_csv('../data-new/train/X_train.csv'),
    pd.read_csv('../data-new/train/y_train.csv'),
)
X_val, y_val = (
    pd.read_csv('../data-new/val/X_val.csv'),
    pd.read_csv('../data-new/val/y_val.csv'),
)
X_test, y_test = (
    pd.read_csv('../data-new/test/X_test.csv'),
    pd.read_csv('../data-new/test/y_test.csv'),
)

In [15]:
from sklearn.impute import SimpleImputer

# Numerical features are filled with the median and categorical features with
# the most frequent value. Both imputers are fitted on the training split only,
# so the fill values never depend on validation or test data.
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

median_imputer.fit(X_train[NUMERICAL_FEATURES])
mode_imputer.fit(X_train[CATEGORY_FEATURES])


def impute_split(X):
    """Return a copy of ``X`` with missing values filled by the fitted imputers.

    Parameters
    ----------
    X : pandas.DataFrame
        A data split containing every column in ``NUMERICAL_FEATURES`` and
        ``CATEGORY_FEATURES``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` whose numerical columns were passed through
        ``median_imputer`` and whose categorical columns were passed through
        ``mode_imputer``. Any other column is left untouched.

    Raises
    ------
    AssertionError
        If any imputed column still contains a missing value.
    """
    X_imputed = X.copy()
    # Always transform the untouched input frame, never the partially filled copy.
    X_imputed[NUMERICAL_FEATURES] = median_imputer.transform(X[NUMERICAL_FEATURES])
    X_imputed[CATEGORY_FEATURES] = mode_imputer.transform(X[CATEGORY_FEATURES])

    # Invariant: imputation must leave no missing value in the columns it handled.
    imputed_columns = NUMERICAL_FEATURES + CATEGORY_FEATURES
    assert not X_imputed[imputed_columns].isna().any().any(), \
        "imputed columns still contain missing values"
    return X_imputed


X_train_imputed = impute_split(X_train)
X_val_imputed = impute_split(X_val)
X_test_imputed = impute_split(X_test)

In [16]:
# Persist each imputed split next to its source file, without the index column.
for imputed_split, output_csv in (
    (X_train_imputed, '../data-new/train/X_train_imputed.csv'),
    (X_val_imputed, '../data-new/val/X_val_imputed.csv'),
    (X_test_imputed, '../data-new/test/X_test_imputed.csv'),
):
    imputed_split.to_csv(output_csv, index=False)

La imputación de los datos la vamos a hacer en dos partes. Para aquellas features cuyo porcentaje de valores faltantes es mayor o igual al 2% del total, imputamos mediante la técnica MICE (Multiple Imputation by Chained Equations). Para las features con menos del 2% de valores faltantes, hacemos una imputación aleatoria, ya que no afecta la distribución de la feature en particular.