## FeatureTools Feature Engineering

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import featuretools as ft
from woodwork.logical_types import Categorical

In [5]:
training = pd.read_csv('../data/training_v2.csv')

In [19]:
training.sort_values(by=['patient_id', 'encounter_id']).head(3)

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
25131,93505,1,55,1,53.0,25.925926,0,Caucasian,M,180.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
46822,106180,2,182,0,37.0,51.752881,0,African American,F,152.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
78310,53841,3,103,0,69.0,26.272919,0,,M,167.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular


In [21]:
training.groupby('patient_id').size().value_counts()

1    91713
dtype: int64

In [15]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Columns: 186 entries, encounter_id to apache_2_bodysystem
dtypes: float64(170), int64(8), object(8)
memory usage: 130.1+ MB


In [23]:
[x for x in training.columns]

['encounter_id',
 'patient_id',
 'hospital_id',
 'hospital_death',
 'age',
 'bmi',
 'elective_surgery',
 'ethnicity',
 'gender',
 'height',
 'hospital_admit_source',
 'icu_admit_source',
 'icu_id',
 'icu_stay_type',
 'icu_type',
 'pre_icu_los_days',
 'readmission_status',
 'weight',
 'albumin_apache',
 'apache_2_diagnosis',
 'apache_3j_diagnosis',
 'apache_post_operative',
 'arf_apache',
 'bilirubin_apache',
 'bun_apache',
 'creatinine_apache',
 'fio2_apache',
 'gcs_eyes_apache',
 'gcs_motor_apache',
 'gcs_unable_apache',
 'gcs_verbal_apache',
 'glucose_apache',
 'heart_rate_apache',
 'hematocrit_apache',
 'intubated_apache',
 'map_apache',
 'paco2_apache',
 'paco2_for_ph_apache',
 'pao2_apache',
 'ph_apache',
 'resprate_apache',
 'sodium_apache',
 'temp_apache',
 'urineoutput_apache',
 'ventilated_apache',
 'wbc_apache',
 'd1_diasbp_invasive_max',
 'd1_diasbp_invasive_min',
 'd1_diasbp_max',
 'd1_diasbp_min',
 'd1_diasbp_noninvasive_max',
 'd1_diasbp_noninvasive_min',
 'd1_heartrate_m

In [8]:
X_train, y_train, X_test, y_test = train_test_split(training, training['hospital_death'], test_size=0.2, random_state=42)

In [20]:
X_train.sort_values(by=['patient_id', 'encounter_id']).head(3)

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
25131,93505,1,55,1,53.0,25.925926,0,Caucasian,M,180.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
46822,106180,2,182,0,37.0,51.752881,0,African American,F,152.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
78310,53841,3,103,0,69.0,26.272919,0,,M,167.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular


In [31]:
# Create an empty entityset
es = ft.EntitySet(id='patient_encounters')

In [34]:
# Create an entity from the dataframe
es = es.entity_from_dataframe(entity_id='patient_encounters', dataframe=X_train, index='encounter_id')

AttributeError: 'EntitySet' object has no attribute 'entity_from_dataframe'

In [33]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="X_train",
    agg_primitives=["count", "min", "max", "mean", "mode"],
    trans_primitives=["percentile"],
    max_depth=1,
)

KeyError: 'Provided target dataframe X_train does not exist in patient_encounters'

In [None]:
feature_matrix

In [None]:
# Define variable types
variable_types = {
    'hospital_death': ft.variable_types.Boolean,
    'age': ft.variable_types.Numeric,
    'bmi': ft.variable_types.Numeric,
    'elective_surgery': ft.variable_types.Boolean,
    'ethnicity': ft.variable_types.Categorical,
    'gender': ft.variable_types.Categorical,
    'height': ft.variable_types.Numeric,
    'hospital_admit_source': ft.variable_types.Categorical,
    'icu_admit_source': ft.variable_types.Categorical,
    'icu_admit_type': ft.variable_types.Categorical,
    'icu_id': ft.variable_types.Id,
    'icu_stay_type': ft.variable_types.Categorical,
    'icu_type': ft.variable_types.Categorical,
    'pre_icu_los_days': ft.variable_types.Numeric,
    'readmission_status': ft.variable_types.Boolean,
    'weight': ft.variable_types.Numeric,
    'albumin_apache': ft.variable_types.Numeric,
    'apache_2_diagnosis': ft.variable_types.Categorical,
    'apache_3j_diagnosis': ft.variable_types.Categorical,
    'apache_post_operative': ft.variable_types.Boolean,
    'arf_apache': ft.variable_types.Boolean,
    'bilirubin_apache': ft.variable_types.Numeric,
    'bun_apache': ft.variable_types.Numeric,
    'creatinine_apache': ft.variable_types.Numeric,
    'fio2_apache': ft.variable_types.Numeric,
    'gcs_eyes_apache': ft.variable_types.Numeric,
    'gcs_motor_apache': ft.variable_types.Numeric,
    'gcs_unable_apache': ft.variable_types.Boolean,
    'gcs_verbal_apache': ft.variable_types.Numeric,
    'glucose_apache': ft.variable_types.Numeric,
    'heart_rate_apache': ft.variable_types.Numeric,
    'hematocrit_apache': ft.variable_types.Numeric,
    'intubated_apache': ft.variable_types.Boolean,
    'map_apache': ft.variable_types.Numeric,
    'paco2_apache': ft.variable_types.Numeric,
    'paco2_for_ph_apache': ft.variable_types.Numeric,
    'pao2_apache': ft.variable_types.Numeric,
    'ph_apache': ft.variable_types.Numeric,
    'resprate_apache': ft.variable_types.Numeric,
    'sodium_apache': ft.variable_types.Numeric,
    'temp_apache': ft.variable_types.Numeric,
    'urineoutput_apache': ft.variable_types.Numeric,
    'ventilated_apache': ft.variable_types.Boolean,
    'wbc_apache': ft.variable_types.Numeric,
    'd1_diasbp_invasive_max': ft.variable_types.Numeric,
    'd1_diasbp_invasive_min': ft.variable_types.Numeric,
    'd1_diasbp_max': ft.variable_types.Numeric,
    'd1_diasbp_min': ft.variable_types.Numeric,
    'd1_diasbp_noninvasive_max': ft.variable_types.Numeric,
    'd1_diasbp_noninvasive_min': ft.variable_types.Numeric,
    'd1_heartrate_max': ft.variable_types.Numeric,
    'd1_heartrate_min': ft.variable_types.Numeric,
    'd1_mbp_invasive_max': ft.variable_types.Numeric,
    'd1_mbp_invasive_min': ft.variable_types.Numeric,
    'd1_mbp_max': ft.variable_types.Numeric,
    'd1_mbp_min': ft.variable_types.Numeric,
    'd1_mbp_noninvasive_max': ft.variable_types.Numeric,
    'd1_mbp_noninvasive_min': ft.variable_types.Numeric,
    # Add more variable types for other columns as necessary
}

# Add the variable types to the entityset
es = es.add_variables(variable_types)

# Define the relationships between entities
relationships = []

# Perform deep feature synthesis
features, feature_defs = ft.dfs(
    entityset=es,
    target_entity='patients',
    agg_primitives=['count', 'mean', 'max', 'min', 'mode'],
    trans_primitives=['month', 'day', 'year'],
    verbose=True,
)

# Print the generated features
print(features.head())
