# Setup

In [17]:
# Select TensorFlow as the Keras backend through the KERAS_BACKEND environment variable.
# NOTE(review): presumably this has to be set before anything imports Keras, so keep it
# ahead of the imports below -- confirm against the pipeline code.
%env KERAS_BACKEND=tensorflow

import numpy as np
import pandas as pd

from sutter.lib.model.pipeline import Pipeline
import sutter.lib.model.feature_selectors as fs
import sutter.lib.model.splitters as sp

# Lift the pandas row-display limit so the summary tables below are shown in full.
pd.set_option("display.max_rows", None)

env: KERAS_BACKEND=tensorflow


In [6]:
# "Dummy splitter" that uses the entire feature matrix for both the train and test sets.
class DummySplitter(sp.Splitter):
    """Splitter that does not partition the data.

    The single fold it produces reuses the complete feature matrix (and the
    complete label vector) as both the training set and the test set.
    """

    def split(self, features_df, labels_df, label_period_days=30, seed=None):
        """Return a one-element list holding a single train/test fold.

        Parameters
        ----------
        features_df
            Feature matrix; placed unchanged (same object, no copy) under both
            "X_train" and "X_test".
        labels_df
            Raw labels, converted by ``self._prepare_labels_vector``
            (presumably inherited from ``sp.Splitter`` -- not defined here).
        label_period_days
            Forwarded to ``_prepare_labels_vector``; defaults to 30.
        seed
            Accepted for signature compatibility only; never read, since
            nothing here is randomised.

        Returns
        -------
        list
            ``[fold]`` where ``fold`` is a dict with the keys "X_train",
            "X_test", "y_train" and "y_test".
        """
        label_vector = self._prepare_labels_vector(labels_df, label_period_days)

        # Train and test entries deliberately alias the same objects.
        only_fold = dict(
            X_train=features_df,
            X_test=features_df,
            y_train=label_vector,
            y_test=label_vector,
        )
        return [only_fold]

In [2]:
# Construct the pipeline from features.csv; %time reports how long the construction takes
# (about six and a half minutes of wall time in the recorded run below).
%time p = Pipeline('features.csv')

Loaded 364942 rows.
Found 1667 columns:
 - 1478 boolean
 - 83 numeric
 - 15 categorical (106 when dummified)
 - 0 uncategorized
 * 4 labels
CPU times: user 3min 40s, sys: 2min 42s, total: 6min 23s
Wall time: 6min 33s


  exec(code, glob, local_ns)


In [8]:
# Build the folds with DummySplitter, then take the training features and
# labels of the first fold as the working data set for the summaries below.
p.build_train_and_test_sets(splitter=DummySplitter())
X, y = p.folds[0]['X_train'], p.folds[0]['y_train']

Removing 12002 visits where discharge was within 30 days of the end of our data, 2015-12-31


In [23]:
# Compute the descriptive statistics once for every column of X; the cells below
# only select from and re-display this table.
summary = X.describe()

# Summary statistics for 100-feature subset

In [9]:
# Fit a TopCorrelationSelector configured with 100 on the features and labels, then
# keep the feature names it exposes as `top_n_features`.
# NOTE(review): judging by the class name, features are presumably ranked by their
# correlation with y -- confirm in sutter.lib.model.feature_selectors.
fs_top100 = fs.TopCorrelationSelector(100)
fs_top100.fit(X, y)
top100_features = fs_top100.top_n_features

In [26]:
# Restrict the summary to the 100 selected features and flip it so that each
# feature is a row, ordered by feature name.
summary[top100_features].transpose().sort_index()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AdmissionExtractor__acuity_lace,323813,1.300019,1.486612,0,0.0,0.0,3.0,3.0
AdmissionExtractor__admission_type_cat_elective,323813,0.273627,0.44582,0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_emergency,323813,0.43334,0.495537,0,0.0,0.0,1.0,1.0
BasicDemographicsExtractor__age,323813,58.621887,20.48278,18,40.0,60.0,76.0,115.0
BasicDemographicsExtractor__age^2,323813,3856.141081,2397.492831,324,1600.0,3600.0,5776.0,13225.0
BasicDemographicsExtractor__age^3,323813,274535.095392,231244.140823,5832,64000.0,216000.0,438976.0,1520875.0
BasicDemographicsExtractor__if_female_bool,323813,0.619233,0.485538,0,0.0,1.0,1.0,1.0
BasicDemographicsExtractor__marital_status_cat_married,323813,0.467569,0.498948,0,0.0,0.0,1.0,1.0
BasicDemographicsExtractor__race_cat_black,323813,0.11191,0.315257,0,0.0,0.0,0.0,1.0
BasicDemographicsExtractor__tabak_age,323813,6.901467,6.361422,0,0.0,6.0,12.4,28.0


# Summary statistics for 500-feature subset

In [10]:
# Fit a TopCorrelationSelector configured with 500 on the features and labels, then
# keep the feature names it exposes as `top_n_features`.
# NOTE(review): judging by the class name, features are presumably ranked by their
# correlation with y -- confirm in sutter.lib.model.feature_selectors.
fs_top500 = fs.TopCorrelationSelector(500)
fs_top500.fit(X, y)
top500_features = fs_top500.top_n_features

In [27]:
# Restrict the summary to the 500 selected features and flip it so that each
# feature is a row, ordered by feature name.
summary[top500_features].transpose().sort_index()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AdmissionExtractor__acuity_lace,323813,1.300019,1.486612,0.0,0.0,0.0,3.0,3.0
AdmissionExtractor__admission_source_cat_home,323813,0.930293,0.254653,0.0,1.0,1.0,1.0,1.0
AdmissionExtractor__admission_source_cat_transfer,323813,0.049729,0.217386,0.0,0.0,0.0,0.0,1.0
AdmissionExtractor__admission_time_cat_afternoon,323813,0.268034,0.442936,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_time_cat_evening,323813,0.435572,0.495832,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_elective,323813,0.273627,0.44582,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_emergency,323813,0.43334,0.495537,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_other,323813,0.013835,0.116807,0.0,0.0,0.0,0.0,1.0
AdmissionExtractor__admission_type_cat_urgent,323813,0.279198,0.448606,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_weekday_cat_saturday,323813,0.103967,0.305219,0.0,0.0,0.0,0.0,1.0


# Summary statistics for full feature set

In [28]:
# Show the statistics for every feature, one feature per row, ordered by feature name.
summary.transpose().sort_index()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AdmissionExtractor__acuity_lace,323813,1.300019,1.486612,0.0,0.0,0.0,3.0,3.0
AdmissionExtractor__admission_source_cat_home,323813,0.930293,0.254653,0.0,1.0,1.0,1.0,1.0
AdmissionExtractor__admission_source_cat_other,323813,0.018359,0.134247,0.0,0.0,0.0,0.0,1.0
AdmissionExtractor__admission_source_cat_outpatient,323813,0.00097,0.031125,0.0,0.0,0.0,0.0,1.0
AdmissionExtractor__admission_source_cat_transfer,323813,0.049729,0.217386,0.0,0.0,0.0,0.0,1.0
AdmissionExtractor__admission_time_cat_afternoon,323813,0.268034,0.442936,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_time_cat_evening,323813,0.435572,0.495832,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_time_cat_morning,323813,0.296393,0.456667,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_elective,323813,0.273627,0.44582,0.0,0.0,0.0,1.0,1.0
AdmissionExtractor__admission_type_cat_emergency,323813,0.43334,0.495537,0.0,0.0,0.0,1.0,1.0
