In [40]:
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
import warnings 
warnings.filterwarnings('ignore')
from flaml import logger, logging
logger.setLevel(logging.WARNING)

In [41]:
wids_df = pd.read_csv('training_v2.csv')
test = pd.read_csv('unlabeled.csv')
info = pd.read_csv('dict_info.csv')
wids_df['hospital_death'].value_counts()

0    83798
1     7915
Name: hospital_death, dtype: int64

In [42]:
X = wids_df.loc[:,wids_df.columns != 'hospital_death']
y = wids_df['hospital_death']

In [None]:
# IGNORE
# mapper = pd.DataFrameMapper(
#   [(continuous_col, preprocessing.StandardScaler()) for continuous_col in continuous_cols] +
#   [(categorical_col, preprocessing.LabelBinarizer()) for categorical_col in categorical_cols]
# )
# pipeline = preprocessing.Pipeline(
#   [("mapper", mapper),
#   ("estimator", estimator)]
# )
# pipeline.fit_transform(df, df["y"])

In [43]:

x_train, x_test, y_train, y_test = train_test_split(X,y,random_state=42)
#scaler = preprocessing.StandardScaler().fit(x_train)
#x_scaled = scaler.transform(x_train)

Runnning the model (took approx 9mins to run)

In [45]:

automl = AutoML()
automl.fit(x_train, y_train, task="classification", time_budget=400,verbose=0)

The best results:

In [46]:
automl.best_estimator

'xgboost'

In [47]:
automl.best_config

{'n_estimators': 721,
 'max_leaves': 248,
 'min_child_weight': 3.563471645105893,
 'learning_rate': 0.026815059544605373,
 'subsample': 0.827351358517848,
 'colsample_bylevel': 0.3309979126094584,
 'colsample_bytree': 0.7308072004331211,
 'reg_alpha': 0.0009765625,
 'reg_lambda': 0.37343817725165074}

DF of estimators and their configs tried (best)

In [51]:
configs_ests = pd.DataFrame.from_dict(automl.best_config_per_estimator)
configs_ests

Unnamed: 0,lgbm,rf,xgboost,extra_tree,xgb_limitdepth,lrl1
n_estimators,343.0,15,721.0,90,30.0,
num_leaves,103.0,,,,,
min_child_samples,2.0,,,,,
learning_rate,0.064725,,0.026815,,0.464317,
log_max_bin,10.0,,,,,
colsample_bytree,0.831798,,0.730807,,1.0,
reg_alpha,0.002015,,0.000977,,0.000977,
reg_lambda,62.376645,,0.373438,,0.086777,
max_features,,0.073721,,0.152882,,
max_leaves,,6,248.0,246,,


Losses by estimator

In [67]:
loss = sorted(automl.best_loss_per_estimator.items(),key=lambda item: item[1])
loss

[('xgboost', 0.09851426664406271),
 ('lgbm', 0.099444016176789),
 ('xgb_limitdepth', 0.11447781022660508),
 ('extra_tree', 0.12129952755502549),
 ('rf', 0.14625602551266836),
 ('lrl1', 0.38003272249215514)]

https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML link to types of estimators FLAML uses

Overall, xgboost and lgbm perform best.

In [98]:
feats_df = pd.DataFrame(automl.feature_importances_.T, columns=['value'], index=automl.feature_names_in_)

#### most important values from the model

In [107]:
feats_df.sort_values('value',ascending=False).head(30)

Unnamed: 0,value
ventilated_apache,0.034108
apache_4a_hospital_death_prob,0.032246
apache_4a_icu_death_prob,0.029346
d1_lactate_min,0.022601
gcs_motor_apache,0.021035
apache_post_operative,0.020926
gcs_eyes_apache,0.018312
elective_surgery,0.016138
d1_lactate_max,0.013959
d1_sysbp_min,0.009192


note: for looking at how age/gender/race affect things, we'd need to look at them after the fact and add back manually. As in, say we take top 20 factors in above df, but then we'd want to run our actual models with those 20 factors PLUS any feats about age/gender/race etc. to see if there's correlation

prediction

In [117]:
pred = automl.predict(x_test)

Accuracy of prediction

In [158]:
y = list(y_test)

In [161]:
correct = 0
for i in range(len(pred)):
    if pred[i] == y[i]:
        correct += 1
acc = ((correct / float(len(pred))) * 100.0)

In [160]:
acc

93.28361463648655