# Setup

In [1]:
import gc
import numpy as np
import os
import sklearn.metrics
import tensorflow as tf

import models
import util

SEED = 2021
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Stay in top-level directory for consistency
if '/src' in os.getcwd():
    os.chdir('..')

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
# Load data
xtrain, ytrain, xtest, ytest, xval, yval = util.load_preg_data_final(datafile='stillbirth')
# Convert class labels to binary labels
ytrain = util.outcome_to_binary(ytrain, outcome='early stillbirth')  # Choose between early stillbirth, late stillbirth, and preterm
ytest = util.outcome_to_binary(ytest, outcome='early stillbirth')
yval = util.outcome_to_binary(yval, outcome='early stillbirth')

# Don't keep the vars we won't use in this notebook, for memory reasons
gc.collect()

Loading train data for stillbirth...
	done.
Loading test data for stillbirth...
	done.
Loading val data for stillbirth...
	done.


0

In [3]:
# Curious about our columns --- check for normalization, mins and maxes
for c in xtrain.columns:
    display(xtrain[c].describe())


count    9.810617e+06
mean     2.862361e+01
std      5.704697e+00
min      1.800000e+01
25%      2.400000e+01
50%      2.900000e+01
75%      3.300000e+01
max      5.000000e+01
Name: Age, dtype: float64

count    9.810617e+06
mean    -3.676231e-03
std      9.918322e-01
min     -4.680834e-01
25%     -4.680834e-01
50%     -4.680834e-01
75%      6.693553e-01
max      3.365508e+01
Name: PrevTerminations, dtype: float64

count    9.810617e+06
mean     4.159396e-01
std      4.928832e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: WIC, dtype: float64

count    9.810617e+06
mean     2.963197e-03
std      1.004297e+00
min     -2.651162e-01
25%     -2.651162e-01
50%     -2.651162e-01
75%     -2.651162e-01
max      1.883936e+01
Name: PrepregnancySmoking, dtype: float64

count    9.810617e+06
mean    -8.157142e-04
std      9.993250e-01
min     -2.075606e+00
25%     -7.249668e-01
50%     -2.393437e-01
75%      5.042666e-01
max      6.407622e+00
Name: BMI, dtype: float64

count    9.810617e+06
mean    -8.810375e-04
std      9.979819e-01
min     -1.059895e+01
25%     -7.655846e-01
50%     -6.320109e-02
75%      6.391824e-01
max      4.853484e+00
Name: Height, dtype: float64

count    9.810617e+06
mean    -9.134448e-05
std      9.930383e-01
min     -1.668552e+00
25%     -8.833965e-01
50%     -9.824141e-02
75%      6.869137e-01
max      4.612689e+00
Name: Parity, dtype: float64

count    9.810617e+06
mean     8.175225e-03
std      9.004661e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: PrepregnancyDiabetes, dtype: float64

count    9.810617e+06
mean     5.708285e-02
std      2.320009e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: GestationalDiabetes, dtype: float64

count    9.810617e+06
mean     1.645972e-02
std      1.272352e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: PrepregnancyHypertension, dtype: float64

count    9.810617e+06
mean     5.456293e-02
std      2.271251e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: GestationalHypertension, dtype: float64

count    9.810617e+06
mean     2.434811e-03
std      4.928370e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: HypertensionEclampsia, dtype: float64

count    9.810617e+06
mean     1.656297e-02
std      1.276270e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: InfertilityTreatment, dtype: float64

count    9.810617e+06
mean     1.524984e-01
std      3.595033e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: PrevCesareans, dtype: float64

count    9.810617e+06
mean     1.096333e-02
std      1.041304e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: race_AmeriIndian, dtype: float64

count    9.810617e+06
mean     7.026031e-02
std      2.555852e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: race_AsianPI, dtype: float64

count    9.810617e+06
mean     1.559738e-01
std      3.628305e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: race_Black, dtype: float64

count    9.810617e+06
mean     7.628026e-01
std      4.253643e-01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: race_White, dtype: float64

count    9.810617e+06
mean     9.814673e-03
std      9.858167e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_, dtype: float64

count    9.810617e+06
mean     8.269409e-02
std      2.754193e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_Associate, dtype: float64

count    9.810617e+06
mean     1.975937e-01
std      3.981839e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_Bachelor, dtype: float64

count    9.810617e+06
mean     2.141386e-01
std      4.102235e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_CollIncomplete, dtype: float64

count    9.810617e+06
mean     2.557698e-02
std      1.578696e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_Doctorate, dtype: float64

count    9.810617e+06
mean     2.491524e-01
std      4.325223e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_HSGraduate, dtype: float64

count    9.810617e+06
mean     9.845171e-02
std      2.979245e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_HSIncomplete, dtype: float64

count    9.810617e+06
mean     3.384150e-02
std      1.808211e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_lteq8, dtype: float64

count    9.810617e+06
mean     8.873631e-02
std      2.843628e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: education_Master, dtype: float64

# Logistic Regression

In [4]:
lr_early = models.build_logreg()
models.fit_logreg(lr_early, xtrain, ytrain)
models.save_pickle(lr_early, 'models/lr_early')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
print(lr_early.score(xtest, ytest))
print(sklearn.metrics.classification_report(ytest, lr_early.predict(xtest), digits=4))
del lr_early
gc.collect()

0.7574489035079895
              precision    recall  f1-score   support

           0     0.9992    0.7576    0.8618   1398616
           1     0.0060    0.7009    0.0118      2899

    accuracy                         0.7574   1401515
   macro avg     0.5026    0.7292    0.4368   1401515
weighted avg     0.9971    0.7574    0.8600   1401515



68

# Gradient Boosting

In [6]:
gb_early = models.build_gbdt()
models.fit_gbdt(gb_early, xtrain, ytrain, xval, yval)
models.save_pickle(gb_early, 'models/gb_early')

Training until validation scores don't improve for 500 rounds
[100]	valid_0's auc: 0.866562	valid_0's binary_error: 0.018647	valid_0's binary_logloss: 0.217468
[200]	valid_0's auc: 0.870975	valid_0's binary_error: 0.0773962	valid_0's binary_logloss: 0.310769
[300]	valid_0's auc: 0.8723	valid_0's binary_error: 0.104443	valid_0's binary_logloss: 0.348214
[400]	valid_0's auc: 0.873174	valid_0's binary_error: 0.113286	valid_0's binary_logloss: 0.361069
[500]	valid_0's auc: 0.873729	valid_0's binary_error: 0.115914	valid_0's binary_logloss: 0.364633
[600]	valid_0's auc: 0.874022	valid_0's binary_error: 0.11652	valid_0's binary_logloss: 0.364965
[700]	valid_0's auc: 0.874321	valid_0's binary_error: 0.116187	valid_0's binary_logloss: 0.363983
[800]	valid_0's auc: 0.874543	valid_0's binary_error: 0.115494	valid_0's binary_logloss: 0.362405
[900]	valid_0's auc: 0.874636	valid_0's binary_error: 0.114882	valid_0's binary_logloss: 0.360653
[1000]	valid_0's auc: 0.874636	valid_0's binary_error: 0.1

In [7]:
print(gb_early.score(xtest, ytest))
print(sklearn.metrics.classification_report(ytest, gb_early.predict(xtest), digits=4))
del gb_early
gc.collect()

0.8864207660995422
              precision    recall  f1-score   support

           0     0.9993    0.8868    0.9397   1398616
           1     0.0124    0.6826    0.0243      2899

    accuracy                         0.8864   1401515
   macro avg     0.5058    0.7847    0.4820   1401515
weighted avg     0.9972    0.8864    0.9378   1401515



23

# Neural nets

In [8]:
selu_early = models.build_NN_selu(input_len=xtrain.shape[1])  # Assuming xtrain is (batch_size, n_features)
selu_early.summary()

models.fit_NN_selu(selu_early, xtrain, ytrain, xval, yval)
models.save_NN(selu_early, 'models/selu_early')

Model: "NN_selu"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 27)]              0         
_________________________________________________________________
selu1 (Dense)                (None, 27)                756       
_________________________________________________________________
alpha_dropout (AlphaDropout) (None, 27)                0         
_________________________________________________________________
selu2 (Dense)                (None, 27)                756       
_________________________________________________________________
alpha_dropout_1 (AlphaDropou (None, 27)                0         
_________________________________________________________________
selu3 (Dense)                (None, 27)                756       
_________________________________________________________________
alpha_dropout_2 (AlphaDropou (None, 27)                0   

In [9]:
selu_early.evaluate(xtest, ytest)  # Output is [loss, accuracy, auc]
del selu_early
gc.collect()



18677

In [10]:
lrelu_early = models.build_NN_lrelu(input_len=xtrain.shape[1])  # Assuming xtrain is (batch_size, n_features)
lrelu_early.summary()

models.fit_NN_lrelu(lrelu_early, xtrain, ytrain, xval, yval)
models.save_NN(lrelu_early, 'models/lrelu_early')

Model: "NN_lrelu"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 27)]              0         
_________________________________________________________________
lrelu1 (Dense)               (None, 70)                1960      
_________________________________________________________________
dropout (Dropout)            (None, 70)                0         
_________________________________________________________________
lrelu2 (Dense)               (None, 80)                5680      
_________________________________________________________________
dropout_1 (Dropout)          (None, 80)                0         
_________________________________________________________________
output (Dense)               (None, 1)                 81        
Total params: 7,721
Trainable params: 7,721
Non-trainable params: 0
________________________________________________________



INFO:tensorflow:Assets written to: models/lrelu_early/assets


INFO:tensorflow:Assets written to: models/lrelu_early/assets


In [11]:
lrelu_early.evaluate(xtest, ytest)  # Output is [loss, accuracy, auc]
del lrelu_early
gc.collect()



11747