In [1]:
import numpy as np
import os
import pandas as pd
import sklearn.linear_model

import util

SEED = 2021
np.random.seed(SEED)

# Stay in top-level directory for consistency
if '/src' in os.getcwd():
    os.chdir('..')

## Data

In [2]:
# Load data
xtrain, ytrain, xtest, ytest, xval, yval = util.load_preg_data(sim=True, onehots=True)
# Create binary labels
ytrain_early, ytrain_late, ytrain_preterm = util.preg_outcome_to_binaries(ytrain)
ytest_early, ytest_late, ytest_preterm = util.preg_outcome_to_binaries(ytest)
yval_early, yval_late, yval_preterm = util.preg_outcome_to_binaries(yval)

## Model

In [3]:
logreg_preterm = sklearn.linear_model.LogisticRegression(
    solver='lbfgs', penalty='l2', tol=1e-4, C=1, max_iter=100, class_weight='balanced')

logreg_preterm.fit(xtrain, ytrain_preterm)

print(logreg_preterm.score(xtest, ytest_preterm))
print(sklearn.metrics.classification_report(ytest_preterm, logreg_preterm.predict(xtest), digits=4))

0.9695571955719557
              precision    recall  f1-score   support

       False     0.9646    0.9737    0.9691       532
        True     0.9744    0.9656    0.9700       552

    accuracy                         0.9696      1084
   macro avg     0.9695    0.9696    0.9696      1084
weighted avg     0.9696    0.9696    0.9696      1084



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
logreg_late = sklearn.linear_model.LogisticRegression(
    solver='lbfgs', penalty='l2', tol=1e-4, C=1, max_iter=100, class_weight='balanced')

logreg_late.fit(xtrain, ytrain_late)

print(logreg_late.score(xtest, ytest_late))
print(sklearn.metrics.classification_report(ytest_late, logreg_preterm.predict(xtest), digits=4))

0.985239852398524
              precision    recall  f1-score   support

       False     0.6015    0.3713    0.4591       870
        True     0.0000    0.0000    0.0000       214

    accuracy                         0.2980      1084
   macro avg     0.3007    0.1856    0.2296      1084
weighted avg     0.4827    0.2980    0.3685      1084



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
logreg_early = sklearn.linear_model.LogisticRegression(
    solver='lbfgs', penalty='l2', tol=1e-4, C=1, max_iter=100, class_weight='balanced')

logreg_early.fit(xtest, ytest_early)

print(logreg_early.score(xtest, ytest_early))
print(sklearn.metrics.classification_report(ytest_early, logreg_preterm.predict(xtest), digits=4))

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: False

In [6]:
print(yval_late.any())
print(yval_early.any())
print(yval_preterm.any())

True
False
True


So we just don't have enough examples of early stillbirths in the synthetic data. Hopefully real data will ameliorate this.