In [1]:
import os
import pandas as pd
import sklearn

In [8]:
# Run from the project root so relative paths (e.g. 'data/sim/') resolve the
# same way whether the kernel was started in the root or in its src/ folder.
# Only the last path component is compared: a substring test on '/src' also
# matches a root that merely lives below a directory named src (~/src/project)
# and would then climb one more level on every re-run of this cell.
if os.path.basename(os.getcwd()) == 'src':
    os.chdir('..')

## Data

In [69]:
# TODO: once these helper functions stabilise, move them into their own .py module and import them here.

def load_preg_data(sim=True, onehots=True, data_dir=None):
    """Load the train/test/val pregnancy-outcome splits from CSV files.

    Each split is read from ``<data_dir>/<split>.csv``. The ``outcome`` column
    becomes the label Series; ``outcome``, ``g.weeks`` and ``id`` are removed
    from the feature frame.

    Parameters
    ----------
    sim : bool, default True
        Load the simulated data set. ``False`` is not supported yet.
    onehots : bool, default True
        If True, all columns whose name contains ``race_`` (respectively
        ``education_``) are packed into one list-valued column ``race_oh``
        (respectively ``education_oh``) appended after the remaining features,
        and the column order of each vector is printed. If False, those
        columns are dropped.
    data_dir : str or None, default None
        Directory containing ``train.csv``, ``test.csv`` and ``val.csv``.
        ``None`` means ``data/sim`` relative to the current working directory.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, x_val, y_val)``.

    Raises
    ------
    NotImplementedError
        If ``sim`` is False.
    """
    if not sim:
        raise NotImplementedError('No unsimulated data yet')
    if data_dir is None:
        data_dir = os.path.join('data', 'sim')

    # Load from CSV
    frames = [
        pd.read_csv(os.path.join(data_dir, f'{split}.csv'))
        for split in ('train', 'test', 'val')
    ]

    features = []
    labels = []
    for frame in frames:
        # Separate labels and features
        y = frame['outcome']
        x = frame.drop(columns=['outcome', 'g.weeks', 'id'])

        # Use the same 'race_' / 'education_' token for extraction and for
        # removal, so the columns taken out are exactly the ones that were
        # packed (or, with onehots=False, exactly the one-hot columns).
        race_cols = [c for c in x.columns if 'race_' in c]
        education_cols = [c for c in x.columns if 'education_' in c]
        encoded_cols = [c for c in x.columns if c in race_cols or c in education_cols]

        if onehots:
            race_oh = x[race_cols].values.tolist()
            education_oh = x[education_cols].values.tolist()
            # drop() returns a new frame, so the assignments below never
            # write into a slice of the frame read from disk.
            x = x.drop(columns=encoded_cols)
            x['race_oh'] = race_oh
            x['education_oh'] = education_oh
        else:
            x = x.drop(columns=encoded_cols)

        features.append(x)
        labels.append(y)

    x_train, x_test, x_val = features
    y_train, y_test, y_val = labels
    if onehots:
        # Column order of the training split defines how to read the vectors.
        train_columns = frames[0].columns
        print("Prototype one-hot vector for race:", [c for c in train_columns if 'race_' in c])
        print("Prototype one-hot vector for education:", [c for c in train_columns if 'education_' in c])
    return x_train, y_train, x_test, y_test, x_val, y_val

def preg_outcome_to_binaries(y):
    """Turn an outcome label Series into three boolean indicator Series.

    Parameters
    ----------
    y : pandas.Series
        Outcome labels as strings.

    Returns
    -------
    tuple of pandas.Series
        ``(late_still, early_still, preterm)``: boolean Series sharing ``y``'s
        index, named ``'late_still'``, ``'early_still'`` and ``'preterm'``.
    """
    late_still = (y == 'late stillbirth').rename('late_still')
    # The labels printed by this notebook spell this class 'early stillbearth',
    # so comparing against the correct spelling alone never matches. Accept
    # both so the mask works before and after the data is corrected.
    early_still = y.isin(['early stillbirth', 'early stillbearth']).rename('early_still')
    preterm = (y == 'preterm').rename('preterm')
    return late_still, early_still, preterm

def preg_outcome_to_onehot(y, categories=None):
    """One-hot encode an outcome label Series and print the column order.

    Parameters
    ----------
    y : pandas.Series
        Outcome labels as strings.
    categories : iterable of str or None, default None
        Fixed column layout. When None, the columns are whatever labels occur
        in ``y`` (in the order ``Series.str.get_dummies`` returns them), so two
        splits with different label sets produce different widths. Pass the
        same list for every split to get identical layouts; labels absent
        from ``y`` become all-zero columns.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per element of ``y`` and one column per label.

    Raises
    ------
    ValueError
        If ``categories`` is given and ``y`` contains a label not listed in it.
    """
    oh = y.str.get_dummies()
    if categories is not None:
        categories = list(categories)
        unknown = [label for label in oh.columns if label not in categories]
        if unknown:
            # Reindexing would silently discard these rows' only set bit.
            raise ValueError(f"Outcome labels not listed in categories: {unknown}")
        oh = oh.reindex(columns=categories, fill_value=0)
    print("Prototype one-hot vector for outcome:", oh.columns.values.tolist())
    return oh.values

In [70]:
# Load the simulated splits with the race/education one-hot columns packed
# into list-valued columns (both arguments spelled out; they are the defaults).
(x_train, y_train,
 x_test, y_test,
 x_val, y_val) = load_preg_data(sim=True, onehots=True)

Prototype one-hot vector for race: ['race_amerindian', 'race_asian', 'race_black', 'race_white']
Prototype one-hot vector for education: ['education_8th', 'education_College', 'education_Doctorate', 'education_HS']


In [74]:
# Show both label encodings for the validation split: first the tuple of
# boolean indicator Series, then the one-hot array.
print(preg_outcome_to_binaries(y=y_val))
print(preg_outcome_to_onehot(y=y_val))

(0       False
1       False
2        True
3        True
4       False
        ...  
1080    False
1081    False
1082    False
1083    False
1084    False
Name: late_still, Length: 1085, dtype: bool, 0       False
1       False
2       False
3       False
4       False
        ...  
1080    False
1081    False
1082    False
1083    False
1084    False
Name: early_still, Length: 1085, dtype: bool, 0        True
1        True
2       False
3       False
4        True
        ...  
1080    False
1081    False
1082    False
1083    False
1084     True
Name: preterm, Length: 1085, dtype: bool)
Protype one-hot vector for outcome: ['early stillbearth', 'late stillbirth', 'normal', 'preterm']
[[0 0 0 1]
 [0 0 0 1]
 [0 1 0 0]
 ...
 [0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]]


## Model

In [None]:
# TODO: define, train and evaluate the model here.