# Feature selection for all-APGAR model (one hot encoded features)

In [9]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

## Load data


In [10]:
# Load the model input table from CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='./data/processed_data_for_model_one_hot.csv')

Choose to drop hospitals from data.

In [11]:
# Switch: when True, every column whose name begins with 'hospital' is removed
drop_hospitals = True

if drop_hospitals:
    # Keep only the columns that do not carry the 'hospital' prefix
    selected_cols = [col for col in list(data) if col[:8] != 'hospital']
    data = data[selected_cols]

In [12]:
# Show the first five rows, transposed so each feature is listed on its own row
data.head().transpose()

Unnamed: 0,0,1,2,3,4
parity,1.0,2.0,0.0,2.0,1.0
prevcaes,0.0,0.0,0.0,0.0,0.0
noscans,1.0,2.0,5.0,1.0,2.0
pyrexia,False,False,False,False,False
meconium,False,False,True,False,False
apgar5,10.0,9.0,10.0,9.0,10.0
bmi,18,20,27,33,25
age,27,25,34,29,37
gest,38,40,36,26,41
ethnic_white,False,False,False,False,False


## Rebalance data

In [13]:
# Rebalance by oversampling: draw the same number of rows (with replacement)
# for each APGAR score from 0 to 10 so that every class is equally represented.
# NOTE(review): this resampling happens before the k-fold split performed
# later in the notebook, so copies of one original row can end up in both the
# training and the test folds - confirm this is intended, as it may inflate
# the reported AUC.
sample_size = 10000
rebalanced_data = []
for apgar in range(11):
    mask = data['apgar5'] == apgar
    # A fixed random_state makes the resampling repeatable between runs
    # (the shuffle below was already seeded; this draw was not).
    sampled_data = data[mask].sample(
        sample_size, replace=True, random_state=42)
    rebalanced_data.append(sampled_data)

# Combine the per-class samples, shuffle the rows, and renumber the index
# 0..n-1 (the old index is discarded rather than kept as a column).
rebalanced_data = pd.concat(rebalanced_data)
data = rebalanced_data.sample(frac=1.0, random_state=42, replace=False)
data = data.reset_index(drop=True)

In [14]:
# Separate the predictor columns (X) from the target column (y)
X = data.drop(columns='apgar5')
y = data['apgar5']

In [15]:
# Configure the stratified k-fold splitter used for cross-validation
number_of_splits = 3
skf = StratifiedKFold(n_splits=number_of_splits)
# Final expression: displays the number of splits as the cell output
skf.get_n_splits(X, y)

3

In [16]:
# Forward feature selection: in each round, try adding every remaining feature
# to the features already chosen, score each candidate set by its mean
# one-vs-rest ROC AUC across the k-fold splits, and keep the best candidate.

# Lists to store the best AUC per round (mean and per-fold) and chosen features
roc_auc_by_feature_number = []
roc_auc_by_feature_number_kfold = []
chosen_features = []

# Initialise the list of features still available for selection
available_features = list(X)
number_of_features = len(available_features)

# Never try to select more features than exist: with a fixed count of 20 and
# fewer columns, a later round would find no candidate and fail on removal.
max_features_to_select = min(20, number_of_features)

# Compute the train/test folds once and reuse them for every candidate, so all
# candidates are scored on identical folds and the split is not repeated.
splits = list(skf.split(X, y))

# Loop through number of features to select
for i in range(max_features_to_select):

    # Reset best feature and best AUC for this round
    best_result = -np.inf
    best_result_kfold = []
    best_feature = None

    # Loop through available features
    for feature in available_features:

        # Features already chosen + 1 new candidate (builds a new list, so
        # chosen_features itself is left unchanged)
        features_to_use = chosen_features + [feature]

        # Restrict columns once per candidate rather than once per fold
        X_candidate = X[features_to_use]

        # List to hold the AUC result of this candidate for each k-fold split
        feature_roc_auc_ovr_kfold = []

        # Loop through the k-fold splits
        for train_index, test_index in splits:

            # The splitter yields positional row indices, so select with iloc
            X_train = X_candidate.iloc[train_index]
            X_test = X_candidate.iloc[test_index]
            y_train = y.iloc[train_index]
            y_test = y.iloc[test_index]

            # Define and fit model
            model = XGBClassifier(verbosity = 0, seed=42, learning_rate=0.5)
            model.fit(X_train, y_train)

            # Get target categories from model
            classes = model.classes_

            # Get predicted probabilities
            y_probs = model.predict_proba(X_test)

            # Calculate ROC AUC for multiclass models, using One vs Rest
            feature_roc_auc_ovr = roc_auc_score(
                y_test, y_probs, labels=classes,
                multi_class='ovr', average='macro')
            feature_roc_auc_ovr_kfold.append(feature_roc_auc_ovr)

        # Get average result from all k-fold splits
        feature_auc_mean = np.mean(feature_roc_auc_ovr_kfold)

        # Update chosen feature and result if this feature is a new best
        if feature_auc_mean > best_result:
            best_result = feature_auc_mean
            best_result_kfold = feature_roc_auc_ovr_kfold
            best_feature = feature

    # Stop cleanly if no candidate produced a comparable score this round
    if best_feature is None:
        break

    # All candidates are scored: record the best AUC (mean and per-fold) for
    # this number of features and move the winner to the chosen list
    roc_auc_by_feature_number.append(best_result)
    roc_auc_by_feature_number_kfold.append(best_result_kfold)
    chosen_features.append(best_feature)
    available_features.remove(best_feature)

    print (f'Feature {i+1:2.0f}: {best_feature}, AUC: {best_result:0.3f}')

Feature  1: gest, AUC: 0.616
Feature  2: age, AUC: 0.743
Feature  3: bmi, AUC: 0.846
Feature  4: noscans, AUC: 0.872
Feature  5: analgesia_G, AUC: 0.888
Feature  6: parity, AUC: 0.900
Feature  7: ctg_A, AUC: 0.907
Feature  8: analgesia_I, AUC: 0.911
Feature  9: analgesia_P, AUC: 0.916
Feature 10: meconium, AUC: 0.919
Feature 11: ethnic_white, AUC: 0.921
Feature 12: delivery_S, AUC: 0.925
Feature 13: induction_O, AUC: 0.926
Feature 14: onset_S, AUC: 0.928
Feature 15: ctg_Z, AUC: 0.929
Feature 16: analgesia_E, AUC: 0.930
Feature 17: analgesia_O, AUC: 0.931
Feature 18: induction_A, AUC: 0.932
Feature 19: analgesia_L, AUC: 0.933
Feature 20: delivery_A, AUC: 0.933


In [17]:
# Write the selected features, in selection order, to CSV without a header row
s = pd.Series(data=chosen_features)
s.to_csv(
    './output/feature_selection_full_model_rebalanced.csv',
    header=False,
)