In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the ACE dataset from a pickled DataFrame.
# NOTE(review): the "../data/../data/" prefix looks redundant - it should resolve
# to the same file as "../data/ace_data_extra_feats.pkl"; confirm and simplify.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
ace_data = pd.read_pickle("../data/../data/ace_data_extra_feats.pkl")

In [3]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   hospital_reqd        499 non-null    int64   
 1   referral_from        498 non-null    category
 2   age                  499 non-null    int64   
 3   address              499 non-null    category
 4   gender               499 non-null    category
 5   referral_date        499 non-null    category
 6   referral_time        499 non-null    category
 7   illness_severity     497 non-null    category
 8   activity_level       496 non-null    category
 9   gut_feeling          494 non-null    category
 10  ox_sat               489 non-null    float64 
 11  resp_rate            488 non-null    float64 
 12  heart_rate           490 non-null    float64 
 13  temp                 438 non-null    float64 
 14  sepsis               499 non-null    category
 15  safeguarding         49

In [4]:
# Lift the column display limit (a session-wide pandas option) so wide frames
# are shown in full, then preview the first rows.
pd.set_option("display.max_columns", None)
ace_data.head()

Unnamed: 0,hospital_reqd,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding,food_allergy,drug_allergy,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
0,0,CCDA,8,BD07,F,Winter,Morning,Moderate,usual,low concern,97.0,20.0,118.0,36.5,None noted,N,N,N,N,other,asian,N,primary,normal,normal,N,normal,normal
1,0,A&E,11,BD03,F,Winter,Morning,Mild,lower,low concern,96.0,20.0,109.0,37.0,None noted,N,N,N,N,Pakistani,asian,N,primary,normal,normal,Y,normal,normal
2,0,CCDA,3,BD04,F,Winter,Afternoon,Mild,usual,well,96.0,28.0,140.0,37.0,None noted,N,N,N,N,other,european,N,pre_school,normal,normal,Y,normal,normal
3,0,GP,3,BD06,M,Winter,Afternoon,Mild,usual,low concern,98.0,28.0,104.0,36.8,None noted,N,N,N,N,British,european,N,pre_school,normal,normal,Y,normal,normal
4,0,GP,3,BD09,M,Winter,Afternoon,Mild,usual,well,97.0,,,,None noted,N,N,N,N,Pakistani,asian,N,pre_school,normal,normal,Y,normal,normal


## split data into examples with nas and complete examples:

In [5]:
# Flag every row that has at least one missing value, then partition the data
# into incomplete rows (handled separately below) and fully observed rows.
na_examples_mask = ace_data.isnull().any(axis="columns")
na_ace_data = ace_data[na_examples_mask]
clean_ace_data = ace_data[~na_examples_mask]

## Train/test split the examples without NA values:

Stratify on the label to maintain the proportion of hospital-required examples in both splits.

In [6]:
# Hold out 33% of the complete rows for testing. Stratifying on the label keeps
# the proportion of hospital_reqd cases the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    clean_ace_data.drop(columns="hospital_reqd"),
    clean_ace_data["hospital_reqd"],
    stratify=clean_ace_data["hospital_reqd"],
    test_size=0.33,
    random_state=1,
)

## explore and clean the na data:

In [7]:
# Number of missing features per incomplete row.
na_counts = na_ace_data.isna().sum(axis="columns")

# For each distinct count, report how many rows have it and how many of those
# rows are positive (hospital_reqd == 1) cases.
for n_missing in sorted(na_counts.unique()):
    has_n_missing = na_counts == n_missing
    n_examples = has_n_missing.sum()
    n_hospital = na_ace_data.loc[has_n_missing, "hospital_reqd"].sum()
    print(f"{n_examples} examples are missing {n_missing} features")
    print(f"{n_hospital} require hospital treatment\n")

59 examples are missing 1 features
14 require hospital treatment

3 examples are missing 2 features
0 require hospital treatment

2 examples are missing 3 features
0 require hospital treatment

6 examples are missing 4 features
0 require hospital treatment

1 examples are missing 7 features
0 require hospital treatment



There are 12 examples that have more than one NaN value. None of these children
required hospital care, so these examples are removed.

For the other examples, which have only one NaN value, the missing value can be inferred from the category-level mean and filled in.

In [8]:
# Keep only the rows with exactly one missing value. The explicit .copy() makes
# na_ace_data an independent frame, so the .loc imputation assignments in the
# following cells modify it directly instead of writing to a filtered slice
# (which raises pandas' SettingWithCopyWarning).
na_ace_data = na_ace_data[na_counts == 1].copy()

# Number of missing values per feature among the retained rows.
missing_features = na_ace_data.isna().sum(axis=0)

# Report only the features that actually have gaps.
for feature, n_missing in missing_features[missing_features > 0].items():
    print(f"{n_missing} examples are missing {feature}\n")

1 examples are missing activity_level

3 examples are missing gut_feeling

2 examples are missing ox_sat

2 examples are missing resp_rate

1 examples are missing heart_rate

50 examples are missing temp



activity_level and gut_feeling can be set to overall mode

In [9]:
# Fill missing categorical observations with the most frequent level.
# NOTE(review): the mode is taken over the full ace_data frame, which also
# contains the rows that were held out in X_test.
activity_is_na = na_ace_data["activity_level"].isna()
activity_mode = ace_data["activity_level"].mode().iloc[0]
na_ace_data.loc[activity_is_na, "activity_level"] = activity_mode

gut_is_na = na_ace_data["gut_feeling"].isna()
gut_mode = ace_data["gut_feeling"].mode().iloc[0]
na_ace_data.loc[gut_is_na, "gut_feeling"] = gut_mode

ox_sat and temp can be set to overall mean

In [10]:
# Fill missing oxygen saturation and temperature with the column mean.
# NOTE(review): the means are taken over the full ace_data frame, which also
# contains the rows that were held out in X_test.
ox_is_na = na_ace_data["ox_sat"].isna()
ox_sat_mean = ace_data["ox_sat"].mean()
na_ace_data.loc[ox_is_na, "ox_sat"] = ox_sat_mean

temp_is_na = na_ace_data["temp"].isna()
temp_mean = ace_data["temp"].mean()
na_ace_data.loc[temp_is_na, "temp"] = temp_mean


heart_rate and resp_rate can be set to the mean for the example's age_range

In [11]:
# Fill each missing heart_rate with the mean heart_rate of all ace_data rows
# that share the same age_range as the incomplete row.
hr_is_na = na_ace_data["heart_rate"].isna()
hr_age_ranges = na_ace_data.loc[hr_is_na, "age_range"]
hr_fill = pd.Series(
    [ace_data.loc[ace_data["age_range"] == age_range, "heart_rate"].mean()
     for age_range in hr_age_ranges],
    index=hr_age_ranges.index,
    dtype="float",
)
na_ace_data.loc[hr_is_na, "heart_rate"] = hr_fill


In [12]:
# Fill each missing resp_rate with the mean resp_rate of all ace_data rows
# that share the same age_range as the incomplete row.
resp_is_na = na_ace_data["resp_rate"].isna()
resp_age_ranges = na_ace_data.loc[resp_is_na, "age_range"]
resp_fill = pd.Series(
    [ace_data.loc[ace_data["age_range"] == age_range, "resp_rate"].mean()
     for age_range in resp_age_ranges],
    index=resp_age_ranges.index,
    dtype="float",
)
na_ace_data.loc[resp_is_na, "resp_rate"] = resp_fill

concat na_examples / labels to X_train / y_train

In [13]:
# Append the imputed rows (features and labels) to the training split and
# renumber the index from zero.
# NOTE: this cell rebinds X_train / y_train from themselves, so running it a
# second time appends the imputed rows again.
X_train = pd.concat(
    [X_train, na_ace_data.drop(columns="hospital_reqd")],
    ignore_index=True,
)
y_train = pd.concat(
    [y_train, na_ace_data["hospital_reqd"]],
    ignore_index=True,
)

## Oversampling to address imbalance in positive / negative hospital required

Oversample X_train using SMOTENC (the SMOTE variant for mixed categorical and numeric features) from the imblearn package.

In [16]:
from imblearn.over_sampling import SMOTENC

# Positional indices of the non-numeric columns, which SMOTENC must treat as
# categorical. is_numeric_dtype is used instead of comparing the dtype with the
# strings "int"/"float": those strings mean the platform's default integer and
# float widths, so an int64 column can be misclassified as categorical on
# platforms where "int" is 32-bit.
cat_feature_idxs = [
    i for i, col in enumerate(X_train.columns)
    if not pd.api.types.is_numeric_dtype(X_train[col])
]

smote = SMOTENC(random_state=1,
                categorical_features=cat_feature_idxs)

# Resample the training data only.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

## One-hot encode features for numeric modelling methods
## Include scaled versions of the data for models that will benefit from scaling

In [17]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Columns with category dtype are one-hot encoded; every other column is
# treated as numeric and min-max scaled.
cat_features = [feature for feature in X_train.columns
                if X_train[feature].dtype.name == "category"]
num_features = [feature for feature in X_train.columns
                if feature not in cat_features]

# Both transformers are fitted on X_train only and then applied to every frame.
# The encoder keeps its default sparse output and the result is densified with
# .toarray() below: the `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so neither spelling works everywhere.
oh_enc = OneHotEncoder().fit(X_train[cat_features])
mm_scaler = MinMaxScaler().fit(X_train[num_features])

# Column names of the form "<feature>_<category>", in the encoder's output
# order. The f-string also copes with category labels that are not strings.
one_hot_feature_names = [
    f"{feature}_{category}"
    for feature, categories in zip(cat_features, oh_enc.categories_)
    for category in categories
]

ohe_dfs = []
scaled_ohe_dfs = []
for df in [X_train, X_train_res, X_test]:
    oh_data = pd.DataFrame(oh_enc.transform(df[cat_features]).toarray(),
                           columns=one_hot_feature_names)
    # Drop the original index so the numeric columns line up row-by-row with
    # the freshly built one-hot frame.
    num_data = df[num_features].reset_index(drop=True)
    scaled_num_data = pd.DataFrame(mm_scaler.transform(num_data),
                                   columns=num_features)

    # One-hot columns + raw numeric columns.
    df_ohe = pd.concat([oh_data, num_data], axis=1)
    ohe_dfs.append(df_ohe)

    # One-hot columns + min-max scaled numeric columns.
    df_ohe_scaled = pd.concat([oh_data, scaled_num_data], axis=1)
    scaled_ohe_dfs.append(df_ohe_scaled)

X_train_ohe, X_train_res_ohe, X_test_ohe = ohe_dfs
X_train_ohe_scaled, X_train_res_ohe_scaled, X_test_ohe_scaled = scaled_ohe_dfs

In [18]:
# Pick five distinct row labels for a spot check. The fixed seed makes the
# selection, and therefore the rows displayed below, the same on every run.
rnd_idxs = np.random.default_rng(1).choice(X_train_res.index, size=5, replace=False)

In [25]:
# Show the selected rows of the oversampled training features before encoding.
X_train_res.loc[rnd_idxs]

Unnamed: 0,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding,food_allergy,drug_allergy,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
232,A&E,7,BD15,M,Winter,Afternoon,Mild,lower,well,99.0,24.0,87.0,36.6,None noted,N,N,N,N,British,european,N,primary,normal,normal,Y,normal,normal
180,GP,5,BD10,M,Autumn,Morning,Mild,usual,well,97.0,22.0,118.0,37.3,None noted,N,N,N,Y,British,european,N,primary,normal,normal,Y,normal,normal
338,GP,2,BD02,M,Autumn,Afternoon,Moderate,usual,well,96.0,44.0,102.0,36.874429,None noted,Y,N,N,N,Pakistani,asian,N,pre_school,normal,high,N,normal,high
205,GP,4,BD09,M,Autumn,Morning,Mild,lower,low concern,95.0,30.0,125.0,36.5,None noted,N,N,N,N,Pakistani,asian,N,pre_school,normal,normal,Y,normal,normal
181,GP,10,BD07,M,Summer,Afternoon,Moderate,usual,low concern,98.0,17.0,84.0,37.3,None noted,Y,Y,N,N,Pakistani,asian,N,primary,normal,low,N,normal,normal


In [28]:
# Show the same row labels in the one-hot encoded, min-max scaled frame to
# compare against the raw rows above.
# NOTE(review): assumes X_train_res has a default 0..n-1 index, since the
# encoded frame is rebuilt with one - confirm.
X_train_res_ohe_scaled.loc[rnd_idxs]

Unnamed: 0,referral_from_A&E,referral_from_CCDA,referral_from_ED,referral_from_GP,address_BD01,address_BD02,address_BD03,address_BD04,address_BD05,address_BD06,address_BD07,address_BD08,address_BD09,address_BD10,address_BD11,address_BD12,address_BD13,address_BD14,address_BD15,address_BD16,address_BD17,address_BD18,address_BD19,address_LS20,address_LS29,gender_F,gender_M,referral_date_Autumn,referral_date_Spring,referral_date_Summer,referral_date_Winter,referral_time_Afternoon,referral_time_Evening,referral_time_Morning,illness_severity_Mild,illness_severity_Moderate,activity_level_lower,activity_level_usual,gut_feeling_low concern,gut_feeling_unwell,gut_feeling_well,sepsis_Low level,sepsis_None noted,safeguarding_N,safeguarding_Y,food_allergy_N,food_allergy_Y,drug_allergy_N,drug_allergy_Y,other_allergy_N,other_allergy_Y,simple_ethnicity_British,simple_ethnicity_Pakistani,simple_ethnicity_other,group_ethnicity_asian,group_ethnicity_european,group_ethnicity_other,ox_sat_low_N,ox_sat_low_Y,age_range_pre_school,age_range_primary,age_range_secondary,ace_heart_rate_cat_high,ace_heart_rate_cat_low,ace_heart_rate_cat_normal,ace_resp_rate_cat_high,ace_resp_rate_cat_low,ace_resp_rate_cat_normal,meets_ace_criteria_N,meets_ace_criteria_Y,apls_heart_rate_cat_high,apls_heart_rate_cat_low,apls_heart_rate_cat_normal,apls_resp_rate_cat_high,apls_resp_rate_cat_low,apls_resp_rate_cat_normal,age,ox_sat,resp_rate,heart_rate,temp
232,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.428571,0.28125,0.294118,0.157407,0.351351
180,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.285714,0.21875,0.235294,0.444444,0.540541
338,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.071429,0.1875,0.882353,0.296296,0.425521
205,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.214286,0.15625,0.470588,0.509259,0.324324
181,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.642857,0.25,0.088235,0.12963,0.540541


## Target encode categorical features

In [30]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Fit the target encoder on the (non-oversampled) training data and labels.
target_enc = LeaveOneOutEncoder(cols=cat_features)
target_enc.fit(X_train, y_train)

# NOTE(review): transform() is called without y for every frame, including the
# training frames. The encoded values shown further down match plain
# per-category target means - confirm this is the intended encoding.
target_encd_dfs, scaled_target_encd_dfs = [], []
for frame in (X_train, X_train_res, X_test):
    # Target-encoded categoricals alongside the untouched numeric columns.
    encoded = target_enc.transform(frame).reset_index(drop=True)
    target_encd_dfs.append(encoded)

    # Same encoded categoricals, but with min-max scaled numeric columns.
    scaled_numeric = pd.DataFrame(
        mm_scaler.transform(frame[num_features].reset_index(drop=True)),
        columns=num_features,
    )
    scaled_target_encd_dfs.append(
        pd.concat([encoded[cat_features], scaled_numeric], axis=1)
    )

X_train_target, X_train_res_target, X_test_target = target_encd_dfs
(X_train_target_scaled,
 X_train_res_target_scaled,
 X_test_target_scaled) = scaled_target_encd_dfs

  elif pd.api.types.is_categorical(cols):


In [43]:
# Pick five distinct row labels for a spot check of the target encoding. The
# fixed seed makes the rows displayed below the same on every run.
rnd_idxs = np.random.default_rng(2).choice(X_train_res.index, size=5, replace=False)

In [44]:
# Show the selected rows of the oversampled training features before encoding.
X_train_res.loc[rnd_idxs]

Unnamed: 0,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding,food_allergy,drug_allergy,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
403,A&E,2,BD09,M,Winter,Morning,Mild,usual,low concern,97.0,28.804678,102.396492,36.884643,None noted,N,N,N,N,Pakistani,asian,N,primary,normal,normal,N,normal,normal
518,GP,2,BD09,F,Winter,Morning,Mild,usual,well,96.411354,30.411354,132.468126,36.948346,None noted,N,N,N,N,Pakistani,asian,N,pre_school,normal,high,N,normal,high
321,GP,2,BD09,M,Spring,Evening,Moderate,usual,well,97.0,32.0,100.0,36.874429,None noted,N,N,N,N,Pakistani,asian,N,pre_school,normal,high,N,normal,high
279,A&E,13,BD08,M,Winter,Afternoon,Mild,usual,low concern,96.0,20.0,96.0,36.8,None noted,N,N,N,N,British,european,N,secondary,normal,normal,Y,normal,normal
484,A&E,3,BD09,M,Spring,Morning,Mild,usual,low concern,96.020722,26.937834,98.165776,36.606217,None noted,N,N,N,N,Pakistani,asian,N,primary,normal,high,N,normal,normal


In [45]:
# Show the same row labels after target encoding, to compare against the raw
# rows above.
X_train_res_target.loc[rnd_idxs]

Unnamed: 0,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding,food_allergy,drug_allergy,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
403,0.179104,2,0.195652,0.174528,0.181818,0.170984,0.151724,0.16087,0.162437,97.0,28.804678,102.396492,36.884643,0.164087,0.176667,0.166124,0.179811,0.171429,0.173653,0.177419,0.172515,0.160839,0.169173,0.150259,0.20283,0.159468,0.148289
518,0.199029,2,0.195652,0.165414,0.181818,0.170984,0.151724,0.16087,0.172414,96.411354,30.411354,132.468126,36.948346,0.164087,0.176667,0.166124,0.179811,0.171429,0.173653,0.177419,0.172515,0.172043,0.169173,0.20354,0.20283,0.159468,0.25641
321,0.199029,2,0.195652,0.174528,0.147541,0.307692,0.272727,0.16087,0.172414,97.0,32.0,100.0,36.874429,0.164087,0.176667,0.166124,0.179811,0.171429,0.173653,0.177419,0.172515,0.172043,0.169173,0.20354,0.20283,0.159468,0.25641
279,0.179104,13,0.068966,0.174528,0.181818,0.158273,0.151724,0.16087,0.162437,96.0,20.0,96.0,36.8,0.164087,0.176667,0.166124,0.179811,0.171429,0.181034,0.186047,0.172515,0.25,0.169173,0.150259,0.120301,0.159468,0.148289
484,0.179104,3,0.195652,0.174528,0.147541,0.170984,0.151724,0.16087,0.162437,96.020722,26.937834,98.165776,36.606217,0.164087,0.176667,0.166124,0.179811,0.171429,0.173653,0.177419,0.172515,0.160839,0.169173,0.20354,0.20283,0.159468,0.148289


In [47]:
def targety(feature, value):
    """Return the mean of y_train over the X_train rows where `feature` equals `value`.

    Used as a manual cross-check of the target-encoded values.
    """
    matches = X_train[feature] == value
    return y_train[matches].mean()


targety("referral_date", "Spring")

0.14754098360655737

In [58]:
train_test_data_dir = "../data/train_test_data/"

# to_pickle does not create missing directories, so make sure the output
# folder exists before writing anything.
output_dir = Path(train_test_data_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Every frame / label series to persist, keyed by its output file stem.
dfs_dict = {
    "X_train": X_train,
    "y_train": y_train,
    "X_train_res": X_train_res,
    "y_train_res": y_train_res,
    "X_test": X_test,
    "y_test": y_test,
    "X_train_ohe": X_train_ohe,
    "X_test_ohe": X_test_ohe,
    "X_train_target": X_train_target,
    "X_test_target": X_test_target,
    "X_train_ohe_scaled": X_train_ohe_scaled,
    "X_test_ohe_scaled": X_test_ohe_scaled,
    "X_train_target_scaled": X_train_target_scaled,
    "X_test_target_scaled": X_test_target_scaled,
    "X_train_res_ohe": X_train_res_ohe,
    "X_train_res_target": X_train_res_target,
    "X_train_res_ohe_scaled": X_train_res_ohe_scaled,
    "X_train_res_target_scaled": X_train_res_target_scaled,
}

# Write each object to "<output_dir>/<name>.pkl".
for name, df in dfs_dict.items():
    df.to_pickle(output_dir / f"{name}.pkl")