In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:

# Load the ACE dataset (the "extra_feats" pickle) into a DataFrame.
# NOTE(review): the "../data/../data/" prefix looks redundant — it should resolve to
# the same location as "../data/"; confirm and simplify.
# NOTE(review): unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
ace_data = pd.read_pickle("../data/../data/ace_data_extra_feats.pkl")

In [3]:
# Print each column's dtype and non-null count; columns with fewer non-null values
# than rows are the ones containing missing data.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   hospital_reqd        499 non-null    int64   
 1   referral_from        498 non-null    category
 2   age                  499 non-null    int64   
 3   address              499 non-null    category
 4   gender               499 non-null    category
 5   referral_date        499 non-null    category
 6   referral_time        499 non-null    category
 7   illness_severity     497 non-null    category
 8   activity_level       496 non-null    category
 9   gut_feeling          494 non-null    category
 10  ox_sat               489 non-null    float64 
 11  resp_rate            488 non-null    float64 
 12  heart_rate           490 non-null    float64 
 13  temp                 438 non-null    float64 
 14  sepsis               499 non-null    category
 15  safeguarding         49

In [4]:
# Preview the first few rows of the loaded frame.
ace_data.head()

Unnamed: 0,hospital_reqd,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,...,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
0,0,CCDA,8,BD07,F,Winter,Morning,Moderate,usual,low concern,...,N,other,asian,N,primary,normal,normal,N,normal,normal
1,0,A&E,11,BD03,F,Winter,Morning,Mild,lower,low concern,...,N,Pakistani,asian,N,primary,normal,normal,Y,normal,normal
2,0,CCDA,3,BD04,F,Winter,Afternoon,Mild,usual,well,...,N,other,european,N,pre_school,normal,normal,Y,normal,normal
3,0,GP,3,BD06,M,Winter,Afternoon,Mild,usual,low concern,...,N,British,european,N,pre_school,normal,normal,Y,normal,normal
4,0,GP,3,BD09,M,Winter,Afternoon,Mild,usual,well,...,N,Pakistani,asian,N,pre_school,normal,normal,Y,normal,normal


## Split the data into examples with missing values (NaNs) and complete examples:

In [5]:
# Flag every row that has at least one missing value in any column.
na_examples_mask = ace_data.isnull().any(axis="columns")

# Complete rows go on to the train/test split; incomplete rows are cleaned separately.
clean_ace_data = ace_data[~na_examples_mask]
na_ace_data = ace_data[na_examples_mask]

## train test split the examples without na values:

Stratify on `hospital_reqd` so that the train and test sets keep the same proportion of hospital-required examples.

In [6]:
# Separate the label column from the feature columns of the complete examples.
clean_features = clean_ace_data.drop(columns="hospital_reqd")
clean_target = clean_ace_data["hospital_reqd"]

# Hold out 33% of the complete examples as the test set. Stratifying on the label
# keeps its class proportions the same in both splits; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    clean_features,
    clean_target,
    test_size=0.33,
    stratify=clean_target,
    random_state=1,
)

## explore and clean the na data:

In [7]:
# Number of missing features in each incomplete example.
na_counts = na_ace_data.isna().sum(axis="columns")

# For every distinct missing-feature count, report how many examples have it and
# how many of those required hospital treatment.
for n_missing in sorted(na_counts.unique()):
    has_n_missing = na_counts == n_missing
    n_hospital = na_ace_data.loc[has_n_missing, "hospital_reqd"].sum()
    print(f"{has_n_missing.sum()} examples are missing {n_missing} features")
    print(f"{n_hospital} require hospital treatment\n")

59 examples are missing 1 features
14 require hospital treatment

3 examples are missing 2 features
0 require hospital treatment

2 examples are missing 3 features
0 require hospital treatment

6 examples are missing 4 features
0 require hospital treatment

1 examples are missing 7 features
0 require hospital treatment



There are 12 examples with more than one NaN value. All of these are children who
did not require hospital care, so these examples are removed.

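The remaining examples have exactly one NaN value each: the missing value is imputed (from the mode, the mean, or the age-group mean, depending on the feature) and the examples are then added to the training data.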

In [8]:
# Keep only the examples that are missing exactly one feature. Take an explicit
# copy: later cells fill values in place with .loc, and assigning into a frame that
# was produced by filtering another frame triggers pandas' SettingWithCopyWarning.
na_ace_data = na_ace_data[na_counts == 1].copy()

# Per-column count of missing values among the retained examples.
missing_features = na_ace_data.isna().sum(axis=0)

# Report only the columns that actually contain missing values.
for feature, n_missing in missing_features[missing_features > 0].items():
    print(f"{n_missing} examples are missing {feature}\n")

1 examples are missing activity_level

3 examples are missing gut_feeling

2 examples are missing ox_sat

2 examples are missing resp_rate

1 examples are missing heart_rate

50 examples are missing temp



activity_level and gut_feeling can be set to overall mode

In [9]:
# Fill the missing categorical values with the most frequent category.
# The mode is computed from the training split (X_train) rather than from the whole
# ace_data frame: ace_data also contains the held-out test examples, and the rows
# filled here are later appended to the training data, so using it would let test
# data influence training values.
activity_is_na = na_ace_data.activity_level.isna()
na_ace_data.loc[activity_is_na, "activity_level"] = X_train.activity_level.mode()[0]

gut_is_na = na_ace_data.gut_feeling.isna()
na_ace_data.loc[gut_is_na, "gut_feeling"] = X_train.gut_feeling.mode()[0]

ox_sat and temp can be set to overall mean

In [10]:
# Fill the missing ox_sat and temp values with the column mean.
# The mean is computed from the training split (X_train) rather than from the whole
# ace_data frame: ace_data also contains the held-out test examples, and the rows
# filled here are later appended to the training data, so using it would let test
# data influence training values.
ox_is_na = na_ace_data.ox_sat.isna()
na_ace_data.loc[ox_is_na, "ox_sat"] = X_train.ox_sat.mean()

temp_is_na = na_ace_data.temp.isna()
na_ace_data.loc[temp_is_na, "temp"] = X_train.temp.mean()


heart_rate and resp_rate can be set to the mean for the example's age_range

In [11]:
# Fill each missing heart_rate with the mean heart_rate of examples in the same
# age_range. The mean is taken over the training split (X_train) rather than the
# whole ace_data frame, which also contains the held-out test examples; the rows
# filled here end up in the training data.
hr_is_na = na_ace_data.heart_rate.isna()
na_ace_data.loc[hr_is_na, "heart_rate"] = (
    na_ace_data.loc[hr_is_na, "age_range"]
    .apply(
        lambda age_range: X_train.loc[X_train.age_range == age_range, "heart_rate"].mean()
    )
    # apply() on a categorical column may not return a float Series, so cast explicitly.
    .astype("float")
)


In [12]:
# Fill each missing resp_rate with the mean resp_rate of examples in the same
# age_range. The mean is taken over the training split (X_train) rather than the
# whole ace_data frame, which also contains the held-out test examples; the rows
# filled here end up in the training data.
resp_is_na = na_ace_data.resp_rate.isna()
na_ace_data.loc[resp_is_na, "resp_rate"] = (
    na_ace_data.loc[resp_is_na, "age_range"]
    .apply(
        lambda age_range: X_train.loc[X_train.age_range == age_range, "resp_rate"].mean()
    )
    # apply() on a categorical column may not return a float Series, so cast explicitly.
    .astype("float")
)

Append the imputed NaN examples and their labels to X_train / y_train.

In [13]:
# Append the imputed examples (features and labels) to the training split.
# This cell reassigns X_train / y_train from themselves, so running it twice would
# append the same rows twice. Dropping any rows that already carry the imputed
# examples' index labels first makes a re-run harmless; on a first run nothing is
# dropped (errors="ignore") and the result is a plain concatenation.
imputed_idx = na_ace_data.index

X_train = pd.concat([
    X_train.drop(index=imputed_idx, errors="ignore"),
    na_ace_data.drop("hospital_reqd", axis=1),
])
y_train = pd.concat([
    y_train.drop(index=imputed_idx, errors="ignore"),
    na_ace_data.hospital_reqd,
])

## Oversampling to address imbalance in positive / negative hospital required

Oversample X_train using SMOTENC (the SMOTE variant that handles mixed numeric and categorical features) from the imblearn package.

In [14]:
from imblearn.over_sampling import SMOTENC

# SMOTENC needs the positional indices of the categorical columns. Every column
# that is not numeric is treated as categorical.
# select_dtypes("number") is used instead of comparing each dtype against the
# strings "int" / "float": those strings only match the platform-default integer
# and float widths, so e.g. int32 or float32 columns would be misclassified as
# categorical.
numeric_cols = set(X_train.select_dtypes(include="number").columns)
cat_feature_idxs = [
    i for i, col in enumerate(X_train.columns) if col not in numeric_cols
]

smote = SMOTENC(random_state=1,
                categorical_features=cat_feature_idxs)

# Resample the training data only; the test split is not touched here.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

## One Hot Encode Features for numeric modelling methods
## include scaled versions of data for models that will benefit

In [15]:
# Split the training columns into categorical and non-categorical feature lists,
# preserving column order. The "ethnicity" column is excluded from both lists.
candidate_features = [col for col in X_train.columns if col != "ethnicity"]

cat_features = []
num_features = []
for col in candidate_features:
    if X_train[col].dtype.name == "category":
        cat_features.append(col)
    else:
        num_features.append(col)

In [16]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Both transformers are fitted on the (non-resampled) training split and then
# reused for every frame below.
mm_scaler = MinMaxScaler().fit(X_train[num_features])

# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases, and the old name is rejected
# there. Try the new name first and fall back to the old one so the cell runs on
# either version.
try:
    oh_enc = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_enc = OneHotEncoder(sparse=False)
oh_enc.fit(X_train[cat_features])

# Column names for the encoded matrix, "<feature>_<category>", in the same order
# as the encoder's output columns.
one_hot_feature_names = [
    f"{feature}_{category}"
    for feature, categories in zip(cat_features, oh_enc.categories_)
    for category in categories
]

# Build two versions of each frame: one-hot + raw numerics, and one-hot + min-max
# scaled numerics. Indexes are reset so the pieces line up row by row on concat.
ohe_dfs = []
scaled_ohe_dfs = []
for df in [X_train, X_train_res, X_test]:
    oh_data = pd.DataFrame(oh_enc.transform(df[cat_features]),
                           columns=one_hot_feature_names)
    num_data = df[num_features].reset_index(drop=True)
    scaled_num_data = pd.DataFrame(mm_scaler.transform(num_data),
                                   columns=num_features)

    df_ohe = pd.concat([oh_data, num_data], axis=1)
    ohe_dfs.append(df_ohe)

    df_ohe_scaled = pd.concat([oh_data, scaled_num_data], axis=1)
    scaled_ohe_dfs.append(df_ohe_scaled)

X_train_ohe, X_train_res_ohe, X_test_ohe = ohe_dfs
X_train_ohe_scaled, X_train_res_ohe_scaled, X_test_ohe_scaled = scaled_ohe_dfs

## Target encode categorical features

In [19]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Fit the target encoder for the categorical features on the (non-resampled)
# training split and its labels.
target_enc = LeaveOneOutEncoder(cols=cat_features)
target_enc.fit(X_train, y_train)

# NOTE(review): transform() is called without y below, including for the training
# frames — confirm whether the encoder still applies leave-one-out statistics in
# that case, or whether the training frames need transform(X, y).
target_encd_dfs = []
scaled_target_encd_dfs = []
for df in (X_train, X_train_res, X_test):
    # Target-encoded frame with the numeric columns left at their raw values.
    target_encd_df = target_enc.transform(df).reset_index(drop=True)

    # Same frame with the numeric columns swapped for their min-max scaled
    # versions; the scaled columns are placed after the encoded ones.
    num_data = df[num_features].reset_index(drop=True)
    scaled_num_data = pd.DataFrame(
        mm_scaler.transform(num_data),
        columns=num_features,
    )
    encoded_cat_data = target_encd_df.drop(columns=num_features)
    scaled_target_encd_df = pd.concat(
        [encoded_cat_data, scaled_num_data], axis="columns"
    )

    target_encd_dfs.append(target_encd_df)
    scaled_target_encd_dfs.append(scaled_target_encd_df)

X_train_target, X_train_res_target, X_test_target = target_encd_dfs
X_train_target_scaled, X_train_res_target_scaled, X_test_target_scaled = (
    scaled_target_encd_dfs
)

  elif pd.api.types.is_categorical(cols):


In [20]:
# Display preferences: show at most 20 rows, and never truncate columns.
for option_name, option_value in (
    ("display.max_rows", 20),
    ("display.max_columns", None),
):
    pd.set_option(option_name, option_value)

In [29]:
from pathlib import Path

train_test_data_dir = "../data/train_test_data/"

# Create the output directory if it does not exist yet, so the cell also works on
# a fresh checkout instead of failing on the first write.
output_dir = Path(train_test_data_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Every frame/series to persist, keyed by the file stem it is saved under
# ("<name>.pkl"). Grouped as: training split, resampled training split, test split.
datasets_to_save = {
    "X_train": X_train,
    "X_train_ohe": X_train_ohe,
    "X_train_target": X_train_target,
    "X_train_ohe_scaled": X_train_ohe_scaled,
    "X_train_target_scaled": X_train_target_scaled,
    "y_train": y_train,

    "X_train_res": X_train_res,
    "X_train_res_ohe": X_train_res_ohe,
    "X_train_res_target": X_train_res_target,
    "X_train_res_ohe_scaled": X_train_res_ohe_scaled,
    "X_train_res_target_scaled": X_train_res_target_scaled,
    "y_train_res": y_train_res,

    "X_test": X_test,
    "X_test_ohe": X_test_ohe,
    "X_test_target": X_test_target,
    "X_test_ohe_scaled": X_test_ohe_scaled,
    "X_test_target_scaled": X_test_target_scaled,
    "y_test": y_test,
}

for name, dataset in datasets_to_save.items():
    dataset.to_pickle(output_dir / f"{name}.pkl")
