In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the ACE dataset from its pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
ace_data = pd.read_pickle("../data/ace_data_prepped.pkl")

In [3]:
# Summarise the columns, non-null counts and dtypes to see which features have missing values.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   id                   499 non-null    int64   
 1   hospital_reqd        499 non-null    int64   
 2   referral_from        498 non-null    category
 3   age                  499 non-null    int64   
 4   address              499 non-null    category
 5   ethnicity            497 non-null    category
 6   gender               499 non-null    category
 7   referral_date        499 non-null    category
 8   referral_time        499 non-null    category
 9   illness_severity     497 non-null    category
 10  activity_level       496 non-null    category
 11  gut_feeling          494 non-null    category
 12  ox_sat               489 non-null    float64 
 13  resp_rate            488 non-null    float64 
 14  heart_rate           490 non-null    float64 
 15  temp                 43

In [4]:
# Preview the first rows of the dataset.
ace_data.head()

Unnamed: 0,id,hospital_reqd,referral_from,age,address,ethnicity,gender,referral_date,referral_time,illness_severity,...,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_heart_rate_cat,ace_resp_rate_cat,meets_ace_criteria,apls_heart_rate_cat,apls_resp_rate_cat
0,1,0,CCDA,8,BD07,Indian,F,Winter,Morning,Moderate,...,N,other,asian,N,primary,normal,normal,N,normal,normal
1,2,0,A&E,11,BD03,Pakistani,F,Winter,Morning,Mild,...,N,Pakistani,asian,N,primary,normal,normal,Y,normal,normal
2,3,0,CCDA,3,BD04,Slovak,F,Winter,Afternoon,Mild,...,N,other,european,N,pre_school,normal,normal,Y,normal,normal
3,4,0,GP,3,BD06,British,M,Winter,Afternoon,Mild,...,N,British,european,N,pre_school,normal,normal,Y,normal,normal
4,5,0,GP,3,BD09,Pakistani,M,Winter,Afternoon,Mild,...,N,Pakistani,asian,N,pre_school,normal,normal,Y,normal,normal


### split data into examples with nas and complete examples:

In [5]:
# A row is an "na example" when at least one of its columns is missing.
na_examples_mask = ace_data.isna().any(axis="columns")

# Partition the rows on that flag: complete examples and incomplete examples.
clean_ace_data = ace_data.loc[~na_examples_mask]
na_ace_data = ace_data.loc[na_examples_mask]

### train test split the clean examples:

maintain proportion of hospital required examples

In [6]:
# Separate the target column from the features of the complete examples.
clean_features = clean_ace_data.drop(columns="hospital_reqd")
clean_labels = clean_ace_data["hospital_reqd"]

# Hold out 33% of the complete examples for testing. Stratifying on the target
# keeps the proportion of hospital-required examples the same in both splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    clean_features,
    clean_labels,
    test_size=0.33,
    stratify=clean_labels,
    random_state=1,
)

### explore and fix na data:

In [7]:
# Number of missing features in each incomplete example.
na_counts = na_ace_data.isna().sum(axis=1)

# For each distinct number of missing features, report how many examples have
# that many gaps and how many of those examples required hospital treatment.
for n_missing in sorted(na_counts.unique()):
    has_n_missing = na_counts == n_missing
    n_examples = has_n_missing.sum()
    n_hospital = na_ace_data.loc[has_n_missing, "hospital_reqd"].sum()
    print(f"{n_examples} examples are missing {n_missing} features")
    print(f"{n_hospital} require hospital treatment\n")

61 examples are missing 1 features
14 require hospital treatment

3 examples are missing 2 features
0 require hospital treatment

2 examples are missing 3 features
0 require hospital treatment

6 examples are missing 4 features
0 require hospital treatment

1 examples are missing 7 features
0 require hospital treatment



there are 12 examples that have more than one nan value - all of these are children
not requiring hospital care - therefore remove

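other examples with only one na value: the missing value can be imputed (overall mode, overall mean, or age_range mean, depending on the feature) and the example added to the training set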

In [8]:
# Keep only the examples with exactly one missing value. The explicit copy makes
# na_ace_data an independent frame, so the .loc assignments that fill in the
# missing values in the following cells modify it directly instead of writing
# into a filtered selection (which triggers pandas' SettingWithCopyWarning).
na_ace_data = na_ace_data[na_counts == 1].copy()

# Number of missing values per feature among the remaining examples.
missing_features = na_ace_data.isna().sum(axis=0)

# Report only the features that actually have missing values.
for feature, n_missing in missing_features[missing_features > 0].items():
    print(f"{n_missing} examples are missing {feature}\n")

2 examples are missing ethnicity

1 examples are missing activity_level

3 examples are missing gut_feeling

2 examples are missing ox_sat

2 examples are missing resp_rate

1 examples are missing heart_rate

50 examples are missing temp



ethnicity na = "Not Stated" (one of existing categories - simple and group ethnicity already correctly classified as "other")

In [9]:
# Missing ethnicity is recorded as "Not Stated".
# NOTE(review): ethnicity is a category column, so this assumes "Not Stated" is
# already one of its categories - confirm.
ethnicity_is_na = na_ace_data["ethnicity"].isna()
na_ace_data.loc[ethnicity_is_na, "ethnicity"] = "Not Stated"

activity_level and gut_feeling can be set to overall mode

In [10]:
# Most frequent value of each assessment across the full dataset (the first
# mode is used if there is a tie).
# NOTE(review): ace_data also contains the rows held out in X_test, so these
# fill values are not derived from the training data alone.
overall_activity_mode = ace_data["activity_level"].mode().iloc[0]
overall_gut_mode = ace_data["gut_feeling"].mode().iloc[0]

# Rows of the incomplete examples that lack each assessment.
activity_is_na = na_ace_data["activity_level"].isna()
gut_is_na = na_ace_data["gut_feeling"].isna()

# Fill the gaps with the overall mode.
na_ace_data.loc[activity_is_na, "activity_level"] = overall_activity_mode
na_ace_data.loc[gut_is_na, "gut_feeling"] = overall_gut_mode

ox_sat and temp can be set to overall mean

In [11]:
# Mean of each measurement across the full dataset (missing values are skipped
# by Series.mean()).
# NOTE(review): ace_data also contains the rows held out in X_test, so these
# means are not derived from the training data alone.
overall_ox_sat_mean = ace_data["ox_sat"].mean()
overall_temp_mean = ace_data["temp"].mean()

# Rows of the incomplete examples that lack each measurement.
ox_is_na = na_ace_data["ox_sat"].isna()
temp_is_na = na_ace_data["temp"].isna()

# Fill the gaps with the overall mean.
na_ace_data.loc[ox_is_na, "ox_sat"] = overall_ox_sat_mean
na_ace_data.loc[temp_is_na, "temp"] = overall_temp_mean


heart_rate and resp_rate can be set to mean for age_range

In [12]:
def _mean_heart_rate_for(age_range):
    """Return the mean heart_rate of all ace_data rows in the given age_range."""
    in_age_range = ace_data.age_range == age_range
    return ace_data[in_age_range].heart_rate.mean()


# Rows of the incomplete examples that lack a heart rate.
hr_is_na = na_ace_data["heart_rate"].isna()

# Look up the age-range mean for each of those rows; the cast makes sure the
# result is a plain float series before it is written into the numeric column.
age_ranges_missing_hr = na_ace_data.loc[hr_is_na, "age_range"]
imputed_heart_rate = age_ranges_missing_hr.apply(_mean_heart_rate_for).astype("float")
na_ace_data.loc[hr_is_na, "heart_rate"] = imputed_heart_rate


In [13]:
def _mean_resp_rate_for(age_range):
    """Return the mean resp_rate of all ace_data rows in the given age_range."""
    in_age_range = ace_data.age_range == age_range
    return ace_data[in_age_range].resp_rate.mean()


# Rows of the incomplete examples that lack a respiratory rate.
resp_is_na = na_ace_data["resp_rate"].isna()

# Look up the age-range mean for each of those rows; the cast makes sure the
# result is a plain float series before it is written into the numeric column.
age_ranges_missing_resp = na_ace_data.loc[resp_is_na, "age_range"]
imputed_resp_rate = age_ranges_missing_resp.apply(_mean_resp_rate_for).astype("float")
na_ace_data.loc[resp_is_na, "resp_rate"] = imputed_resp_rate

concat na_examples / labels to X_train / y_train

In [14]:
# Append the imputed examples to the training split only; X_test / y_test keep
# containing complete examples exclusively.
# verify_integrity=True makes pandas raise if the combined index has duplicate
# labels. On a fresh run the two frames hold different rows of ace_data, so
# nothing changes; if this cell is accidentally run a second time it now fails
# loudly instead of silently adding the imputed rows to the training set twice.
X_train = pd.concat(
    [X_train, na_ace_data.drop("hospital_reqd", axis=1)],
    verify_integrity=True,
)
y_train = pd.concat(
    [y_train, na_ace_data.hospital_reqd],
    verify_integrity=True,
)

oversample X_train using SMOTENC (the SMOTE variant for mixed numeric and categorical features) to address imbalance in examples requiring hospital treatment

In [15]:
from imblearn.over_sampling import SMOTENC

# SMOTENC needs the positional indices of the categorical columns. Every column
# that is not numeric is treated as categorical. select_dtypes is used instead of
# comparing each dtype against the strings "int" / "float": those strings only
# match the platform-default integer / float width, so e.g. int32 or float32
# columns would have been classed as categorical.
numeric_cols = set(X_train.select_dtypes(include="number").columns)
cat_feature_idxs = [
    idx for idx, col in enumerate(X_train.columns) if col not in numeric_cols
]

# Fixed random_state so the synthetic examples are reproducible.
# NOTE(review): X_train still contains the `id` column at this point, so it is
# treated as a numeric feature when synthesising examples - confirm it is
# dropped before modelling.
smote = SMOTENC(random_state=1, categorical_features=cat_feature_idxs)

# Oversample the training data only; the test split is left untouched.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [16]:
# Persist every split (original and oversampled training data plus the test
# data) as a pickle file, in a fixed order.
pickle_outputs = {
    "../data/X_train.pkl": X_train,
    "../data/y_train.pkl": y_train,
    "../data/X_train_res.pkl": X_train_res,
    "../data/y_train_res.pkl": y_train_res,
    "../data/X_test.pkl": X_test,
    "../data/y_test.pkl": y_test,
}
for pickle_path, split in pickle_outputs.items():
    split.to_pickle(pickle_path)