In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the original ACE export and, in the same step, swap every cell holding
# the literal string "None" for NaN so pandas treats it as missing.
raw_csv_path = "../data/ace_data_orig.csv"
ace_data = pd.read_csv(raw_csv_path).replace("None", np.nan)

In [3]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hospital_reqd     499 non-null    object
 1   referral_from     498 non-null    object
 2   age               499 non-null    int64 
 3   address           499 non-null    object
 4   ethnicity         497 non-null    object
 5   gender            499 non-null    object
 6   allergies         499 non-null    object
 7   referral_date     499 non-null    object
 8   referral_time     499 non-null    object
 9   illness_severity  497 non-null    object
 10  activity_level    496 non-null    object
 11  gut_feeling       494 non-null    object
 12  ox_sat            489 non-null    object
 13  resp_rate         488 non-null    object
 14  heart_rate        490 non-null    object
 15  temp              438 non-null    object
 16  sepsis            499 non-null    object
 17  safeguarding    

In [4]:
# Remove the display limit on the number of columns (this option stays in
# effect for the rest of the session), then preview the first rows.
pd.set_option("display.max_columns", None)
ace_data.head()

Unnamed: 0,hospital_reqd,referral_from,age,address,ethnicity,gender,allergies,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding
0,N,CCDA,8,BD07,Indian,F,NKA,Winter,Morning,Moderate,usual,low concern,97,20.0,118.0,36.5,None noted,N
1,N,A&E,11,BD03,Pakistani,F,NKA,Winter,Morning,Mild,lower,low concern,96,20.0,109.0,37.0,None noted,N
2,N,CCDA,3,BD04,Slovak,F,NKDA,Winter,Afternoon,Mild,usual,well,96,28.0,140.0,37.0,None noted,N
3,N,GP,3,BD06,British,M,NKDA,Winter,Afternoon,Mild,usual,low concern,98,28.0,104.0,36.8,None noted,N
4,N,GP,3,BD09,Pakistani,M,NKA,Winter,Afternoon,Mild,usual,well,97,,,,None noted,N


In [5]:
# Tidy the categorical columns: trim surrounding whitespace from every value
# and store each column with the pandas "category" dtype.
# (allergies is excluded here - it is treated separately.)

cat_features = [
    'referral_from', 'address', 'ethnicity', 'gender',
    'referral_date', 'referral_time', 'illness_severity',
    'activity_level', 'gut_feeling', 'sepsis', 'safeguarding',
]

# Build all cleaned columns first, then swap them into the frame in one step.
ace_data = ace_data.assign(**{
    name: ace_data[name].str.strip().astype("category")
    for name in cat_features
})

In [6]:
# Split the free-text allergies column into three Y/N flag columns
# (food_allergy, drug_allergy, other_allergy). A flag is "Y" when the
# keyword appears, case-sensitively, anywhere in the original text.

allergy_notes = ace_data["allergies"]

for keyword in ("Food", "Drug", "Other"):
    flags = allergy_notes.map(lambda note: "Y" if keyword in note else "N")
    ace_data[f"{keyword.lower()}_allergy"] = flags.astype("category")

# The source column is fully represented by the flags, so remove it.
ace_data = ace_data.drop(columns="allergies")

In [7]:
# The numeric readings were loaded as object columns (they did contain
# "none" values), so cast them to float in a single astype call.

float_features = ["ox_sat", "resp_rate", "heart_rate", "temp"]
ace_data = ace_data.astype({name: "float" for name in float_features})

In [8]:
# Encode hospital_reqd as an integer flag for data analysis:
# 1 where the value is exactly "Y", 0 for anything else.

needs_hospital = ace_data["hospital_reqd"] == "Y"
ace_data["hospital_reqd"] = needs_hospital.astype("int")

In [9]:
# Derived ethnicity features; the raw ethnicity column is removed at the end.

# Work on a plain object view so new labels such as "other" can be introduced.
reported_ethnicity = ace_data["ethnicity"].astype(object)

# simple_ethnicity: keep "Pakistani" / "British" as reported; every other
# value, including a missing one, becomes "other".
keeps_own_label = reported_ethnicity.isin(["Pakistani", "British"])
ace_data["simple_ethnicity"] = (
    reported_ethnicity.where(keeps_own_label, "other").astype("category")
)

# group_ethnicity: european / asian / other.
# NOTE(review): the unusual spellings below presumably mirror the raw data
# values exactly - verify against the source file before "correcting" them.
asian = ["Indian", "Mixed Asian", "Pakistani", "white Asain", "British Asian",
         "Asian", "Sri Lankan", "Other Asian background", "Bangladeshi"]

european = ["Slovak", "British", "Other white background", "Czech Republic",
            "White Europeon", "White British", "CommonWealth Russian",
            "Other European", "Mixed White"]

# The first matching condition wins, so "asian" takes precedence over
# "european"; anything in neither list falls through to "other".
group_labels = np.select(
    [reported_ethnicity.isin(asian).to_numpy(),
     reported_ethnicity.isin(european).to_numpy()],
    ["asian", "european"],
    default="other",
)
ace_data["group_ethnicity"] = (
    pd.Series(group_labels, index=ace_data.index).astype("category")
)

ace_data = ace_data.drop(columns="ethnicity")

In [10]:
# Re-check column names, non-null counts and dtypes after the conversions above.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   hospital_reqd     499 non-null    int64   
 1   referral_from     498 non-null    category
 2   age               499 non-null    int64   
 3   address           499 non-null    category
 4   gender            499 non-null    category
 5   referral_date     499 non-null    category
 6   referral_time     499 non-null    category
 7   illness_severity  497 non-null    category
 8   activity_level    496 non-null    category
 9   gut_feeling       494 non-null    category
 10  ox_sat            489 non-null    float64 
 11  resp_rate         488 non-null    float64 
 12  heart_rate        490 non-null    float64 
 13  temp              438 non-null    float64 
 14  sepsis            499 non-null    category
 15  safeguarding      499 non-null    category
 16  food_allergy      499 non-

In [11]:
# ox_sat_low feature: "Y" when ox_sat is below 94, otherwise "N".
# NOTE(review): a missing ox_sat reading compares as False and is therefore
# labelled "N" - confirm that this is the intended handling.
below_threshold = ace_data["ox_sat"].lt(94)
ace_data["ox_sat_low"] = (
    below_threshold.map({True: "Y", False: "N"}).astype("category")
)

In [12]:
# age_range feature, derived from the age column:
#   "pre_school" = any age below 5 (the fallback label)
#   "primary"    = 5 up to, but not including, 12
#   "secondary"  = 12 and over

patient_age = ace_data["age"]

primary = patient_age.ge(5) & patient_age.lt(12)
secondary = patient_age.ge(12)

age_labels = np.select(
    [secondary.to_numpy(), primary.to_numpy()],
    ["secondary", "primary"],
    default="pre_school",
)
ace_data["age_range"] = (
    pd.Series(age_labels, index=ace_data.index).astype("category")
)

In [13]:
def set_low_norm_high(feature, feature_range, age_range, new_feature_name,
                      data=None):
    """Label a numeric reading as "low" / "normal" / "high" for one age band.

    The label column is created (filled with "normal") the first time it is
    requested; each call then overwrites the label only for rows whose age
    falls inside ``age_range``. Call once per age band to build the full
    column.

    Parameters
    ----------
    feature : str
        Name of the numeric column to classify.
    feature_range : sequence
        ``feature_range[0]`` and ``feature_range[1]`` are the lowest and
        highest values still counted as normal; strictly below the first is
        "low", strictly above the second is "high".
    age_range : sequence
        ``age_range[0]`` (inclusive) and ``age_range[1]`` (exclusive) bound
        the ages this call applies to.
    new_feature_name : str
        Name of the label column to create or update.
    data : pandas.DataFrame, optional
        Frame to modify in place. Must contain an ``age`` column and
        ``feature``. Defaults to the notebook-level ``ace_data`` frame, which
        keeps existing four-argument calls working unchanged.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Rows with a missing reading, and rows whose age is not covered by any
    band passed in, never match a mask and therefore keep the "normal"
    default.
    """
    if data is None:
        # Fall back to the notebook's working frame (looked up at call time).
        data = ace_data

    if new_feature_name not in data.columns:
        data[new_feature_name] = "normal"

    in_age_band = ((data["age"] >= age_range[0]) &
                   (data["age"] < age_range[1]))
    readings = data[feature]

    # Comparisons against NaN are False, so missing readings match neither mask.
    too_low = in_age_band & (readings < feature_range[0])
    too_high = in_age_band & (readings > feature_range[1])

    data.loc[too_low, new_feature_name] = "low"
    data.loc[too_high, new_feature_name] = "high"

In [14]:
# low / high / normal features according to ace and apls criteria
#
# Each spec names the numeric column to classify, the label column to write,
# and a list of age bands with the value limits that apply to each band.
# Bands are written as (age_range, feature_range) pairs so that every age
# range sits next to its own limits.

def _band_spec(feature, new_feature_name, bands):
    """Build one spec dict from (age_range, feature_range) pairs."""
    return {
        "feature": feature,
        "age_ranges": [age_range for age_range, _ in bands],
        "feature_ranges": [feature_range for _, feature_range in bands],
        "new_feature_name": new_feature_name,
    }


ace_resp_rate_feature = _band_spec(
    "resp_rate", "ace_resp_rate_cat",
    [((0, 5), (25, 30)),
     ((5, 12), (20, 25)),
     ((12, 18), (15, 20))],
)

ace_heart_rate_feature = _band_spec(
    "heart_rate", "ace_heart_rate_cat",
    [((0, 5), (95, 140)),
     ((5, 12), (80, 120)),
     ((12, 18), (60, 100))],
)

apls_resp_rate_feature = _band_spec(
    "resp_rate", "apls_resp_rate_cat",
    [((0, 2), (20, 40)),
     ((2, 8), (20, 30)),
     ((8, 12), (15, 25)),
     ((12, 18), (12, 24))],
)

apls_heart_rate_feature = _band_spec(
    "heart_rate", "apls_heart_rate_cat",
    [((0, 2), (100, 160)),
     ((2, 3), (100, 150)),
     ((3, 4), (90, 140)),
     ((4, 6), (80, 135)),
     ((6, 8), (80, 130)),
     ((8, 12), (70, 120)),
     ((12, 14), (65, 115)),
     ((14, 18), (60, 110))],
)

high_low_features = [
    ace_resp_rate_feature,
    ace_heart_rate_feature,
    apls_resp_rate_feature,
    apls_heart_rate_feature,
]

# Build one low / normal / high label column per spec: classify each age band
# in turn, then store the finished column as a categorical.
for spec in high_low_features:
    label_column = spec["new_feature_name"]
    bands_with_limits = zip(spec["age_ranges"], spec["feature_ranges"])

    # Age ranges and limits are matched by position within the spec.
    for band, limits in bands_with_limits:
        set_low_norm_high(
            feature=spec["feature"],
            feature_range=limits,
            age_range=band,
            new_feature_name=label_column,
        )

    ace_data[label_column] = ace_data[label_column].astype("category")

In [15]:
# meets_ace_criteria: "Y" only when every check below holds, otherwise "N".
# NOTE(review): the two != tests presumably let rows with a missing
# gut_feeling or illness_severity through - confirm that is intended.

readings_ok = ((ace_data["ox_sat_low"] == "N") &
               (ace_data["ace_heart_rate_cat"] == "normal") &
               (ace_data["ace_resp_rate_cat"] == "normal"))

assessment_ok = ((ace_data["gut_feeling"] != "unwell") &
                 (ace_data["illness_severity"] != "Moderate"))

meets_ace_criteria = readings_ok & assessment_ok

ace_data["meets_ace_criteria"] = (
    meets_ace_criteria.map({True: "Y", False: "N"}).astype("category")
)

In [16]:
# Persist the enriched frame twice: as a pickle and as a CSV
# (the CSV is written with the default settings, so it includes the index).
pickle_path = "../data/ace_data_extra_feats.pkl"
csv_path = "../data/ace_data_extra_feats.csv"

ace_data.to_pickle(pickle_path)
ace_data.to_csv(csv_path)

In [17]:
# Final check of column names, non-null counts and dtypes, including the derived features.
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   hospital_reqd        499 non-null    int64   
 1   referral_from        498 non-null    category
 2   age                  499 non-null    int64   
 3   address              499 non-null    category
 4   gender               499 non-null    category
 5   referral_date        499 non-null    category
 6   referral_time        499 non-null    category
 7   illness_severity     497 non-null    category
 8   activity_level       496 non-null    category
 9   gut_feeling          494 non-null    category
 10  ox_sat               489 non-null    float64 
 11  resp_rate            488 non-null    float64 
 12  heart_rate           490 non-null    float64 
 13  temp                 438 non-null    float64 
 14  sepsis               499 non-null    category
 15  safeguarding         49

In [18]:
# Spot-check 10 randomly drawn rows; no random_state is passed, so the rows
# shown will differ from run to run.
ace_data.sample(10)

Unnamed: 0,hospital_reqd,referral_from,age,address,gender,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding,food_allergy,drug_allergy,other_allergy,simple_ethnicity,group_ethnicity,ox_sat_low,age_range,ace_resp_rate_cat,ace_heart_rate_cat,apls_resp_rate_cat,apls_heart_rate_cat,meets_ace_criteria
329,0,A&E,6,BD18,M,Summer,Morning,Mild,usual,well,96.0,26.0,112.0,36.6,None noted,N,N,N,N,Pakistani,asian,N,primary,high,normal,normal,normal,N
385,1,GP,4,BD17,F,Autumn,Afternoon,Mild,usual,well,95.0,36.0,132.0,36.0,None noted,N,N,N,N,British,european,N,pre_school,high,normal,high,normal,N
334,0,ED,7,BD03,F,Summer,Morning,Mild,usual,well,95.0,24.0,125.0,38.3,None noted,N,N,N,N,other,european,N,primary,normal,high,normal,normal,N
102,0,A&E,5,BD05,M,Spring,Morning,Mild,usual,well,96.0,35.0,142.0,36.8,None noted,N,N,N,N,British,european,N,primary,high,high,high,high,N
291,0,CCDA,3,BD09,M,Winter,Afternoon,Mild,usual,well,94.0,31.0,112.0,36.3,None noted,N,N,Y,N,Pakistani,asian,N,pre_school,high,normal,high,normal,N
151,1,A&E,3,BD13,M,Summer,Evening,Mild,usual,,97.0,24.0,106.0,36.9,None noted,N,Y,N,N,other,european,N,pre_school,low,normal,normal,normal,N
321,0,A&E,2,BD06,M,Spring,Morning,Mild,usual,well,95.0,31.0,142.0,,None noted,N,N,N,N,British,european,N,pre_school,high,high,high,normal,N
435,0,GP,5,BD09,F,Autumn,Morning,Mild,lower,low concern,97.0,28.0,132.0,36.7,None noted,Y,Y,N,N,Pakistani,asian,N,primary,high,high,normal,normal,N
389,0,GP,5,BD09,M,Autumn,Afternoon,Mild,usual,well,96.0,24.0,120.0,36.0,None noted,N,N,N,N,Pakistani,asian,N,primary,normal,normal,normal,normal,Y
97,0,GP,12,BD07,F,Spring,Morning,Mild,lower,well,98.0,20.0,90.0,36.7,None noted,N,N,N,N,Pakistani,asian,N,secondary,normal,normal,normal,normal,Y
