In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw referral export. Cells holding exactly the string "None" are
# placeholders for missing values, so swap them for NaN straight away.
ace_data = (
    pd.read_csv("../data/ace_referral_data.csv")
      .replace("None", np.nan)
)

In [4]:
# summarise column dtypes and non-null counts straight after loading
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                499 non-null    int64 
 1   hospital_reqd     499 non-null    object
 2   referral_from     498 non-null    object
 3   age               499 non-null    int64 
 4   address           499 non-null    object
 5   ethnicity         497 non-null    object
 6   gender            499 non-null    object
 7   allergies         499 non-null    object
 8   referral_date     499 non-null    object
 9   referral_time     499 non-null    object
 10  illness_severity  497 non-null    object
 11  activity_level    496 non-null    object
 12  gut_feeling       494 non-null    object
 13  ox_sat            489 non-null    object
 14  resp_rate         488 non-null    object
 15  heart_rate        490 non-null    object
 16  temp              438 non-null    object
 17  sepsis          

In [5]:
# preview the first rows of the loaded frame
ace_data.head()

Unnamed: 0,id,hospital_reqd,referral_from,age,address,ethnicity,gender,allergies,referral_date,referral_time,illness_severity,activity_level,gut_feeling,ox_sat,resp_rate,heart_rate,temp,sepsis,safeguarding
0,1,N,CCDA,8,BD07,Indian,F,NKA,Winter,Morning,Moderate,usual,low concern,97,20.0,118.0,36.5,None noted,N
1,2,N,A&E,11,BD03,Pakistani,F,NKA,Winter,Morning,Mild,lower,low concern,96,20.0,109.0,37.0,None noted,N
2,3,N,CCDA,3,BD04,Slovak,F,NKDA,Winter,Afternoon,Mild,usual,well,96,28.0,140.0,37.0,None noted,N
3,4,N,GP,3,BD06,British,M,NKDA,Winter,Afternoon,Mild,usual,low concern,98,28.0,104.0,36.8,None noted,N
4,5,N,GP,3,BD09,Pakistani,M,NKA,Winter,Afternoon,Mild,usual,well,97,,,,None noted,N


In [6]:
# Categorical columns: trim surrounding whitespace from every label, then
# store the column with the pandas "category" dtype.
# (allergies is left out on purpose - it is reworked in its own cell)

cat_features = [
    'referral_from', 'address', 'ethnicity', 'gender',
    'referral_date', 'referral_time',
    'illness_severity', 'activity_level', 'gut_feeling',
    'sepsis', 'safeguarding',
]

ace_data = ace_data.assign(**{
    column: ace_data[column].str.strip().astype("category")
    for column in cat_features
})

In [7]:
# refactor allergy feature: one Y/N category column per allergy type
# (food_allergy, drug_allergy, other_allergy)

for allergy in ["Food", "Drug", "Other"]:
    allergy_name = allergy.lower() + "_allergy"
    # Plain (non-regex), case-sensitive substring test on the free-text
    # allergies column. na=False labels a missing entry as "N"; the previous
    # `allergy in x` lambda raised a TypeError on NaN.
    has_allergy = ace_data.allergies.str.contains(allergy, regex=False, na=False)
    ace_data[allergy_name] = (
        has_allergy.map({True: "Y", False: "N"}).astype("category")
    )

# drop original allergy feature as no longer required
ace_data = ace_data.drop(columns="allergies")

In [8]:
# The vital-sign columns hold numbers but were loaded with object dtype
# (they contained 'None' placeholders); cast them all to float in one go.

float_features = ["ox_sat", "resp_rate", "heart_rate", "temp"]
ace_data = ace_data.astype({column: "float" for column in float_features})

In [9]:
# Encode hospital_reqd as an integer flag for data analysis:
# exactly "Y" -> 1, any other value -> 0

ace_data["hospital_reqd"] = ace_data["hospital_reqd"].eq("Y").astype("int")

In [10]:
# new ethnicity features

# simple_ethnicity: keep Pakistani / British as reported; every other value,
# including a missing ethnicity, becomes "other"
reported_ethnicity = ace_data.ethnicity.astype(object)
keep_as_reported = reported_ethnicity.isin(["Pakistani", "British"])
ace_data["simple_ethnicity"] = (
    reported_ethnicity.where(keep_as_reported, "other").astype("category")
)

# group_ethnicity: European / Asian / Other
# NOTE(review): spellings such as "white Asain" and "White Europeon" presumably
# mirror the labels in the raw data - confirm before "correcting" them.
asian = [
    "Indian", "Mixed Asian", "Pakistani", "white Asain", "British Asian",
    "Asian", "Sri Lankan", "Other Asian background", "Bangladeshi",
]

european = [
    "Slovak", "British", "Other white background", "Czech Republic",
    "White Europeon", "White British", "CommonWealth Russian",
    "Other European", "Mixed White",
]

# first matching condition wins; anything in neither list (or missing)
# falls through to "other"
ace_data["group_ethnicity"] = np.select(
    [ace_data.ethnicity.isin(asian), ace_data.ethnicity.isin(european)],
    ["asian", "european"],
    default="other",
)
ace_data["group_ethnicity"] = ace_data["group_ethnicity"].astype("category")

In [11]:
# re-check dtypes and non-null counts after the cleaning / recoding cells
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   id                499 non-null    int64   
 1   hospital_reqd     499 non-null    int64   
 2   referral_from     498 non-null    category
 3   age               499 non-null    int64   
 4   address           499 non-null    category
 5   ethnicity         497 non-null    category
 6   gender            499 non-null    category
 7   referral_date     499 non-null    category
 8   referral_time     499 non-null    category
 9   illness_severity  497 non-null    category
 10  activity_level    496 non-null    category
 11  gut_feeling       494 non-null    category
 12  ox_sat            489 non-null    float64 
 13  resp_rate         488 non-null    float64 
 14  heart_rate        490 non-null    float64 
 15  temp              438 non-null    float64 
 16  sepsis            499 non-

Add features from ace referral sheets:

Criteria (relevant to dataset) are:
* ox_sat > 94
* heart_rate:
    * 2-5 yrs 95-140
    * 5-12 yrs 80-120
    * \> 12 yrs 60-100
* resp_rate:
    * 2-5 yrs 25-30
    * 5-12 yrs 20-25
    * \> 12 yrs 15-20

Other qualitative criteria (possibly relevant later):
* auscultation - good air entry with some wheeze
* speech - able to complete sentences
* work of breathing - minimal / no recessions
* conscious level - normal

*could illness_severity / gut_feeling be proxies for the above?*

In [12]:
# ox_sat_low feature: "Y" when ox_sat is strictly below 94, otherwise "N".
# A missing ox_sat compares False, so it is labelled "N" as well.
# NOTE(review): the referral criterion listed in this notebook is ox_sat > 94,
# under which a reading of exactly 94 would not pass; here 94 is "N" - confirm.
below_threshold = ace_data["ox_sat"].lt(94)
ace_data["ox_sat_low"] = (
    below_threshold.map({True: "Y", False: "N"}).astype("category")
)

In [13]:
# age_range feature: "pre_school" = (2-5), "primary" = (5-12), "secondary" (12+)
# Every age below 5 (under-2s included) is labelled "pre_school".

# Boolean masks; kept at module level because later cells reuse them.
primary = ace_data.age.ge(5) & ace_data.age.lt(12)
secondary = ace_data.age.ge(12)

ace_data["age_range"] = pd.Series(
    np.select([primary, secondary], ["primary", "secondary"],
              default="pre_school"),
    index=ace_data.index,
).astype("category")

In [14]:
# ace_heart_rate_cat feature: as per ace referral sheets
#
# Each row is labelled "low" / "normal" / "high" by comparing heart_rate with
# the (low, high) limits for the row's age_range. Values exactly on a limit
# count as "normal"; a missing heart_rate compares False against both limits
# and therefore also stays "normal".

# (low, high) heart-rate limits per age_range label
ace_heart_rate_limits = {
    "pre_school": (95, 140),
    "primary": (80, 120),
    "secondary": (60, 100),
}

# Mask kept at module level because the resp-rate cell reuses it.
pre_school = ace_data.age_range == "pre_school"

ace_data["ace_heart_rate_cat"] = "normal"

# Masks are derived from the age_range column itself, so this cell does not
# rely on mask variables left behind by earlier cells.
for age_label, (hr_low, hr_high) in ace_heart_rate_limits.items():
    in_age_range = ace_data.age_range == age_label
    ace_data.loc[in_age_range & (ace_data.heart_rate > hr_high),
                 "ace_heart_rate_cat"] = "high"
    ace_data.loc[in_age_range & (ace_data.heart_rate < hr_low),
                 "ace_heart_rate_cat"] = "low"

ace_data["ace_heart_rate_cat"] = ace_data.ace_heart_rate_cat.astype("category")

In [15]:
# ace_resp_rate_cat feature: as per ace referral sheets
#
# Each row is labelled "low" / "normal" / "high" by comparing resp_rate with
# the (low, high) limits for the row's age_range. Values exactly on a limit
# count as "normal"; a missing resp_rate compares False against both limits
# and therefore also stays "normal".

# (low, high) respiratory-rate limits per age_range label
ace_resp_rate_limits = {
    "pre_school": (25, 30),
    "primary": (20, 25),
    "secondary": (15, 20),
}

ace_data["ace_resp_rate_cat"] = "normal"

# Masks are derived from the age_range column itself, so this cell does not
# rely on mask variables left behind by earlier cells.
for age_label, (rr_low, rr_high) in ace_resp_rate_limits.items():
    in_age_range = ace_data.age_range == age_label
    ace_data.loc[in_age_range & (ace_data.resp_rate > rr_high),
                 "ace_resp_rate_cat"] = "high"
    ace_data.loc[in_age_range & (ace_data.resp_rate < rr_low),
                 "ace_resp_rate_cat"] = "low"

ace_data["ace_resp_rate_cat"] = ace_data.ace_resp_rate_cat.astype("category")


In [16]:
# meets_ace_criteria feature: as per referral sheets
#
# "Y" only when all five checks below hold, otherwise "N".
# Rows with a missing gut_feeling or illness_severity pass those two checks,
# because a missing value is "not equal" to the excluded label.
# NOTE(review): only "Moderate" severity is excluded here; any other severity
# label passes - confirm that is intended.

meets_ace_criteria = (
    ace_data.ox_sat_low.eq("N")
    & ace_data.ace_heart_rate_cat.eq("normal")
    & ace_data.ace_resp_rate_cat.eq("normal")
    & ace_data.gut_feeling.ne("unwell")
    & ace_data.illness_severity.ne("Moderate")
)

ace_data["meets_ace_criteria"] = pd.Series(
    np.where(meets_ace_criteria, "Y", "N"),
    index=ace_data.index,
).astype("category")

In [17]:
# apls_heart_rate_cat feature: as per physiological parameters guide

def set_apls_heart_rate(age_range, hr_range):
    """Flag out-of-range heart rates for one age band.

    Writes into the ``apls_heart_rate_cat`` column of the module-level
    ``ace_data`` frame; returns nothing.

    age_range: (min_age, max_age) - rows with min_age <= age < max_age.
    hr_range:  (low, high) - heart_rate strictly below ``low`` is labelled
               "low", strictly above ``high`` is labelled "high". Rows inside
               the range, or with a missing heart_rate, are left untouched.
    """
    min_age, max_age = age_range[0], age_range[1]
    hr_low, hr_high = hr_range[0], hr_range[1]

    in_age_band = ace_data.age.ge(min_age) & ace_data.age.lt(max_age)
    below = in_age_band & ace_data.heart_rate.lt(hr_low)
    above = in_age_band & ace_data.heart_rate.gt(hr_high)

    ace_data.loc[below, "apls_heart_rate_cat"] = "low"
    ace_data.loc[above, "apls_heart_rate_cat"] = "high"


# everything starts as "normal"; the calls below overwrite the outliers
ace_data["apls_heart_rate_cat"] = "normal"

# ((min_age, max_age), (hr_low, hr_high)) per age band.
# No band covers age 18 and over, so those rows stay "normal".
apls_heart_rate_bands = [
    ((0, 2), (100, 160)),
    ((2, 3), (100, 150)),
    ((3, 4), (90, 140)),
    ((4, 6), (80, 135)),
    ((6, 8), (80, 130)),
    ((8, 12), (70, 120)),
    ((12, 14), (65, 115)),
    ((14, 18), (60, 110)),
]
for band_ages, band_limits in apls_heart_rate_bands:
    set_apls_heart_rate(age_range=band_ages, hr_range=band_limits)

ace_data["apls_heart_rate_cat"] = ace_data["apls_heart_rate_cat"].astype("category")

In [18]:
# apls_resp_rate_cat feature: as per physiological parameters guide

def set_apls_resp_rate(age_range, rr_range):
    """Flag out-of-range respiratory rates for one age band.

    Writes into the ``apls_resp_rate_cat`` column of the module-level
    ``ace_data`` frame; returns nothing.

    age_range: (min_age, max_age) - rows with min_age <= age < max_age.
    rr_range:  (low, high) - resp_rate strictly below ``low`` is labelled
               "low", strictly above ``high`` is labelled "high". Rows inside
               the range, or with a missing resp_rate, are left untouched.
    """
    min_age, max_age = age_range[0], age_range[1]
    rr_low, rr_high = rr_range[0], rr_range[1]

    in_age_band = ace_data.age.ge(min_age) & ace_data.age.lt(max_age)
    below = in_age_band & ace_data.resp_rate.lt(rr_low)
    above = in_age_band & ace_data.resp_rate.gt(rr_high)

    ace_data.loc[below, "apls_resp_rate_cat"] = "low"
    ace_data.loc[above, "apls_resp_rate_cat"] = "high"


# everything starts as "normal"; the calls below overwrite the outliers
ace_data["apls_resp_rate_cat"] = "normal"

# ((min_age, max_age), (rr_low, rr_high)) per age band.
# No band covers age 18 and over, so those rows stay "normal".
apls_resp_rate_bands = [
    ((0, 2), (20, 40)),
    ((2, 8), (20, 30)),
    ((8, 12), (15, 25)),
    ((12, 18), (12, 24)),
]
for band_ages, band_limits in apls_resp_rate_bands:
    set_apls_resp_rate(age_range=band_ages, rr_range=band_limits)

ace_data["apls_resp_rate_cat"] = ace_data["apls_resp_rate_cat"].astype("category")

In [19]:
# Persist the prepared frame twice: as a pickle and as a CSV copy.
# The CSV is written with the default options, so the row index is included.
prepped_pickle_path = "../data/ace_data_prepped.pkl"
prepped_csv_path = "../data/ace_data_prepped.csv"

ace_data.to_pickle(prepped_pickle_path)
ace_data.to_csv(prepped_csv_path)

In [20]:
# final check of the prepared frame: all columns, dtypes and non-null counts
ace_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   id                   499 non-null    int64   
 1   hospital_reqd        499 non-null    int64   
 2   referral_from        498 non-null    category
 3   age                  499 non-null    int64   
 4   address              499 non-null    category
 5   ethnicity            497 non-null    category
 6   gender               499 non-null    category
 7   referral_date        499 non-null    category
 8   referral_time        499 non-null    category
 9   illness_severity     497 non-null    category
 10  activity_level       496 non-null    category
 11  gut_feeling          494 non-null    category
 12  ox_sat               489 non-null    float64 
 13  resp_rate            488 non-null    float64 
 14  heart_rate           490 non-null    float64 
 15  temp                 43