In [1]:
rm(list = ls())
setwd("/mnt/c/git_repos/iORD_hepatitis/")
require(tidyverse)
require(data.table)
require(lubridate)
require(foreach)
require(doParallel)
require(funtimes)
require(tidyquant)

Loading required package: tidyverse

── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Load

### Load data

In [2]:
df <- fread("data/IORD_ASHep-UE_34_20230120_LIMS.csv") %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate),
           CollectionDateTime = as.Date(CollectionDateTime)) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01"))

age_df <- fread("data/IORD_ASHep-UE_34_20230120_Attendances.csv") %>%
    filter(AttendanceStartDate < as.Date("2023-01-01")) %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate),
           AttendanceEndDate = as.Date(AttendanceEndDate),
           LinkedBirthmonth = as.Date(LinkedBirthmonth)) %>%
    mutate(AttendanceMonth = cut(AttendanceStartDate, breaks = "month"),
           AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")) %>%
    mutate(AttendanceMonth = as.Date(AttendanceMonth),
           AttendanceTwoMonth = as.Date(AttendanceTwoMonth)) %>%
    mutate(age_upon_presentation = time_length(AttendanceStartDate -
                                               LinkedBirthmonth, "year")) %>%
    mutate(age_upon_presentation = floor(age_upon_presentation)) %>% 
    mutate(age_group = case_when(age_upon_presentation < 7 ~ "<7",
                                 age_upon_presentation >= 6 & age_upon_presentation <= 15 ~ "7-15",
                                 age_upon_presentation >= 16 ~ ">15")) %>%
    arrange(AttendanceStartDate)

month_df <- df %>%
    mutate(CollectionMonth = cut(CollectionDateTime, breaks = "month")) %>%
    mutate(CollectionTwoMonths = cut(CollectionDateTime, breaks = "2 months")) %>%
    mutate(CollectionMonth = as.Date(CollectionMonth),
           CollectionTwoMonths = as.Date(CollectionTwoMonths)) %>%
    left_join(age_df) %>%
    arrange(AttendanceStartDate)

admission_df <- fread("data/IORD_ASHep-UE_34_20230120_InpatientSpells.csv") %>%
    filter(AttendanceStartDate < as.Date("2023-01-01")) %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate))

test_df <- fread("data/IORD_ASHep-UE_34_20230120_LIMS.csv") %>%
    mutate(CollectionDateTime = as.Date(CollectionDateTime),
           AttendanceStartDate = as.Date(AttendanceStartDate)) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01"))

test_parsed <- test_df %>%
    distinct(ClusterID, AttendanceStartDate, TestName, .keep_all = T) %>%
    separate(RefRange, into = c("LLN", "ULN"), sep = "-") %>%
    mutate(ULN = as.numeric(ULN)) %>%
    select(-CollectionDateTime, -ReceiveDateTime) %>%
    mutate(Value = as.numeric(Value)) %>%
    mutate(Value = ifelse(is.na(Value), 0, Value)) %>%
    mutate(value_class = case_when(Value <= ULN ~ "Normal <= ULN",
                                 Value > ULN & Value <= 2 * ULN ~ "Mild (1-2x ULN)",
                                 Value > 2 * ULN & Value <= 5 * ULN  ~ "Moderate (2-5x ULN)",
                                 Value > 5 * ULN ~ "Severe (>5x ULN)")) %>%
    mutate(value_class = factor(value_class, c("Normal <= ULN", 
                                               "Mild (1-2x ULN)", 
                                               "Moderate (2-5x ULN)",
                                               "Severe (>5x ULN)"))) 

diag_df <- fread("data/IORD_ASHep-UE_34_20230120_DiagnosisCodes.csv") %>%
    left_join(fread("data/IORD_ASHep-UE_34_20230120_InpatientSpells.csv")) %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate)) %>%
    left_join(age_df) %>%
    mutate(age_upon_presentation = floor(age_upon_presentation)) %>% 
    mutate(age_group = case_when(age_upon_presentation < 7 ~ "<7",
                                 age_upon_presentation >= 6 & age_upon_presentation <= 15 ~ "7-15",
                                 age_upon_presentation >= 16 ~ ">15")) %>%
    mutate(AttendanceMonth = cut(AttendanceStartDate, breaks = "month"),
           AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")) %>%
    mutate(AttendanceMonth = as.Date(AttendanceMonth),
           AttendanceTwoMonth = as.Date(AttendanceTwoMonth)) %>%
    arrange(AttendanceMonth) %>%
    distinct(ClusterID, AttendanceStartDate, DiagCode)

[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`
“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 42 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 170136 of `y` matches multiple rows in `x`.
[1m[22m[36mℹ[39m In argument: `ULN = as.numeric(ULN)`.
[33m![39m NAs introduced by coercion”
[1m[22m[36mℹ[39m In argument: `Value = as.numeric(Value)`.
[33m![39m NAs introduced by coercion”
[1m[22mJoining with `by = join_by(ClusterID, SpellID)`
“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 148 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 131747 of `y` matches multiple rows in `x`.
[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`
“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 171 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 125173 of `y` matches multiple rows in `x`.


### Dataset range

In [3]:
total_number <- age_df %>%
    distinct(ClusterID, AttendanceStartDate) %>%
    nrow()

total_number

# Sex breakdown
count_df <- age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    group_by(age_group) %>%
    summarise(n_total = n())

age_df %>%
    distinct(ClusterID, AttendanceStartDate, LinkedSex) %>%
    group_by(LinkedSex) %>%
    summarise(n = n())

age_df %>%
    distinct(ClusterID, AttendanceStartDate, LinkedSex, .keep_all = T) %>%
    group_by(LinkedSex, age_group) %>%
    summarise(n = n()) %>%
    left_join(count_df) %>%
    mutate(prop = round(n / n_total * 100, 2))

# Age summary stats
age_df %>%
    distinct(ClusterID, AttendanceStartDate, age_upon_presentation) %>%
    summarise(median = median(age_upon_presentation),
              low_IQR = quantile(age_upon_presentation, prob = c(0.25)),
              high_IQR = quantile(age_upon_presentation, prob = c(0.75)))

age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    group_by(age_group) %>%
    summarise(median = median(age_upon_presentation),
              low_IQR = quantile(age_upon_presentation, prob = c(0.25)),
              high_IQR = quantile(age_upon_presentation, prob = c(0.75)))

# Ethnicity 
# age_df %>%
#     distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
#     group_by(EthnicGroupCode) %>%
#     summarise(n = n())

# IMD score
age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    mutate(IMDScore = as.numeric(IMDScore)) %>%
    filter(!is.na(IMDScore)) %>%
    group_by(age_group) %>%
    summarise(median_IMD2 = quantile(IMDScore, prob = c(0.2)),
             median_IMD4 = quantile(IMDScore, prob = c(0.4)),
             median_IMD6 = quantile(IMDScore, prob = c(0.6)),
             median_IMD8 = quantile(IMDScore, prob = c(0.8)))

# Attendance per month
age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    group_by(AttendanceMonth) %>%
    summarise(n_admit = n()) %>%
    summarise(median_admit = median(n_admit))

LinkedSex,n
<chr>,<int>
F,461632
I,16
M,441780
U,5


[1m[22m`summarise()` has grouped output by 'LinkedSex'. You can override using the `.groups` argument.
[1m[22mJoining with `by = join_by(age_group)`


LinkedSex,age_group,n,n_total,prop
<chr>,<chr>,<int>,<int>,<dbl>
F,7-15,39498,86217,45.81
F,<7,30446,70962,42.9
F,>15,391688,746254,52.49
I,7-15,4,86217,0.0
I,>15,12,746254,0.0
M,7-15,46715,86217,54.18
M,<7,40516,70962,57.1
M,>15,354549,746254,47.51
U,>15,5,746254,0.0


median,low_IQR,high_IQR
<dbl>,<dbl>,<dbl>
44,22,69


age_group,median,low_IQR,high_IQR
<chr>,<dbl>,<dbl>,<dbl>
7-15,11,9,13
<7,3,2,5
>15,53,32,74


[1m[22m[36mℹ[39m In argument: `IMDScore = as.numeric(IMDScore)`.
[33m![39m NAs introduced by coercion”


age_group,median_IMD2,median_IMD4,median_IMD6,median_IMD8
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
7-15,5.792,8.982,13.215,20.481
<7,5.904,9.156,13.619,21.4902
>15,6.061,9.193,13.444,20.241


median_admit
<dbl>
11023


In [4]:
age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    group_by(age_group) %>%
    summarise(n = n()) %>% 
    mutate(n / sum(n))

age_group,n,n/sum(n)
<chr>,<int>,<dbl>
7-15,86217,0.09543264
<7,70962,0.07854705
>15,746254,0.8260203


### Admissions

In [6]:
admit_attendance_df <- age_df %>%
    left_join(admission_df) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = T) %>%
    mutate(LinkedDeathdate = ifelse(LinkedDeathdate == "NULL", "2030-01-14 14:00:08.433", LinkedDeathdate),
           duration = as.Date(DischargeDate) - as.Date(AdmissionDate)) %>%
    mutate(AttendanceEndDate = ifelse(is.na(AttendanceEndDate), as.Date(DischargeDate), as.Date(AttendanceEndDate))) %>%
    mutate(LinkedDeathdate = as.Date(LinkedDeathdate)) %>%
    mutate(is_admitted = ifelse(is.na(AdmissionDate), F, T),
           is_icu = ifelse(SpellIncludesCriticalCareAdmission == 1, T, F),
           is_death = ifelse(LinkedDeathdate >= AttendanceStartDate &
                             LinkedDeathdate <= AttendanceEndDate, T, F))
#     mutate(is_icu = ifelse(is.na(is_icu), F, T))

# Admission duration
admit_attendance_df %>%
    filter(!is.na(AdmissionDate),
           DischargeDate != "NULL") %>%
    group_by(age_group) %>%
    summarise(median_duration = median(duration),
              duration_low = quantile(duration, probs = c(0.25)),
              duration_high = quantile(duration, probs = c(0.75)))
# Admission, ICU, death rate by age
admit_attendance_df %>%
    group_by(age_group) %>%
    summarise(admit_rate = round(sum(is_admitted, na.rm = T) / n() * 100, 1),
              icu_rate = round(sum(is_icu, na.rm = T) / n() * 100, 1),
              death_rate = round(sum(is_death, na.rm = T) / n() * 100, 2))
# Overall Admission, ICU, death rate
admit_attendance_df %>%
    summarise(admit_rate = round(sum(is_admitted, na.rm = T) / n() * 100, 1),
              icu_rate = round(sum(is_icu, na.rm = T) / n() * 100, 1),
              death_rate = round(sum(is_death, na.rm = T) / n() * 100, 2))

[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`
“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 18 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 140318 of `y` matches multiple rows in `x`.


age_group,median_duration,duration_low,duration_high
<chr>,<drtn>,<drtn>,<drtn>
7-15,1 days,1 days,2 days
<7,1 days,0 days,2 days
>15,2 days,1 days,7 days


age_group,admit_rate,icu_rate,death_rate
<chr>,<dbl>,<dbl>,<dbl>
7-15,13.2,0.3,0.01
<7,17.0,0.4,0.01
>15,34.3,1.5,0.41


admit_rate,icu_rate,death_rate
<dbl>,<dbl>,<dbl>
30.9,1.3,0.34


### Biomarker tests

In [7]:
test_names <- c("ALT", "AST", "ALK.PHOSPHATASE", "BILIRUBIN", "ALBUMIN", "GGT",
               "CRP", "WHITE CELLS")
tests <- admit_attendance_df %>%
    right_join(test_parsed) %>%
    filter(TestName %in% test_names)

[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`


In [8]:
test_summary <- admit_attendance_df %>%
    left_join(test_parsed) %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(n_tests = n_distinct(TestName, na.rm = T),
              n_ALT = sum(TestName == "ALT", na.rm = T),
              n_AST = sum(TestName == "AST", na.rm = T)) %>%
    ungroup()

# Prop. of patients tested
test_summary %>%
    summarise(sum(n_tests > 0) / nrow(test_summary))

n_pat_with_tests <- deframe(test_summary %>%
    summarise(sum(n_tests > 0)))

# Prop. with ALT test
test_summary %>%
    summarise(sum(n_ALT > 0) / n_pat_with_tests)

# Prop. with AST test
test_summary %>%
    summarise(sum(n_AST > 0) / n_pat_with_tests)

[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`
[1m[22m`summarise()` has grouped output by 'ClusterID'. You can override using the `.groups` argument.


sum(n_tests > 0)/nrow(test_summary)
<dbl>
0.5880912


sum(n_ALT > 0)/n_pat_with_tests
<dbl>
0.9034954


sum(n_AST > 0)/n_pat_with_tests
<dbl>
0.0117504


In [10]:
test_df %>% distinct(TestName, RefRange, Units)

TestName,RefRange,Units
<chr>,<chr>,<chr>
ALBUMIN,32-50,g/L
CRP,0-5,mg/L
EOSINOPHILS,0.0-0.5,x10*9/L
WHITE CELLS,4.0-11.0,x10*9/L
PLATELETS,150-400,x10*9/L
ALK.PHOSPHATASE,30-130,IU/L
ALT,10-45,IU/L
MONOCYTES,0.2-1.0,x10*9/L
NEUTROPHILS,2.0-7.0,x10*9/L
BILIRUBIN,0-21,umol/L


In [9]:
for (test in test_names) {
    stat_temp <- tests %>%
        mutate(age_group = factor(age_group, c("<7", "7-15", ">15"))) %>%
        filter(TestName == test) %>%
        group_by(age_group) %>%
        summarise(median_test = median(Value),
                  test_low = quantile(Value, probs = c(0.25)),
                  test_high = quantile(Value, probs = c(0.75)))
    print(test)
    print(stat_temp)
}

[1] "ALT"
[90m# A tibble: 3 × 4[39m
  age_group median_test test_low test_high
  [3m[90m<fct>[39m[23m           [3m[90m<dbl>[39m[23m    [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m
[90m1[39m <7                 15       12        21
[90m2[39m 7-15               14       11        20
[90m3[39m >15                19       13        30
[1] "AST"
[90m# A tibble: 3 × 4[39m
  age_group median_test test_low test_high
  [3m[90m<fct>[39m[23m           [3m[90m<dbl>[39m[23m    [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m
[90m1[39m <7                 41       31      144.
[90m2[39m 7-15               33       21      119 
[90m3[39m >15                41       22      110.
[1] "ALK.PHOSPHATASE"
[90m# A tibble: 3 × 4[39m
  age_group median_test test_low test_high
  [3m[90m<fct>[39m[23m           [3m[90m<dbl>[39m[23m    [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m
[90m1[39m <7                195      160       237
[90m2[39m 7-15 

In [11]:
for (test in test_names) {
    stat_temp <- tests %>%
        filter(adeno_pos) %>%
        mutate(age_group = factor(age_group, c("<7", "7-15", ">15"))) %>%
        filter(TestName == test) %>%
        group_by(age_group) %>%
        summarise(median_test = median(Value),
                  test_low = quantile(Value, probs = c(0.25)),
                  test_high = quantile(Value, probs = c(0.75)))
    print(test)
    print(stat_temp)
}

ERROR: [1m[33mError[39m in `filter()`:[22m
[1m[22m[36mℹ[39m In argument: `adeno_pos`.
[1mCaused by error:[22m
[33m![39m object 'adeno_pos' not found


### Diagnostic codes

In [None]:
age_df %>%
    filter(ClusterID == 2561819)

In [None]:
total_count <- deframe(diag_df %>% 
                       distinct(ClusterID, AttendanceStartDate) %>%
                       nrow())
diag_df %>%
    left_join(age_df) %>%
    filter(!is.na(age_group)) %>%
    mutate(age_group = factor(age_group, c("<7", "7-15", ">15"))) %>%
    group_by(DiagCode, age_group) %>%
    summarise(n = n()) 
