In [1]:
# rm(list = ls())
setwd("/mnt/c/git_repos/iORD_hepatitis/")
require(tidyverse)
require(data.table)
require(lubridate)
require(foreach)
require(doParallel)
require(randomcoloR)
require(ggpubr)


Loading required package: tidyverse

── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Load

### Load data

In [2]:
# Attendance records with the patient's age (whole years) and age band at
# presentation. Attendances starting on or after 2023-01-01 are dropped and
# rows are ordered by attendance start date.
age_df <- fread("data/IORD_ASHep-UE_34_20230120_Attendances.csv") %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate),
           AttendanceEndDate = as.Date(AttendanceEndDate),
           LinkedBirthmonth = as.Date(LinkedBirthmonth)) %>%
    # Age in years between birth month and attendance start, floored to
    # completed years.
    mutate(age_upon_presentation = time_length(AttendanceStartDate -
                                               LinkedBirthmonth, "year")) %>%
    mutate(age_upon_presentation = floor(age_upon_presentation)) %>%
    # Age bands. The middle band previously started at ">= 6", which
    # contradicted its "7-15" label; age 6 was only kept out of it because
    # case_when() takes the first matching branch. The bounds now match the
    # labels (results are unchanged because ages are whole numbers).
    mutate(age_group = case_when(age_upon_presentation < 7 ~ "<7",
                                 age_upon_presentation >= 7 & age_upon_presentation <= 15 ~ "7-15",
                                 age_upon_presentation >= 16 ~ ">15")) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01")) %>%
    arrange(AttendanceStartDate)

# Ordered (pattern, label) rules that collapse free-text organism names in
# BugName to a single genus/virus label. Patterns are case-insensitive regular
# expressions and are applied top to bottom, so a later rule also sees labels
# written by earlier rules.
# NOTE(review): most patterns are unanchored, so e.g. "Bacillus" and "Strep"
# also match longer names that merely contain them - confirm this is intended.
bug_name_rules <- tribble(
    ~pattern,                                 ~label,
    "SARS Coronavirus-2",                     "SARS-CoV-2",
    "Staphy",                                 "Staphylococcus",
    "Strep",                                  "Streptococcus",
    "Candida",                                "Candida",
    "influenza a|influenza b|flu a|flu b",    "Influenza",
    "Escherichia|E. coli",                    "Escherichia",
    "Enterococcus",                           "Enterococcus",
    "Klebsiella",                             "Klebsiella",
    "Acinetobacter",                          "Acinetobacter",
    "Bacillus",                               "Bacillus",
    "HAEMOPHILUS",                            "Haemophilus",
    "Parvimonas",                             "Parvimonas",
    "Burkholderia",                           "Burkholderia",
    "Micrococcus",                            "Micrococcus",
    "Proteus",                                "Proteus",
    "Lactococcus",                            "Lactococcus",
    "Rothia",                                 "Rothia",
    "Salmonella",                             "Salmonella",
    "Actinomyces",                            "Actinomyces",
    # Label previously carried a trailing space ("Anaerococcus ").
    "ANAEROCOCCUS",                           "Anaerococcus",
    "Pseudomonas",                            "Pseudomonas",
    "Corynebacterium",                        "Corynebacterium",
    "Yersinia",                               "Yersinia",
    "ASPERGILLUS",                            "Aspergillus",
    "TURICELLA",                              "Turicella",
    "serratia",                               "Serratia",
    "moraxella",                              "Moraxella",
    "bacteroides",                            "Bacteroides",
    "shigella",                               "Shigella",
    "enterobacter",                           "Enterobacter",
    "GORDONIA",                               "Gordonia",
    "rhinovirus|enterovirus",                 "Rhinovirus/Enterovirus",
    # Word boundary: "aerococcus" is a substring of "Anaerococcus", so the
    # unanchored pattern relabelled every Anaerococcus as Aerococcus.
    "\\baerococcus",                          "Aerococcus",
    "aeromonas",                              "Aeromonas",
    "Raoultella",                             "Raoultella",
    "Brevibacterium",                         "Brevibacterium",
    "Prevotella",                             "Prevotella",
    "Trichophyton",                           "Trichophyton",
    "Parainfluenzavirus|parainfluenza virus", "Parainfluenzavirus",
    "Clostridium",                            "Clostridium",
    "Neisseria",                              "Neisseria",
    "Fusobacterium",                          "Fusobacterium",
    "Atopobium",                              "Atopobium",
    "Aggregatibacter",                        "Aggregatibacter",
    "Sphingobacterium",                       "Sphingobacterium",
    "Bradyrhizobium",                         "Bradyrhizobium",
    "Ruminococcus",                           "Ruminococcus",
    "Alcaligenes",                            "Alcaligenes",
    "Paracoccus",                             "Paracoccus",
    "HAEMATOBACTER",                          "Haematobacter",
    "Microbacterium",                         "Microbacterium",
    "Mycobacterium",                          "Mycobacterium",
    "Citrobacter",                            "Citrobacter",
    "Propionibacterium",                      "Cutibacterium",
    "Respiratory Syncytial Virus",            "RSV",
    "Herpes|varicella",                       "Herpesvirus",
    "ACHROMOBACTER",                          "Achromobacter",
    "Kocuria",                                "Kocuria",
    "Roseomonas",                             "Roseomonas",
    "Morganella",                             "Morganella",
    "Listeria",                               "Listeria",
    # Was labelled "Cutibacterium", apparently copied from the
    # Propionibacterium rule above.
    "Chlamydia",                              "Chlamydia",
    "Stenotrophomonas",                       "Stenotrophomonas",
    "Sphingomonas",                           "Sphingomonas",
    "VEILLONELLA",                            "Veillonella",
    "Finegoldia",                             "Finegoldia",
    "Pasteurella",                            "Pasteurella",
    "Campylobacter",                          "Campylobacter",
    # Word boundary: the unanchored pattern relabelled "Bradyrhizobium"
    # (set by an earlier rule) as "Rhizobium".
    "\\brhizobium",                           "Rhizobium",
    "PEPTONIPHILUS",                          "Peptoniphilus",
    "Brevundimonas",                          "Brevundimonas",
    "GEMELLA",                                "Gemella",
    "cocci|colonies|aerobic|anaerobic|anaerobes|gram positive|gram negative|Fungal", "ambiguous"
)

# Apply the rules in order to a vector of organism names.
#   bug_name: vector of raw names; NA entries never match and stay NA.
#   rules:    data frame with character columns `pattern` and `label`.
# Returns `bug_name` with every element that matches a pattern replaced by
# that rule's label (later rules win when several match).
normalise_bug_name <- function(bug_name, rules = bug_name_rules) {
    for (i in seq_len(nrow(rules))) {
        is_hit <- grepl(rules$pattern[[i]], bug_name, ignore.case = TRUE)
        bug_name[is_hit] <- rules$label[[i]]
    }
    bug_name
}

# Microbiology results joined to the attendance/age table, with month and
# two-month attendance bins and a normalised BugName.
micro_df <- fread("data/IORD_ASHep-UE_34_20230120_Microbiology.csv") %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate)) %>%
    # Natural join on the columns shared with age_df.
    # NOTE(review): the captured output reports a many-to-many match for a
    # join on (ClusterID, AttendanceStartDate); if that is this join, rows are
    # duplicated here - confirm that downstream distinct() calls cover it.
    left_join(age_df) %>%
    mutate(AttendanceMonth = cut(AttendanceStartDate, breaks = "month"),
           AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")) %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate),
           AttendanceMonth = as.Date(AttendanceMonth),
           AttendanceTwoMonth = as.Date(AttendanceTwoMonth)) %>%
    filter(AttendanceMonth < as.Date("2023-01-01")) %>%
    rename(MicroTestName = TestName) %>%
    # Parse records
    # A QADN test reported as DETECTED is an adenovirus hit even without a name.
    mutate(BugName = ifelse(BatTestCode == "QADN" & ResultFull == "DETECTED", "Adenovirus", BugName)) %>%
    mutate(BugName = ifelse(grepl("Adenovirus", BugName, ignore.case = TRUE), "Adenovirus", BugName)) %>%
    # SARS-CoV-2 test codes count as positive unless the result text matches
    # one of the exclusion patterns.
    # NOTE(review): the " " alternative matches any result containing a space,
    # not only blank results - confirm this is intended.
    mutate(BugName = ifelse(BatTestCode %in% c('CV2G', 'CV2G2', 'CV2P', 'CV2Q', 'CV2V', 'RCV2Q', 'RCV2V') &
                            !grepl("not detected| |NONE|indeterminate", ResultFull, ignore.case = TRUE),
                            "SARS-CoV-2", BugName)) %>%
    mutate(BugName = normalise_bug_name(BugName, bug_name_rules))

# Laboratory (LIMS) results with collection and attendance timestamps reduced
# to dates, restricted to attendances starting before 2023-01-01.
test_df <- fread("data/IORD_ASHep-UE_34_20230120_LIMS.csv") %>%
    mutate(across(c(CollectionDateTime, AttendanceStartDate), as.Date)) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01"))

# One row per (patient, attendance, test) with the result graded against the
# upper limit of normal (ULN) taken from the reference range.
test_parsed <- test_df %>%
    distinct(ClusterID, AttendanceStartDate, TestName, .keep_all = TRUE) %>%
    # RefRange is split on "-" into lower (LLN) and upper (ULN) limits.
    separate(RefRange, into = c("LLN", "ULN"), sep = "-") %>%
    select(-CollectionDateTime, -ReceiveDateTime) %>%
    mutate(ULN = as.numeric(ULN),
           Value = as.numeric(Value),
           # Values that cannot be read as numbers are set to 0.
           Value = replace(Value, is.na(Value), 0),
           value_class = factor(
               case_when(Value <= ULN ~ "Normal <= ULN",
                         Value > ULN & Value <= 2 * ULN ~ "Mild (1-2x ULN)",
                         Value > 2 * ULN & Value <= 5 * ULN  ~ "Moderate (2-5x ULN)",
                         Value > 5 * ULN ~ "Severe (>5x ULN)"),
               levels = c("Normal <= ULN",
                          "Mild (1-2x ULN)",
                          "Moderate (2-5x ULN)",
                          "Severe (>5x ULN)")))

# Diagnosis codes joined to inpatient spells and to the attendance/age table,
# with age bands and month / two-month attendance bins. Attendances starting
# on or after 2023-01-01 are dropped.
diag_df <- fread("data/IORD_ASHep-UE_34_20230120_DiagnosisCodes.csv") %>%
    left_join(fread("data/IORD_ASHep-UE_34_20230120_InpatientSpells.csv")) %>%
    mutate(AttendanceStartDate = as.Date(AttendanceStartDate)) %>%
    left_join(age_df) %>%
    mutate(age_upon_presentation = floor(age_upon_presentation)) %>%
    # Age bands. The middle band previously started at ">= 6", which
    # contradicted its "7-15" label; age 6 was only kept out of it because
    # case_when() takes the first matching branch. The bounds now match the
    # labels (results are unchanged because ages are whole numbers).
    mutate(age_group = case_when(age_upon_presentation < 7 ~ "<7",
                                 age_upon_presentation >= 7 & age_upon_presentation <= 15 ~ "7-15",
                                 age_upon_presentation >= 16 ~ ">15")) %>%
    mutate(AttendanceMonth = cut(AttendanceStartDate, breaks = "month"),
           AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")) %>%
    mutate(AttendanceMonth = as.Date(AttendanceMonth),
           AttendanceTwoMonth = as.Date(AttendanceTwoMonth)) %>%
    arrange(AttendanceMonth) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01"))

# Inpatient spells with the attendance start reduced to a date, restricted to
# attendances starting before 2023-01-01.
spell_df <- filter(
    mutate(fread("data/IORD_ASHep-UE_34_20230120_InpatientSpells.csv"),
           AttendanceStartDate = as.Date(AttendanceStartDate)),
    AttendanceStartDate < as.Date("2023-01-01")
)

# Vital-sign events: first record per (patient, attendance timestamp, event
# name), then attendance start reduced to a date and cut off at 2023-01-01.
# De-duplication deliberately runs before the timestamp is truncated.
vital_df <- fread("data/IORD_ASHep-UE_34_20230120_VitalSigns.csv") %>%
    distinct(ClusterID, AttendanceStartDate, EventName, .keep_all = TRUE) %>%
    mutate(across(AttendanceStartDate, as.Date)) %>%
    filter(AttendanceStartDate < as.Date("2023-01-01"))

# all_months <- cut(seq(as.Date("2016-03-01"), as.Date("2022-12-01"), by = "1 month"), breaks = "1 month")
# all_months_df <- tibble(AttendanceMonth = rep(as.Date(all_months),3),
#                         age_group = c(rep("<7", length(all_months)),
#                                       rep("7-15", length(all_months)),
#                                       rep(">15", length(all_months))))

[1m[22mJoining with `by = join_by(ClusterID, AttendanceStartDate)`
“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 93 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 659231 of `y` matches multiple rows in `x`.


ERROR: Error in .shallow(x, cols = cols, retain.key = TRUE): attempt to set index 0/0 in SET_VECTOR_ELT


In [None]:
# Raw, unprocessed copy of the microbiology table (no joins or BugName
# parsing applied), kept alongside the processed micro_df.
micro_df2 <- fread("data/IORD_ASHep-UE_34_20230120_Microbiology.csv")

In [None]:
# Monthly number of attendances with at least one SARS-CoV-2 test record
# (one row per patient and attendance date), drawn as a point-and-line series.
micro_df %>%
    filter(BatTestCode %in% c('CV2G', 'CV2G2', 'CV2P', 'CV2Q', 'CV2V', 'RCV2Q', 'RCV2V')) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    group_by(AttendanceMonth) %>%
    tally() %>%
    ggplot(mapping = aes(x = AttendanceMonth, y = n)) +
    geom_point() +
    geom_line() +
    scale_x_date(date_labels = "%b-%y", date_breaks = "3 month") +
    xlab("Attendance Month") +
    ylab("No. of COVID test results")



In [None]:
# Distinct specimen / result text combinations among records labelled
# "Adenovirus".
# Optional extra filters kept from the original cell:
#     filter(MicroTestName == "ADENOVIRUS DNA") %>%
#     filter(AttendanceStartDate > as.Date("2020-04-01")) %>%
distinct(filter(micro_df, BugName == "Adenovirus"),
         SpecimenFull, ResultFull)

In [None]:
## For building viral hep regex
# micro_df %>%
#     filter(grepl("Hepatitis|Hep|HBV", MicroTestName, ignore.case = T)) %>%
# #     distinct(MicroTestName, ResultFull) %>%
#     mutate(ResultFull = str_trim(ResultFull)) %>%
#     filter(MicroTestName != "HEPATITIS C GENOTYPE") %>%
#     filter(!grepl("potential identifiers|see result|sent to reference lab|see comment|Not detected|sorry|insufficient|do not report|EQUIVOCAL|UNSUITABLE|UNCONFIRMED|Not tested|(CDR)|further testing required", ResultFull, ignore.case = T)) %>%
#     filter(!(ResultFull %in% c("", " "))) %>%
#     mutate(is_viral_hep = T) %>%
#     distinct(ClusterID, AttendanceStartDate, is_viral_hep)


In [None]:
# Proportion of attendances with any LIMS test, with an ALT test, and with an
# AST test. Attendances without any test row survive the left join with
# TestName = NA and therefore count as untested.
age_df %>%
    distinct(ClusterID, AttendanceStartDate) %>%
    left_join(distinct(test_df, ClusterID, AttendanceStartDate, TestName,
                       .keep_all = TRUE)) %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(test = n_distinct(TestName, na.rm = TRUE) > 0,
              ALT_test = sum(TestName == "ALT", na.rm = TRUE) > 0,
              AST_test = sum(TestName == "AST", na.rm = TRUE) > 0) %>%
    ungroup() %>%
    summarise(prop_test = sum(test) / n(),
              prop_ALT = sum(ALT_test) / n(),
              prop_AST = sum(AST_test) / n())


### Filter data

In [None]:
# ICD-10 codes treated as "unspecified agent"
unspecified <- c("K720", "K759", "K716", "K752",
                 "B199", "B179", "B178")

# Per attendance: only_unknown is TRUE when the attendance has at least one
# diagnosis code and every one of its codes is in `unspecified`.
diag_filt <- diag_df %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(only_unknown = any(DiagCode %in% unspecified) &
                  all(DiagCode %in% unspecified)) %>%
    ungroup()

# ALT results only: flag attendances whose ALT exceeds twice the upper limit
# of normal.
test_filt <- test_parsed %>%
    filter(TestName == "ALT") %>%
    transmute(ClusterID,
              AttendanceStartDate,
              is_acute_hep = Value > 2 * ULN)

# Per attendance: number of microbiology records whose BugName is not the
# literal string "NULL", and whether there is at least one.
# NOTE(review): sum() has no na.rm, so a missing BugName makes n_bug NA.
micro_filt <- micro_df %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(n_bug = sum(BugName != "NULL")) %>%
    mutate(micro_pos = n_bug > 0) %>%
    ungroup()

# Test codes that can report adenovirus: "QADN" plus every code seen on a
# record whose BugName mentions "adeno" (in order of first appearance).
adeno_tests <- c(
    "QADN",
    unique(micro_df$BatTestCode[grepl("adeno", micro_df$BugName,
                                      ignore.case = TRUE)])
)

# Attendances that had any adenovirus-capable test, with the number of
# adenovirus-labelled records and a positive/negative flag.
adeno_filt <- micro_df %>%
    filter(BatTestCode %in% adeno_tests) %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(n_adeno = sum(grepl("adenovirus", BugName, ignore.case = TRUE))) %>%
    mutate(adeno_pos = n_adeno > 0) %>%
    ungroup()

# Attendances with a hepatitis test whose (trimmed) result text is non-empty
# and matches none of the negative / uninformative phrases below.
# NOTE(review): "(CDR)" is a regex group, so it matches "CDR" anywhere, not
# only the literal text "(CDR)".
viral_hep_filt <- micro_df %>%
    filter(grepl("Hepatitis|Hep|HBV", MicroTestName, ignore.case = TRUE)) %>%
    mutate(ResultFull = str_trim(ResultFull)) %>%
    filter(MicroTestName != "HEPATITIS C GENOTYPE",
           !grepl("potential identifiers|see result|sent to reference lab|see comment|Not detected|sorry|insufficient|do not report|EQUIVOCAL|UNSUITABLE|UNCONFIRMED|Not tested|(CDR)|further testing required", ResultFull, ignore.case = TRUE),
           !(ResultFull %in% c("", " "))) %>%
    distinct(ClusterID, AttendanceStartDate) %>%
    mutate(is_viral_hep = TRUE)

# Per attendance: did the linked death date fall within the attendance
# (start to end, inclusive)? Also carries the age band.
death_filt <- age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    # A "NULL" death date is replaced by a far-future placeholder so that the
    # comparison below evaluates to FALSE rather than NA.
    mutate(death_date = as.Date(ifelse(LinkedDeathdate == "NULL",
                                       "3000-01-01 00:00:00.000",
                                       LinkedDeathdate)),
           death = death_date <= as.Date(AttendanceEndDate) &
               death_date >= as.Date(AttendanceStartDate)) %>%
    select(ClusterID, AttendanceStartDate, death, age_group)

# Per attendance: sum of SpellIncludesCriticalCareAdmission across spells and
# a flag for any critical-care admission.
# The result is left grouped by ClusterID, as summarise() only drops the last
# grouping level.
spell_filt <- spell_df %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(n = sum(SpellIncludesCriticalCareAdmission)) %>%
    mutate(is_icu = n > 0)

# First spell per attendance that has a discharge date, with the length of
# stay (discharge minus admission) in hours.
admit_filt <- mutate(
    filter(distinct(spell_df, AttendanceStartDate, ClusterID, .keep_all = TRUE),
           DischargeDate != "NULL"),
    duration = difftime(DischargeDate, AdmissionDate, units = "hours")
)

In [None]:
# Attendances that have diagnosis codes, with the number of "known" and
# "unspecified agent" codes each. Attendances without any diagnosis row get NA
# counts from the left join and are removed by the final filter.
test <- age_df %>%
    distinct(ClusterID, AttendanceStartDate) %>%
    left_join(transmute(diag_df,
                        ClusterID,
                        AttendanceStartDate,
                        known = as.numeric(!(DiagCode %in% unspecified)),
                        unknown = as.numeric(DiagCode %in% unspecified))) %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    summarise(n_known = sum(known),
              n_unknown = sum(unknown)) %>%
    filter(!is.na(n_known), !is.na(n_unknown))


In [None]:
# Ad-hoc calculation: 391 as a percentage of 903433.
# NOTE(review): both counts are hard-coded; record which tables they come from.
391 / 903433 * 100 

In [None]:
# Print the vector of "unspecified agent" ICD-10 codes for inspection.
unspecified

In [None]:
# Print the diagnosis table for inspection.
diag_df

In [None]:
# Number of diagnosis rows in each age band.
diag_df %>%
    group_by(age_group) %>%
    tally()

In [None]:
# Number of diagnosis rows per age band for the selected ICD-10 codes.
# Alternative code set kept from the original cell:
#     filter(DiagCode %in% c("B162", "B169", "B180", "B181")) %>%
diag_df %>%
    mutate(is_unknown = DiagCode %in% unspecified,
           age_group = factor(age_group, levels = c("<7", "7-15", ">15"))) %>%
    filter(DiagCode %in% c("B171", "B182")) %>%
    group_by(age_group) %>%
    tally()


In [None]:
# Count diagnosed attendances by age band and by whether all of their
# diagnosis codes are "unspecified agent" codes.
# The original cell piped from `diag`, which is not created anywhere in the
# visible code and therefore resolves to base::diag (a function), so
# group_by() failed. The table is rebuilt from diag_filt (only_unknown) with
# age_group taken from death_filt.
# NOTE(review): confirm this is the table the cell was meant to summarise.
diag_filt %>%
    left_join(death_filt, by = c("ClusterID", "AttendanceStartDate")) %>%
    group_by(age_group,
             only_unknown) %>%
    summarise(n = n())

In [None]:
# Total number of "known" and "unspecified agent" diagnosis codes.
diag_df %>%
    mutate(unknown = ifelse(DiagCode %in% unspecified, 1, 0),
           known = ifelse(!(DiagCode %in% unspecified), 1, 0)) %>%
    summarise(n_known = sum(known),
              n_unknown = sum(unknown))

# The same counts, broken down by group.
# The original statement ended the pipe with a bare `group_by` (no grouping
# columns and no trailing pipe), so the following summarise() was parsed as a
# separate statement and failed.
# NOTE(review): grouping by age_group is an assumption based on the
# neighbouring cells; replace with the intended grouping if different.
diag_df %>%
    mutate(unknown = ifelse(DiagCode %in% unspecified, 1, 0),
           known = ifelse(!(DiagCode %in% unspecified), 1, 0)) %>%
    group_by(age_group) %>%
    summarise(n_known = sum(known),
              n_unknown = sum(unknown))

In [None]:
# Counts of adenovirus-negative (FALSE) and -positive (TRUE) attendances among
# those with an adenovirus-capable test.
table(adeno_filt$adeno_pos)

In [None]:
# All distinct microbiology test names, and the subset containing "HB"
# (case-insensitive).
test_names <- unique(micro_df$MicroTestName)
HBV_tests <- grep("HB", test_names, ignore.case = TRUE, value = TRUE)

In [None]:
# Dodged bar chart of diagnosed attendances by "only unspecified codes" status,
# split by the ALT-based acute hepatitis flag (NA where no ALT result joined).
# Optional filter kept from the original cell:
#     filter(only_unknown) %>%
diag_filt %>%
    left_join(test_filt) %>%
    group_by(only_unknown, is_acute_hep) %>%
    tally() %>%
    ggplot(mapping = aes(x = only_unknown, y = n, fill = is_acute_hep)) +
    geom_col(position = "dodge")

In [None]:
# diag_filt %>%
#     left_join(micro_df) %>%
#     select(ClusterID, AttendanceStartDate, only_unknown, BatTestCode, MicroTestName, ResultFull, BugName) %>%
#     filter(!is.na(MicroTestName)) %>%
#     filter(!grepl("not detected|no microorganisms detected|no viruses detected|DO NOT REPORT", ResultFull, ignore.case = T) &
#               grepl("detected", ResultFull, ignore.case = T)) %>%
#     distinct(MicroTestName, BatTestCode) %>%
#     arrange(BatTestCode)

In [None]:
# Print the adenovirus-capable test codes for inspection.
adeno_tests

In [None]:
# diag_filt %>%
#     left_join(micro_df) %>%
#     select(ClusterID, AttendanceStartDate, only_unknown, BatTestCode, MicroTestName, ResultFull, BugName) %>%
#     filter(!is.na(MicroTestName)) %>%
#     filter(!(BatTestCode %in% c("EBNA"))) %>%
#     mutate(ResultFull = ifelse(BatTestCode %in% c('CV2G', 'CV2G2', 'CV2P', 'CV2Q', 'CV2V', 'RCV2Q', 'RCV2V') &
#                                    !grepl("not detected| |NONE|indeterminate", ResultFull, ignore.case = T),
#                                "detected", ResultFull)) %>%
#     filter(!grepl("not detected|no microorganisms detected|no viruses detected|DO NOT REPORT|NO BACTERIA OR VIRUSES DETECTED", ResultFull, ignore.case = T) &
#               grepl("detected", ResultFull, ignore.case = T)) %>%
#     mutate(parsed_results = case_when(grepl("CMV|HSV|EBV|HGPC|VZV", BatTestCode, ignore.case = T)|
#                                           (BatTestCode %in% c("VPCR", "OPCR") & 
#                                               grepl("Herpes Simplex Virus type 1 DETECTED", ResultFull, ignore.case = T))~ "Herpesvirus",
#                                       BatTestCode %in% c('CV2G', 'CV2G2', 'CV2P', 'CV2Q', 'CV2V', 'RCV2Q', 'RCV2V')|
#                                           grepl("Cov-2", MicroTestName, ignore.case = T) ~ "SARS-CoV-2",
#                                       grepl("HAV", BatTestCode) ~ "HAV",
#                                       grepl("HEV", BatTestCode) ~ "HEV",
#                                       grepl("HCV|HCAD", BatTestCode) ~ "HCV",
#                                       grepl("HBE|HBS|HBV|HBC", BatTestCode, ignore.case = T) ~ "HBV",
#                                       grepl("QADN", BatTestCode, ignore.case = T)|
#                                           (BatTestCode %in% adeno_tests$BatTestCode & 
#                                               grepl("adenovirus detected", ResultFull, ignore.case = T)) ~ "adenovirus"
#                                       )) %>%
#     mutate(parsed_results = ifelse(BugName == "adenovirus", "adenovirus", parsed_results)) %>%
#     filter(!is.na(parsed_results)) %>%
#     group_by(ClusterID, AttendanceStartDate, only_unknown) %>%
#     summarise(n_hepAtoE = sum(grepl("HAV|HBV|HCV|HEV", parsed_results)),
#               n_helper = sum(grepl("adenovirus|Herpesvirus", parsed_results)),
#               n_adeno = sum(grepl("adenovirus", parsed_results))) %>%
#     filter(n_helper > 0,
#            n_hepAtoE == 0,
#            only_unknown)

In [None]:
# Tested for adenovirus: one analysis table built on adeno_filt, enriched with
# diagnosis, ALT, death, ICU and admission information (natural left joins).
merged_df <- adeno_filt %>%
    left_join(diag_filt) %>%
    left_join(test_filt) %>%
    left_join(death_filt) %>%
    left_join(spell_filt) %>%
    left_join(admit_filt) %>%
    mutate(
        # Computed before only_unknown has its NAs filled in below, so it can
        # itself be NA.
        aHep_or_unknDiag = is_acute_hep | only_unknown,
        age_group = factor(age_group, levels = c("<7", "7-15", ">15")),
        # Attendance started between 2021-10-01 and 2022-08-31 (inclusive).
        is_outbreak = AttendanceStartDate >= as.Date("2021-10-01") &
            AttendanceStartDate <= as.Date("2022-08-31"),
        is_admitted = !is.na(AdmissionDate),
        # Attendances with no matching row in a joined table count as FALSE.
        only_unknown = coalesce(only_unknown, FALSE),
        is_icu = coalesce(is_icu, FALSE),
        death = coalesce(death, FALSE)
    )

# Whole population: one row per attendance in age_df, enriched with
# adenovirus, ICU, death, admission and diagnosis information.
merged_df2 <- age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    left_join(adeno_filt) %>%
    left_join(spell_filt) %>%
    left_join(death_filt) %>%
    left_join(admit_filt) %>%
    left_join(diag_filt) %>%
    mutate(
        age_group = factor(age_group, levels = c("<7", "7-15", ">15")),
        # Attendance started between 2021-10-01 and 2022-08-31 (inclusive).
        is_outbreak = AttendanceStartDate >= as.Date("2021-10-01") &
            AttendanceStartDate <= as.Date("2022-08-31"),
        # Attendances with no matching row in a joined table count as FALSE.
        only_unknown = coalesce(only_unknown, FALSE),
        is_admitted = !is.na(AdmissionDate),
        is_icu = coalesce(is_icu, FALSE),
        death = coalesce(death, FALSE)
    ) %>%
    # adeno_pos stays NA for attendances without an adenovirus test.
    select(ClusterID, AttendanceStartDate, age_group,
           is_outbreak, only_unknown, is_admitted,
           is_icu, death, duration,
           adeno_pos)

# Diagnosed only: one row per attendance in diag_filt, enriched with ICU,
# death and admission information.
# The original pipeline ended with a second left_join(diag_filt); the table
# already starts from diag_filt (one row per attendance), so that join matched
# every row to itself and added nothing. It has been removed.
merged_df3 <- diag_filt %>%
    left_join(spell_filt) %>%
    left_join(death_filt) %>%
    left_join(admit_filt) %>%
    mutate(age_group = factor(age_group, c("<7", "7-15", ">15")),
           # Attendance started between 2021-10-01 and 2022-08-31 (inclusive).
           is_outbreak = ifelse(AttendanceStartDate <= as.Date("2022-08-31") & AttendanceStartDate >= as.Date("2021-10-01"),
                                TRUE, FALSE),
           # Attendances with no matching row in a joined table count as FALSE.
           only_unknown = ifelse(is.na(only_unknown), FALSE, only_unknown),
           is_admitted = ifelse(is.na(AdmissionDate), FALSE, TRUE),
           is_icu = ifelse(is.na(is_icu), FALSE, is_icu),
           death = ifelse(is.na(death), FALSE, death)) %>%
    select(ClusterID, AttendanceStartDate, age_group, is_outbreak, only_unknown, is_admitted, is_icu, death, duration)

### Summary statistics

In [None]:
# IMD score for entire population: median per age band over attendances whose
# IMDScore can be read as a number.
age_df %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    mutate(IMDScore = as.numeric(IMDScore)) %>%
    filter(!is.na(IMDScore)) %>%
    group_by(age_group) %>%
    summarise(median_IMD = median(IMDScore))

In [None]:
# Admission stats for entire population, per age band: percentage of
# attendances with death / ICU / admission, and median and quartiles of the
# length of stay.
nrow(merged_df2)
merged_df2 %>%
    group_by(age_group) %>%
    summarise(
        # n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        # n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        # n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for patients tested for adenovirus: percentage of
# attendances with death / ICU / admission, and median and quartiles of the
# length of stay.
nrow(merged_df)
merged_df %>%
    summarise(
        # n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        # n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        # n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for patients untested for adenovirus (adeno_pos is NA in the
# whole-population table).
nrow(filter(merged_df2, is.na(adeno_pos)))
merged_df2 %>%
    filter(is.na(adeno_pos)) %>%
    summarise(
        # n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        # n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        # n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for adeno-pos patients: counts and percentages of death /
# ICU / admission, and median and quartiles of the length of stay.
nrow(filter(merged_df, adeno_pos))
merged_df %>%
    filter(adeno_pos) %>%
    summarise(
        n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Vital signs for adeno-pos patients: median and quartiles of each numeric
# vital-sign event, per event name and age band. Results that cannot be read
# as numbers are dropped.
adeno_filt %>%
    left_join(death_filt) %>%
    filter(adeno_pos) %>%
    left_join(vital_df) %>%
    mutate(EventResult = as.numeric(EventResult),
           age_group = factor(age_group, c("<7", "7-15", ">15"))) %>%
    filter(!is.na(EventResult)) %>%
    group_by(EventName, age_group) %>%
    # `probs` is spelled out in full; the original `prob =` only worked
    # through partial argument matching.
    summarise(median_value = median(EventResult),
              low_IQR = quantile(EventResult, probs = 0.25),
              high_IQR = quantile(EventResult, probs = 0.75))
    

In [None]:
# Admission stats for adeno-neg patients: counts and percentages of death /
# ICU / admission, and median and quartiles of the length of stay.
nrow(filter(merged_df, !adeno_pos))
merged_df %>%
    filter(!adeno_pos) %>%
    summarise(
        n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for all diagnosed patients: counts and percentages of death /
# ICU / admission, and median and quartiles of the length of stay.
nrow(merged_df3)
merged_df3 %>%
    summarise(
        n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for AHUA patients (attendances whose diagnosis codes are all
# "unspecified agent" codes), taken from the whole-population table.
nrow(filter(merged_df2, only_unknown))
merged_df2 %>%
    filter(only_unknown) %>%
    summarise(
        n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, digits = 2),
        n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, digits = 1),
        n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, digits = 1),
        median_duration = round(median(duration, na.rm = TRUE), digits = 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), digits = 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), digits = 0),
        total = n()
    )

In [None]:
# Admission stats for non-AHUA patients (rows of merged_df2 where
# only_unknown is FALSE). Only percentages are reported here; the raw
# counts are kept below as disabled lines.
merged_df2 %>%
    filter(!only_unknown) %>%
    nrow()

merged_df2 %>%
    filter(!only_unknown) %>%
    summarise(
        # n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, 2),
        # n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, 1),
        # n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, 1),
        median_duration = round(median(duration, na.rm = TRUE), 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), 0),
        total = n()
    )

In [None]:
# Admission stats for viral hep A-E patients (rows of merged_df3 where
# only_unknown is FALSE). Only percentages are reported here; the raw
# counts are kept below as disabled lines.
merged_df3 %>%
    filter(!only_unknown) %>%
    nrow()

merged_df3 %>%
    filter(!only_unknown) %>%
    summarise(
        # n_death = sum(death),
        prop_death = round(sum(death) / n() * 100, 2),
        # n_icu = sum(is_icu),
        prop_icu = round(sum(is_icu) / n() * 100, 1),
        # n_admitted = sum(is_admitted),
        prop_admitted = round(sum(is_admitted) / n() * 100, 1),
        median_duration = round(median(duration, na.rm = TRUE), 0),
        low_duration = round(quantile(duration, probs = 0.25, na.rm = TRUE), 0),
        high_duration = round(quantile(duration, probs = 0.75, na.rm = TRUE), 0),
        total = n()
    )

In [None]:
# Adenovirus infections: outcome summary per age group for rows with
# adeno_pos == TRUE. Proportions are left as unrounded fractions.
merged_df %>%
    filter(adeno_pos) %>%
    group_by(age_group) %>%
    summarise(
        n_death = sum(death),
        prop_death = sum(death) / n(),
        n_icu = sum(is_icu),
        # Each element is divided by the group size before summing.
        prop_icu = sum(is_icu / n()),
        n_admitted = sum(is_admitted),
        prop_admitted = sum(is_admitted / n()),
        median_duration = median(duration, na.rm = TRUE),
        low_duration = quantile(duration, probs = 0.25, na.rm = TRUE),
        high_duration = quantile(duration, probs = 0.75, na.rm = TRUE),
        total = n()
    )

### AHUA versus population

In [None]:
# Fisher's exact tests of only_unknown against the is_icu and death flags
# in merged_df2, run separately for rows inside and outside the outbreak
# window (is_outbreak TRUE / FALSE).
merged_df2_within <- filter(merged_df2, is_outbreak)
merged_df2_outside <- filter(merged_df2, !is_outbreak)

# ICU
cont <- table(merged_df2_within[["only_unknown"]],
              merged_df2_within[["is_icu"]])
fisher.test(cont)

cont <- table(merged_df2_outside[["only_unknown"]],
              merged_df2_outside[["is_icu"]])
fisher.test(cont)

# Admission (disabled)
# cont <- table(merged_df2_within$only_unknown, merged_df2_within$is_admitted)
# fisher.test(cont)

# cont <- table(merged_df2_outside$only_unknown, merged_df2_outside$is_admitted)
# fisher.test(cont)

# Death
cont <- table(merged_df2_within[["only_unknown"]],
              merged_df2_within[["death"]])
fisher.test(cont)

cont <- table(merged_df2_outside[["only_unknown"]],
              merged_df2_outside[["death"]])
fisher.test(cont)

In [None]:
# Rows of merged_df2 with a recorded duration, split by outbreak window.
merged_filt2_within <- merged_df2 %>%
    filter(!is.na(duration), is_outbreak)

merged_filt2_outside <- merged_df2 %>%
    filter(!is.na(duration), !is_outbreak)

# Boxplots of duration by only_unknown; stat_compare_means() (ggpubr)
# annotates each panel with a comparison p-value.
ggplot(merged_filt2_within,
       aes(x = only_unknown, y = duration, fill = only_unknown)) +
    geom_boxplot() +
    stat_compare_means()

ggplot(merged_filt2_outside,
       aes(x = only_unknown, y = duration, fill = only_unknown)) +
    geom_boxplot() +
    stat_compare_means()

### AHUA versus viral hep A-E

In [None]:
# Fisher's exact tests of only_unknown against the is_icu and death flags
# in merged_df3, run separately for rows inside and outside the outbreak
# window (is_outbreak TRUE / FALSE).
merged_df3_within <- filter(merged_df3, is_outbreak)
merged_df3_outside <- filter(merged_df3, !is_outbreak)

# ICU
cont <- table(merged_df3_within[["only_unknown"]],
              merged_df3_within[["is_icu"]])
fisher.test(cont)

cont <- table(merged_df3_outside[["only_unknown"]],
              merged_df3_outside[["is_icu"]])
fisher.test(cont)

# Admission (disabled)
# NOTE(review): these lines still reference the merged_df2 subsets;
# presumably they should use merged_df3_within / merged_df3_outside
# before being re-enabled. Confirm.
# cont <- table(merged_df2_within$only_unknown, merged_df2_within$is_admitted)
# fisher.test(cont)

# cont <- table(merged_df2_outside$only_unknown, merged_df2_outside$is_admitted)
# fisher.test(cont)

# Death
cont <- table(merged_df3_within[["only_unknown"]],
              merged_df3_within[["death"]])
fisher.test(cont)

cont <- table(merged_df3_outside[["only_unknown"]],
              merged_df3_outside[["death"]])
fisher.test(cont)

### Adenovirus infections

In [None]:
# Outcome proportions per (adeno_pos, is_outbreak) cell. The denominator is
# the number of distinct (ClusterID, AttendanceStartDate) pairs in the cell.
merged_stats <- merged_df %>%
    group_by(adeno_pos, is_outbreak) %>%
    summarise(
        prop_icu = sum(is_icu, na.rm = TRUE) /
            n_distinct(ClusterID, AttendanceStartDate),
        prop_death = sum(death, na.rm = TRUE) /
            n_distinct(ClusterID, AttendanceStartDate),
        prop_admit = sum(is_admitted) /
            n_distinct(ClusterID, AttendanceStartDate),
        median_duration = median(duration, na.rm = TRUE)
    )
merged_stats

# Dodged bars of the three proportions, coloured by adeno_pos, with one
# facet row per is_outbreak value.
merged_stats %>%
    select(-median_duration) %>%
    pivot_longer(
        !c("adeno_pos", "is_outbreak"),
        names_to = "stat",
        values_to = "value"
    ) %>%
    ggplot(aes(x = stat, y = value, fill = adeno_pos)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_grid(rows = vars(is_outbreak))

In [None]:
# Fisher's exact tests of adeno_pos against each outcome flag in merged_df.
# Each outcome is tested first outside, then inside the outbreak window.
# Rows where the outcome flag is missing are dropped beforehand.

# Admissions
print("ADMISSIONS")
test <- merged_df %>%
    filter(!is.na(is_admitted), !is_outbreak) %>%
    select(is_admitted, adeno_pos)
table(test)
fisher.test(table(test))

test <- merged_df %>%
    filter(!is.na(is_admitted), is_outbreak) %>%
    select(is_admitted, adeno_pos)
table(test)
fisher.test(table(test))

# ICU
print("ICU")
test <- merged_df %>%
    filter(!is.na(is_icu), !is_outbreak) %>%
    select(is_icu, adeno_pos)
table(test)
fisher.test(table(test))

test <- merged_df %>%
    filter(!is.na(is_icu), is_outbreak) %>%
    select(is_icu, adeno_pos)
table(test)
fisher.test(table(test))

# Death (only the p-value is shown for these two tests)
print("Death")
test <- merged_df %>%
    filter(!is.na(death), !is_outbreak) %>%
    select(death, adeno_pos)
table(test)
fisher.test(table(test))$p.value

test <- merged_df %>%
    filter(!is.na(death), is_outbreak) %>%
    select(death, adeno_pos)
table(test)
fisher.test(table(test))$p.value

In [None]:
# Duration by adenovirus status for outbreak-window rows of merged_df with
# a recorded duration; stat_compare_means() adds a comparison p-value.
merged_df %>%
    filter(!is.na(duration), is_outbreak) %>%
    ggplot(aes(x = adeno_pos, y = duration, fill = adeno_pos)) +
    # geom_density(alpha = 0.5) +
    geom_boxplot() +
    stat_compare_means()

### Diagnostic codes

In [None]:
# Attach test, death, spell and admission tables to the diagnosis table
# (natural joins on the shared columns), then derive analysis flags:
#   aHep_or_unknDiag - is_acute_hep or only_unknown
#   age_group        - factor ordered "<7", "7-15", ">15"
#   is_outbreak      - AttendanceStartDate within 2021-10-01 .. 2022-08-31
#   is_admitted      - an AdmissionDate is present
diag_merged <- diag_filt %>%
    left_join(test_filt) %>%
    left_join(death_filt) %>%
    left_join(spell_filt) %>%
    left_join(admit_filt) %>%
    mutate(
        aHep_or_unknDiag = is_acute_hep | only_unknown,
        age_group = factor(age_group, c("<7", "7-15", ">15")),
        is_outbreak = AttendanceStartDate <= as.Date("2022-08-31") &
            AttendanceStartDate >= as.Date("2021-10-01"),
        is_admitted = !is.na(AdmissionDate)
    )

# Per age group: only_unknown rows inside the outbreak window, total
# only_unknown rows, and the fraction of the former among the latter.
diag_merged %>%
    filter(only_unknown) %>%
    group_by(age_group) %>%
    summarise(
        n_outbreak = sum(is_outbreak),
        n_total = n(),
        perc = sum(is_outbreak) / n()
    )


In [None]:
# ICU and death proportions per (is_outbreak, only_unknown) cell of
# diag_merged; the denominator is the number of distinct
# (ClusterID, AttendanceStartDate) pairs in the cell.
diag_stats <- diag_merged %>%
    filter(!is.na(only_unknown)) %>%
    group_by(is_outbreak, only_unknown) %>%
    summarise(
        prop_icu = sum(is_icu, na.rm = TRUE) /
            n_distinct(ClusterID, AttendanceStartDate),
        prop_death = sum(death, na.rm = TRUE) /
            n_distinct(ClusterID, AttendanceStartDate),
        median_duration = median(duration, na.rm = TRUE)
    )

# Dodged bars coloured by is_outbreak, one facet row per only_unknown value.
# The secondary y axis is an identity transform used only to carry a label.
diag_stats %>%
    select(-median_duration) %>%
    pivot_longer(
        !c("only_unknown", "is_outbreak"),
        names_to = "stat",
        values_to = "value"
    ) %>%
    ggplot(aes(x = stat, y = value, fill = is_outbreak)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_grid(rows = vars(only_unknown)) +
    scale_y_continuous(
        sec.axis = sec_axis(trans = ~., name="Liver-related disease unknown cause")
    )


In [None]:
# Display the merged diagnosis table for inspection.
diag_merged


In [None]:
# Fisher's exact tests of only_unknown against the is_icu and death flags
# in diag_merged. Each outcome is tested first inside, then outside the
# outbreak window; rows with a missing outcome flag are dropped beforehand.

# ICU
print("ICU")
test <- diag_merged %>%
    filter(!is.na(is_icu), is_outbreak) %>%
    select(is_icu, only_unknown)
table(test)
fisher.test(table(test))

test <- diag_merged %>%
    filter(!is.na(is_icu), !is_outbreak) %>%
    select(is_icu, only_unknown)
table(test)
fisher.test(table(test))

# Death
print("Death")
test <- diag_merged %>%
    filter(!is.na(death), is_outbreak) %>%
    select(death, only_unknown)
table(test)
fisher.test(table(test))

test <- diag_merged %>%
    filter(!is.na(death), !is_outbreak) %>%
    select(death, only_unknown)
table(test)
fisher.test(table(test))

In [None]:
# Fisher's exact tests of only_unknown against the is_icu and death flags
# in merged_df. Each outcome is tested first outside, then inside the
# outbreak window. The ICU tests drop rows with missing only_unknown; the
# death tests drop rows with missing death.

# ICU
print("ICU")
test <- merged_df %>%
    filter(!is.na(only_unknown), !is_outbreak) %>%
    select(is_icu, only_unknown)
table(test)
fisher.test(table(test))

test <- merged_df %>%
    filter(!is.na(only_unknown), is_outbreak) %>%
    select(is_icu, only_unknown)
table(test)
fisher.test(table(test))


# Death
print("DEATH")
test <- merged_df %>%
    filter(!is.na(death), !is_outbreak) %>%
    select(death, only_unknown)
table(test)
fisher.test(table(test))

test <- merged_df %>%
    filter(!is.na(death), is_outbreak) %>%
    select(death, only_unknown)
table(test)
fisher.test(table(test))

In [None]:
# Duration by only_unknown for rows of diag_merged with a recorded
# duration: first inside, then outside the outbreak window.
# stat_compare_means() adds a comparison p-value to each plot.
diag_merged %>%
    filter(!is.na(duration), is_outbreak) %>%
    ggplot(aes(x = only_unknown, y = duration, fill = only_unknown)) +
    # geom_density(alpha = 0.5) +
    geom_boxplot() +
    stat_compare_means()

diag_merged %>%
    filter(!is.na(duration), !is_outbreak) %>%
    ggplot(aes(x = only_unknown, y = duration, fill = only_unknown)) +
    # geom_density(alpha = 0.5) +
    geom_boxplot() +
    stat_compare_means()

In [None]:
# Vital signs: median and quartiles of each EventName per age group for
# adenovirus-positive rows. EventResult is coerced to numeric and values
# that do not parse (NA after coercion) are dropped before summarising.
adeno_filt %>%
    left_join(death_filt) %>%
    filter(adeno_pos) %>%
    left_join(vital_df) %>%
    mutate(
        EventResult = as.numeric(EventResult),
        age_group = factor(age_group, c("<7", "7-15", ">15"))
    ) %>%
    filter(!is.na(EventResult)) %>%
    group_by(EventName, age_group) %>%
    summarise(
        median_value = median(EventResult),
        low_IQR = quantile(EventResult, probs = 0.25),
        high_IQR = quantile(EventResult, probs = 0.75)
    )
    

# SCRATCH

In [None]:
# death_filt %>% 
#     right_join(test_df) %>%
#     left_join(micro_df %>% select(ClusterID, AttendanceStartDate, BatTestCode, ResultFull, BugName)) %>% 
#     filter(TestName %in% c("ALT", "AST")) %>%
#     mutate(Value = as.numeric(Value)) %>%
#     filter(!is.na(Value)) %>%
#     filter(Value > 500) %>%
#     filter(AttendanceStartDate >= as.Date("2021-10-01")) %>%
#     filter(age_group != ">15") %>%
#     arrange(AttendanceStartDate) %>%
#     filter(BugName == "NULL") %>%
#     distinct(ResultFull)

In [None]:
# death_filt %>% 
#     right_join(test_df) %>%
#     left_join(micro_df %>% select(ClusterID, AttendanceStartDate, BatTestCode, ResultFull, BugName)) %>% 
#     filter(TestName %in% c("ALT", "AST")) %>%
#     mutate(Value = as.numeric(Value)) %>%
#     filter(!is.na(Value)) %>%
#     filter(Value > 500) %>%
#     filter(AttendanceStartDate >= as.Date("2021-10-01")) %>%
#     filter(age_group != ">15") %>%
#     arrange(AttendanceStartDate) %>%
#     filter(!grepl("Not detected|no significant|no growth|No microorganisms|none|NA|no serological|do not report|NO METHICILLIN RESISTANT|not seen", ResultFull, ignore.case = T)) %>%
#     distinct(ResultFull)

In [None]:
# death_filt %>% 
#     right_join(test_df) %>%
#     left_join(micro_df %>% select(ClusterID, AttendanceStartDate, BatTestCode, ResultFull, BugName)) %>% 
#     filter(TestName %in% c("ALT", "AST")) %>%
#     mutate(Value = as.numeric(Value)) %>%
#     filter(!is.na(Value)) %>%
#     filter(Value > 500) %>%
# #     filter(AttendanceStartDate >= as.Date("2021-10-01")) %>%
#     filter(age_group != ">15") %>%
#     mutate(has_bug = ifelse(grepl("NOT TESTED|test declined|unsuitable sample|Sample does not meet the criteria|Not detected|no significant|no growth|No microorganisms|none|NA|no serological|do not report|NO METHICILLIN RESISTANT|not seen", 
#                                   ResultFull, 
#                                   ignore.case = T),
#                             F, T))
# #     distinct(ClusterID, AttendanceStartDate)
#     group_by(AttendanceStartDate, ClusterID) %>%
#     summarise(n_bugs = sum(has_bug)) %>%
#     filter()

In [None]:
# One row per (ClusterID, AttendanceStartDate) with a logical is_covid flag:
# TRUE when at least one result for the listed BatTestCode values is not
# classed as negative by the pattern below.
#
# NOTE(review): the " " alternative in the pattern matches ANY ResultFull
# that contains a space, so every multi-word result is treated as negative.
# Presumably this was meant to catch blank results only - confirm against
# the distinct ResultFull values before relying on is_covid.
covid_filt <- micro_df %>%
    filter(BatTestCode %in% c('CV2G', 'CV2G2', 'CV2P', 'CV2Q', 'CV2V', 'RCV2Q', 'RCV2V')) %>%
    mutate(is_covid = !grepl("not detected| |NONE|indeterminate", ResultFull, ignore.case = TRUE)) %>%
    group_by(ClusterID, AttendanceStartDate) %>%
    # Drop the grouping here: otherwise the result stays grouped by ClusterID
    # and the cut() calls below are evaluated per patient, which makes the
    # "2 months" bins start at a different month for every patient.
    summarise(is_covid = sum(is_covid) > 0, .groups = "drop") %>%
    mutate(AttendanceMonth = cut(AttendanceStartDate, breaks = "month"),
           AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")) %>%
    mutate(AttendanceMonth = as.Date(AttendanceMonth))

covid_filt

### AHUA cases after accounting for COVID

In [None]:
# test_filt %>% 
#     left_join(viral_hep_filt) %>%
#     left_join(diag_filt) %>%
#     filter(!is.na(only_unknown))

In [None]:
# Monthly share of attendances with is_acute_hep and without is_viral_hep,
# among attendances that have a non-missing only_unknown value.
# One row per (ClusterID, AttendanceStartDate) is kept before counting.
# Note that the summarised column `n` holds a proportion, not a count.
test_filt %>%
    left_join(viral_hep_filt) %>%
    left_join(diag_filt) %>%
    filter(!is.na(only_unknown)) %>%
    # A missing viral-hepatitis flag is treated as FALSE.
    mutate(is_viral_hep = ifelse(is.na(is_viral_hep), FALSE, is_viral_hep)) %>%
    # filter(is_acute_hep) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    mutate(
        AttendanceMonth = as.Date(cut(AttendanceStartDate, breaks = "month")),
        AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")
    ) %>%
    group_by(AttendanceMonth) %>%
    summarise(n = sum(!is_viral_hep & is_acute_hep) / n()) %>%
    ggplot(aes(x = AttendanceMonth, y = n)) +
    geom_point() +
    geom_line()

In [None]:
# Monthly number of distinct (ClusterID, AttendanceStartDate) attendances
# flagged both is_acute_hep and is_viral_hep, restricted to rows whose
# only_unknown value is missing after the joins.
viral_hep <- test_filt %>%
    left_join(viral_hep_filt) %>%
    left_join(diag_filt) %>%
    filter(is.na(only_unknown)) %>%
    # A missing viral-hepatitis flag is treated as FALSE.
    mutate(is_viral_hep = ifelse(is.na(is_viral_hep), FALSE, is_viral_hep)) %>%
    filter(is_acute_hep, is_viral_hep) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    mutate(
        AttendanceMonth = as.Date(cut(AttendanceStartDate, breaks = "month")),
        AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")
    ) %>%
    group_by(AttendanceMonth) %>%
    summarise(n_viral = n())

# Monthly number of distinct (ClusterID, AttendanceStartDate) attendances
# flagged is_acute_hep but not is_viral_hep, restricted to rows whose
# only_unknown value is missing after the joins.
unknown_hep <- test_filt %>%
    left_join(viral_hep_filt) %>%
    left_join(diag_filt) %>%
    filter(is.na(only_unknown)) %>%
    # A missing viral-hepatitis flag is treated as FALSE.
    mutate(is_viral_hep = ifelse(is.na(is_viral_hep), FALSE, is_viral_hep)) %>%
    filter(is_acute_hep, !is_viral_hep) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    mutate(
        AttendanceMonth = as.Date(cut(AttendanceStartDate, breaks = "month")),
        AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")
    ) %>%
    group_by(AttendanceMonth) %>%
    summarise(n_unknown = n())

# Monthly ratio of n_unknown to n_viral.
# Months that appear in unknown_hep but not in viral_hep get NA for n_viral
# from the left join; those are counted as zero viral cases, so the ratio
# for such months is Inf.
unknown_hep %>%
    left_join(viral_hep, by = "AttendanceMonth") %>%
    mutate(n_viral = ifelse(is.na(n_viral), 0, n_viral)) %>%
    mutate(ratio = n_unknown / n_viral)

In [None]:
# Monthly proportion of attendances in covid_filt with is_covid == TRUE,
# drawn as points joined by a line on a fixed 2016-02 .. 2023-01 x axis.
covid_plt <- covid_filt %>%
    group_by(AttendanceMonth) %>%
    summarise(prop = sum(is_covid) / n()) %>%
    ggplot(aes(x = AttendanceMonth, y = prop)) +
    geom_point() +
    geom_line() +
    # Grey band: everything before 2020-03-01.
    annotate(
        "rect",
        xmin = as.Date(-Inf), xmax = as.Date("2020-03-01"),
        ymin = -Inf, ymax = Inf,
        fill = "grey83", alpha = 0.3
    ) +
    # Yellow band: 2020-03-01 onwards.
    annotate(
        "rect",
        xmin = as.Date("2020-03-01"), xmax = as.Date(Inf),
        ymin = -Inf, ymax = Inf,
        fill = "lightgoldenrod1", alpha = 0.3
    ) +
    # Dashed outline: 2021-10-01 to 2022-08-01.
    annotate(
        "rect",
        xmin = as.Date("2021-10-01"), xmax = as.Date("2022-08-01"),
        ymin = -Inf, ymax = Inf,
        fill = NA, color = "black", lty = "dashed", alpha = 0.3
    ) +
    scale_x_date(
        date_labels="%b-%y",
        date_breaks  = "2 month",
        limits = c(as.Date("2016-02-01"), as.Date("2023-01-01"))
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Monthly number of distinct acute-hepatitis attendances, one facet row per
# is_viral_hep value (free scales), restricted to rows whose only_unknown
# value is missing after the joins. Uses the same x axis and shaded bands
# as the COVID proportion plot so the two can be stacked.
hep_plt <- test_filt %>%
    left_join(viral_hep_filt) %>%
    left_join(diag_filt) %>%
    left_join(covid_filt) %>%
    filter(is.na(only_unknown)) %>%
    # Missing flags are treated as FALSE.
    mutate(
        is_viral_hep = ifelse(is.na(is_viral_hep), FALSE, is_viral_hep),
        is_covid = ifelse(is.na(is_covid), FALSE, is_covid)
    ) %>%
    filter(is_acute_hep) %>%
    distinct(ClusterID, AttendanceStartDate, .keep_all = TRUE) %>%
    mutate(
        AttendanceMonth = as.Date(cut(AttendanceStartDate, breaks = "month")),
        AttendanceTwoMonth = cut(AttendanceStartDate, breaks = "2 months")
    ) %>%
    group_by(AttendanceMonth, is_viral_hep) %>%
    summarise(n = n()) %>%
    ggplot(aes(x = AttendanceMonth, y = n)) +
    geom_point() +
    geom_line() +
    facet_grid(rows = vars(is_viral_hep), scales = "free") +
    # Grey band: everything before 2020-03-01.
    annotate(
        "rect",
        xmin = as.Date(-Inf), xmax = as.Date("2020-03-01"),
        ymin = -Inf, ymax = Inf,
        fill = "grey83", alpha = 0.3
    ) +
    # Yellow band: 2020-03-01 onwards.
    annotate(
        "rect",
        xmin = as.Date("2020-03-01"), xmax = as.Date(Inf),
        ymin = -Inf, ymax = Inf,
        fill = "lightgoldenrod1", alpha = 0.3
    ) +
    # Dashed outline: 2021-10-01 to 2022-08-01.
    annotate(
        "rect",
        xmin = as.Date("2021-10-01"), xmax = as.Date("2022-08-01"),
        ymin = -Inf, ymax = Inf,
        fill = NA, color = "black", lty = "dashed", alpha = 0.3
    ) +
    scale_x_date(
        date_labels="%b-%y",
        date_breaks  = "2 month",
        limits = c(as.Date("2016-02-01"), as.Date("2023-01-01"))
    ) +
    theme_bw()

# Stack the COVID proportion plot above the hepatitis count plot.
ggarrange(covid_plt, hep_plt, nrow = 2)

In [None]:
# Row counts per age group after attaching death_filt: first for the
# adenovirus table, then for the diagnosis table.
adeno_filt %>%
    left_join(death_filt) %>%
    group_by(age_group) %>%
    tally()

diag_filt %>%
    left_join(death_filt) %>%
    group_by(age_group) %>%
    tally()