# Compare SQL phenotype to R phenotype

<div class="alert alert-block alert-success">
    <b>There are some logic changes that will affect <i>which of a person's measurements</i> is used.</b> And <i>which measurement</i> is used will affect the <b>age</b>, since it is the age at the time of measurement, and the <b>statin use indicator</b>, since the measurement must occur within the statin use interval to be true.
    <ol>
        <li>We now retain only measurements where <kbd>value_as_number IS NOT NULL AND value_as_number > 0</kbd>.</li>
        <li>Previously the R code was modifying LDL during the lipids adjustment. Now LDL is the original value from the measurements table. Adjustments only occur within LDL_adjusted.</li>
        <li>A single age and statin use indicator was previously chosen per person, even though those values could vary between a person's different lipid measurements. Now each measurement retains the age and statin use flag associated with the datetime of the measurement.</li>
        <li>When choosing the "most recent" measurement, the SQL code goes to greater lengths to make the result reproducible by sorting not only by measurement date, but also by measurement time, and measurement id in the case of ties.</li>
        <li>The SQL JOIN logic for measurements and statin use intervals uses the datetime instead of the date.</li>
    </ol>
 </div>

# Setup

In [None]:
# Install any of the plotting packages that are not already available.
# Compare against the package *names* only: `installed.packages()` returns a
# matrix whose other columns (versions, dependency fields, ...) could otherwise
# produce a false match with `%in%` and silently skip a needed install.
required_pkgs <- c('hexbin', 'hrbrthemes', 'viridis')
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

In [None]:
library(hexbin)
library(hrbrthemes)
library(tidyverse)

In [None]:
# Cloud storage (gs://) location of the R-derived phenotype CSV.
AOU_R_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/AOU_Full_Data.csv'
# Cloud storage (gs://) location of the SQL-derived phenotype CSV.
AOU_SQL_PHENO <- 'gs://fc-secure-440c511e-7fff-417c-9c86-f8ab51bfc618/data/phenotypes/20211005/AOU_Full_Data_iteration3.csv'

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Build the annotation row (position and text) used to label a ggplot boxplot.
#'
#' @param df The values to summarise; `max()` and `length()` are applied to it
#'   directly. NOTE(review): presumably a numeric vector of the plotted values
#'   rather than a data frame, despite the name -- confirm against callers.
#' @return A data frame with column `y` holding `max(df)` and column `label`
#'   holding the text 'N = ' followed by `length(df)`.
get_boxplot_fun_data <- function(df) {
  count_label <- paste0('N = ', length(df))
  data.frame(y = max(df), label = count_label)
}

# Load data

In [None]:
# Stream the R-derived phenotype CSV through `gsutil cat` and parse it.
# The strings "-Inf" and "NA" are both read as missing values.
aou_orig_pheno_wide <- read_csv(pipe(str_glue('gsutil cat {AOU_R_PHENO}')), na = c("-Inf", "NA"))

In [None]:
# Show which columns the R-derived phenotype table provides.
colnames(aou_orig_pheno_wide)

In [None]:
# Reshape the R-derived phenotypes from one row per person (one column per
# lipid) into one row per person and lipid type, so they can be joined to the
# SQL-derived table on person_id and lipid_type.
aou_orig_pheno_long <- aou_orig_pheno_wide %>%
    filter(!is.na(Cohort)) %>% # Samples without a value for Cohort do not have WGS.
    # Rename the adjusted columns while selecting so the lipid_type values
    # carry an underscore (e.g. LDL_adjusted).
    select(person_id, age, statin_use, LDL, HDL, TC, TG,
           LDL_adjusted = LDLadjusted, TC_adjusted = TCadjusted, TG_adjusted = TGadjusted) %>%
    # `cols` is spelled out in full; the abbreviated `col` relied on R's
    # partial argument matching.
    pivot_longer(cols = c(LDL, HDL, TC, TG, LDL_adjusted, TC_adjusted, TG_adjusted),
                 names_to = 'lipid_type',
                 values_to = 'value_as_number') %>%
    # Remove non-useful lipid values here as a post processing step.
    filter(!is.na(value_as_number)) %>%
    filter(value_as_number > 0)

In [None]:
# Stream the SQL-derived phenotype CSV through `gsutil cat` and parse it.
aou_new_pheno_long <- read_csv(pipe(str_glue('gsutil cat {AOU_SQL_PHENO}')))

In [None]:
# Show which columns the SQL-derived phenotype table provides.
colnames(aou_new_pheno_long)

# Compare data

In [None]:
# Row and column counts of each long table: R-derived first, then SQL-derived.
dim(aou_orig_pheno_long)
dim(aou_new_pheno_long)

<div class="alert alert-block alert-success">
We've retained more non-zero and non-null measurements.
</div>

In [None]:
# Number of distinct people in each table: R-derived first, then SQL-derived.
n_distinct(aou_orig_pheno_long$person_id)
n_distinct(aou_new_pheno_long$person_id)

<div class="alert alert-block alert-success">
We've also included more genomes.
</div>

In [None]:
# Pair each SQL-derived row with the R-derived row for the same person and
# lipid type. Non-key columns that share a name in both tables get a suffix
# naming the version they came from.
pheno_versions <- aou_new_pheno_long %>%
    inner_join(
        aou_orig_pheno_long,
        by = c('person_id', 'lipid_type'),
        suffix = c('_sql_phenotypes', '_r_phenotypes')
    )

dim(pheno_versions)

In [None]:
# Sanity check: the joined table must have exactly as many rows as the
# R-derived long table; execution stops here otherwise.
stopifnot(nrow(aou_orig_pheno_long) == nrow(pheno_versions))

In [None]:
# Show the joined columns, including the suffixed per-version ones.
colnames(pheno_versions)

In [None]:
# Count joined rows whose age differs by more than 2 between the two versions
# (in whatever unit `age` is recorded -- presumably years).
# The result is NA if any compared age is missing.
sum(abs(pheno_versions$age_sql_phenotypes - pheno_versions$age_r_phenotypes) > 2)

In [None]:
# Count joined rows where the statin use flag disagrees between the two versions.
# The result is NA if any compared flag is missing.
sum(pheno_versions$statin_use_sql_phenotypes != pheno_versions$statin_use_r_phenotypes)

<div class="alert alert-block alert-success">
The results have minor differences, but no major differences.
</div>

In [None]:
# Square canvas for the age-vs-age comparison.
options(repr.plot.width = 10, repr.plot.height = 10)

# Hex-binned density of SQL-derived age (x) against R-derived age (y).
pheno_versions %>%
    ggplot(aes(x = age_sql_phenotypes, y = age_r_phenotypes)) +
    geom_hex(bins = 100)

In [None]:
# Tall canvas: one facet row per lipid type.
options(repr.plot.width = 14, repr.plot.height = 30)

# Stack both phenotype versions, tagging each row with the version it came
# from, then draw one boxplot per version within each lipid type (log10 y axis).
rbind(
    aou_new_pheno_long %>%
        select(person_id, age, statin_use, lipid_type, value_as_number) %>%
        mutate(version = 'sql_phenotypes'),
    aou_orig_pheno_long %>%
        select(person_id, age, statin_use, lipid_type, value_as_number) %>%
        mutate(version = 'r_phenotypes')
) %>%
    ggplot(aes(x = version, y = value_as_number)) +
    geom_boxplot() +
    scale_y_log10() +
    facet_grid(rows = vars(lipid_type), scales = 'free')

In [None]:
# Tall canvas: one facet row per lipid type.
options(repr.plot.width = 14, repr.plot.height = 30)

# Stack both phenotype versions, tagging each row with the version it came
# from, then draw one violin per version within each lipid type (log10 y axis).
rbind(
    aou_new_pheno_long %>%
        select(person_id, age, statin_use, lipid_type, value_as_number) %>%
        mutate(version = 'sql_phenotypes'),
    aou_orig_pheno_long %>%
        select(person_id, age, statin_use, lipid_type, value_as_number) %>%
        mutate(version = 'r_phenotypes')
) %>%
    ggplot(aes(x = version, y = value_as_number)) +
    geom_violin() +
    scale_y_log10() +
    facet_grid(rows = vars(lipid_type), scales = 'free')

# Provenance

In [None]:
# Record the R session details (as reported by devtools) for provenance.
devtools::session_info()