# UK Biobank lipids phenotypes and covariates

In this notebook we review and explore the available UK Biobank data for lipids phenotypes and covariates.

TODOs
* The GCP copy of the Natarajan data contains withdrawn participants
* check that the assays used here are comparable to the data from the AoU measurements
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30690
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30760
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30780
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30870
* use lower and upper bound cutoffs appropriate for each measurement
* also determine the relevant statin phenotypes so that we can correct for statin use:
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20003
* incorporate exclusion criteria

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on Terra. It runs fine on the default Cloud Environment. 
</div>

In [None]:
# Install any required packages that are not already present.
# Compare against the package *names* (the row names of the installed.packages()
# matrix) rather than against every cell of that matrix, and look the list up once.
required_pkgs <- c('skimr', 'tidyverse')
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

In [None]:
library(skimr)
library(tidyverse)

In [None]:
# Project id read from the GOOGLE_PROJECT environment variable; it is passed as
# the `billing` project when the BigQuery sources are opened below.
BILLING_PROJECT_ID <- Sys.getenv("GOOGLE_PROJECT")

# Names of the Haas phenotype tables: single-valued, per-instance, and arrayed.
HAAS_PHENO_TABLE <- "single_values_table_ukb9222_20210111"
HAAS_MULTI_INSTANCE_PHENO_TABLE <- "instance_values_table_ukb9222_20210111"
HAAS_ARRAY_PHENO_TABLE <- "array_values_table_ukb9222_20210111"

In [None]:
## Plot setup.
# Make the black-and-white theme, with a base font size of 16, the default for all plots.
ggplot2::theme_set(
    ggplot2::theme_bw(base_size = 16)
)

#' Returns a one-row data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' Used below as the `fun.data` argument of `stat_summary()`. Missing values are
#' ignored both for the position and for the count.
#'
#' @param df A numeric vector of values (despite the name, not a data frame).
#' @return A data frame with column `y` holding the maximum non-missing value
#'   (NA when there are none) and column `label` holding 'N = ' followed by the
#'   number of non-missing values.
get_boxplot_fun_data <- function(df) {
  observed <- df[!is.na(df)]
  # max() of an empty vector warns and returns -Inf, so report NA instead.
  y_pos <- if (length(observed) > 0) max(observed) else NA_real_
  data.frame(y = y_pos, label = paste0('N = ', length(observed)))
}

## Connect to the data

### Natarajan lipids data

In [None]:
# Open the raw phenotypes dataset in BigQuery and point a 'virtual dataframe'
# at its lipids_pheno_raw table.
natarajan_pheno_raw_tbl <- bigrquery::src_bigquery(
    project = 'uk-biobank-sek-data',
    dataset = 'raw_phenotypes',
    billing = BILLING_PROJECT_ID
) %>%
    dplyr::tbl('lipids_pheno_raw')

# Show the first few column names.
natarajan_pheno_raw_tbl %>%
    colnames() %>%
    head()

In [None]:
# Dimensions of the raw phenotypes table.
# NOTE(review): for a BigQuery-backed tbl the row count may come back as NA - confirm.
natarajan_pheno_raw_tbl %>% dim()

In [None]:
# List the raw-table columns whose names match the lipid-related pattern (case-insensitive).
natarajan_pheno_raw_tbl %>%
    colnames() %>%
    str_subset('(?i)age|ldl|hdl|tc|statin|choles|trigly')

In [None]:
# Open the phenotypes dataset in BigQuery and point a 'virtual dataframe'
# at its lipids_pheno_qced table.
natarajan_pheno_qced_tbl <- bigrquery::src_bigquery(
    project = 'uk-biobank-sek-data',
    dataset = 'phenotypes',
    billing = BILLING_PROJECT_ID
) %>%
    dplyr::tbl('lipids_pheno_qced')

# Show the first few column names.
natarajan_pheno_qced_tbl %>%
    colnames() %>%
    head()

In [None]:
# Dimensions of the QCed phenotypes table created in the previous cell
# (this cell previously re-reported the raw table by mistake).
dim(natarajan_pheno_qced_tbl)

In [None]:
# Open the phenotypes dataset in BigQuery and point a 'virtual dataframe'
# at its lipids table.
natarajan_lipids_tbl <- bigrquery::src_bigquery(
    project = 'uk-biobank-sek-data',
    dataset = 'phenotypes',
    billing = BILLING_PROJECT_ID
) %>%
    dplyr::tbl('lipids')

# Show the first few column names.
natarajan_lipids_tbl %>%
    colnames() %>%
    head()

In [None]:
# Dimensions of the lipids table.
natarajan_lipids_tbl %>% dim()

In [None]:
# List the lipids-table columns whose names match the lipid-related pattern (case-insensitive).
natarajan_lipids_tbl %>%
    colnames() %>%
    str_subset('(?i)age|ldl|hdl|tc|statin|choles|trigly')

### Haas albuminuria data

In [None]:
# Open a BigQuery source for the pivoted phenotypes dataset; the Haas tables
# below are all read through this one source.
haas_pheno_dbcon <- bigrquery::src_bigquery(
    billing = BILLING_PROJECT_ID,
    project = 'uk-biobank-sek-data',
    dataset = 'pivoted_phenotypes'
)

In [None]:
# 'Virtual dataframe' over the single-valued Haas phenotype table.
haas_pheno_tbl <- haas_pheno_dbcon %>%
    dplyr::tbl(HAAS_PHENO_TABLE)

# Show the first few column names.
haas_pheno_tbl %>%
    colnames() %>%
    head()

In [None]:
# List the single-valued columns whose names mention birth, medication, statin or cholesterol.
haas_pheno_tbl %>%
    colnames() %>%
    str_subset('(?i)birth|medication|statin|cholesterol')

In [None]:
# 'Virtual dataframe' over the per-instance Haas phenotype table.
haas_instanced_pheno_tbl <- haas_pheno_dbcon %>%
    dplyr::tbl(HAAS_MULTI_INSTANCE_PHENO_TABLE)

# Show the first few column names.
haas_instanced_pheno_tbl %>%
    colnames() %>%
    head()

In [None]:
# List the per-instance columns for field 21003 or mentioning medication, statin or cholesterol.
haas_instanced_pheno_tbl %>%
    colnames() %>%
    str_subset('(?i)f_21003_|medication|statin|cholesterol')

In [None]:
# 'Virtual dataframe' over the arrayed Haas phenotype table.
haas_arrayed_pheno_tbl <- haas_pheno_dbcon %>%
    dplyr::tbl(HAAS_ARRAY_PHENO_TABLE)

# Show the first few column names.
haas_arrayed_pheno_tbl %>%
    colnames() %>%
    head()

In [None]:
# List the arrayed columns for field 20003 or mentioning medication, statin or cholesterol.
haas_arrayed_pheno_tbl %>%
    colnames() %>%
    str_subset('(?i)medication|statin|cholesterol|f_20003_')

## Retrieve the data

In [None]:
# Show every column name of the lipids table before selecting from it.
natarajan_lipids_tbl %>% colnames()

In [None]:
# Combine the four lipid measurements with each participant's age at instance 0.
# NOTE(review): assumes the instance table has at most one instance-0 row per eid;
# otherwise the join would duplicate lipid rows - confirm.
pheno <- natarajan_lipids_tbl %>%
    select(eid, ldl, hdl, trig, chol) %>%
    collect() %>%    # <--- the collect() operation is what transfers the data from BigQuery to memory
    inner_join(  # <--- do an inner join so that we drop the withdrawn samples from the Natarajan data
        haas_instanced_pheno_tbl %>%
            select(eid, instanceId, f_21003_Age_when_attended_assessment_centre_years) %>%
            filter(instanceId == 0) %>%
            collect(),  # <--- the collect() operation is what transfers the data from BigQuery to memory
        by = 'eid'  # <--- join explicitly on the participant id instead of on whichever column names happen to match
    )

dim(pheno)

# Pivot and plot the data 

In [None]:
# Reshape to long form: the ldl, hdl, trig and chol columns become rows, with the
# column name in `measurement` and its value in `mg_dl`.
assay <- tidyr::pivot_longer(
    pheno,
    cols = c(ldl, hdl, trig, chol),
    names_to = 'measurement',
    values_to = 'mg_dl'
)

In [None]:
# Check the result of the pivot: each row of pheno should have produced exactly
# four rows in assay, one per pivoted column (ldl, hdl, trig, chol).
(dim(assay))
(nrow(pheno) * 4)
stopifnot(nrow(assay) == nrow(pheno) * 4)

In [None]:
# Preview the first rows of the long-form data.
assay %>% head()

In [None]:
# Per-measurement summary: row count, number of missing values, and the median,
# mean and standard deviation of the non-missing values.
dplyr::group_by(assay, measurement) %>%
    dplyr::summarise(
        count = dplyr::n(),
        missing = sum(is.na(mg_dl)),
        median = median(mg_dl, na.rm = TRUE),
        mean = mean(mg_dl, na.rm = TRUE),
        stddev = sd(mg_dl, na.rm = TRUE)
    )

In [None]:
# Set the plot size options used for rendering this figure.
options(repr.plot.width = 16, repr.plot.height = 18)

# Flipped boxplots of each measurement by 10-year-wide age bin, one facet row per
# measurement, with each box labelled by get_boxplot_fun_data().
ggplot(
    assay,
    aes(
        x = cut_width(f_21003_Age_when_attended_assessment_centre_years, width = 10, boundary = 0),
        y = mg_dl
    )
) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 4,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
#    scale_y_log10() +  # Uncomment if the data looks skewed.
    coord_flip() +
    facet_wrap(
        ~ measurement,
        nrow = length(unique(assay$measurement)),
        scales = 'free_x'
    ) +
    xlab('age') +
    labs(
        title = str_glue('Instance 0 measurement per person, by age'),
        caption = 'Source: UK Biobank data'
    )

# Provenance 

In [None]:
# Print the session details reported by devtools::session_info() as a provenance record.
devtools::session_info()