# UK Biobank lipids phenotypes and covariates

In this notebook we review the available UK Biobank data for lipids and prepare our GWAS phenotypes and covariates.
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30690
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30760
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30780
  * https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=30870

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the UK Biobank Research Analysis Platform.
    <ul>
        <li>Use compute type 'Single Node' with sufficient CPU and RAM (e.g. start with 4 CPUs and 15 GB RAM, increase if needed).</li>
        <li>This notebook is pretty fast, but in general we recommend running it in the background via <kbd>dx run dxjupyterlab</kbd> to capture provenance.</li>
    </ul>
</div>

```
dx run dxjupyterlab \
    --instance-type=mem2_ssd1_v2_x8 \
    -icmd="papermill 5_ukb_lipids_phenotypes.ipynb 5_ukb_lipids_phenotypes_chr${CHROM}_$(date +%Y%m%d).ipynb -p CHROM ${CHROM}" \
    -iin=5_ukb_lipids_phenotypes.ipynb \
    --folder=outputs/r-prepare-phenotype/$(date +%Y%m%d)/
```
See also https://platform.dnanexus.com/app/dxjupyterlab

In [None]:
# Install any required package that is not already installed.
# Compare against the package names (the row names of installed.packages())
# rather than against every cell of the matrix it returns.
required_packages <- c('lubridate', 'skimr', 'tidyverse')
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
    install.packages(missing_packages)
}

In [None]:
library(lubridate)
library(skimr)
library(tidyverse)

In [None]:
## Plot setup.
theme_set(theme_bw(base_size = 16)) # Default theme for plots.

#' Builds the annotation row used to label a ggplot boxplot with its sample size.
#'
#' Passed as `fun.data` to `stat_summary()` by the plots in this notebook.
#'
#' @param df The values to summarise. Despite the name, it is used as a plain
#'   vector: `max()` and `length()` are applied to it directly.
#' @return A one-row data frame with column `y` (the maximum of `df`) and
#'   column `label` (the text 'N = ' followed by the number of values).
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

# Inputs
# Each path is copied to the working directory before being read (see below).
DRUG_MAPPING <- '/mnt/project/outputs/spark-pheno-retrieval/20211028/drug_mapping.csv'
PHENO_DATA <- '/mnt/project/outputs/spark-pheno-retrieval/20211028/lipids.csv'
WITHDRAWN_EIDS <- '/mnt/project/w7089_20210809.csv'
# Spaces and commas are backslash-escaped because this path is interpolated into a shell command.
EXOME_EIDS <- '/mnt/project/Bulk/Exome\\ sequences/Population\\ level\\ exome\\ OQFE\\ variants\\,\\ PLINK\\ format/ukb23155_c21_b0_v1.fam'
PCS <- '/mnt/project/outputs/plink-ld-pca/20211028/ukb_200kwes_lipids_plink_pca.eigenvec'

# Outputs

## Retrieve and load the data extract

<div class="alert alert-block alert-warning">
This section assumes the availability of CSVs created via notebook <kbd>4_ukb_lipids_phenotypes_retrieval.ipynb</kbd> and uploaded to the UKB RAP project.
</div>

In [None]:
# The CSV is well-formed and visible.
# Shell out to `cat` on the mounted path; intern = TRUE returns the command's
# output to R so it is displayed as the cell result.
system(str_glue('cat {DRUG_MAPPING}'), intern = TRUE)

In [None]:
# But this R command does not work.
# It is wrapped in try() so the failure is still shown in the output without
# aborting a non-interactive (papermill) run at this cell.
try(read_csv(DRUG_MAPPING))

Instead, copy the file to local disk first. Then read it.

In [None]:
# Stage the drug mapping in the working directory, then parse the local copy.
system(str_glue('cp {DRUG_MAPPING} .'), intern = TRUE)
statin_drugs <- DRUG_MAPPING %>%
    basename() %>%
    read_csv()

In [None]:
# Stage the withdrawal list locally. The file is read without a header row and
# its column is named 'eid'.
system(str_glue('cp {WITHDRAWN_EIDS} .'), intern = TRUE)
withdrawn_eids <- WITHDRAWN_EIDS %>%
    basename() %>%
    read_tsv(col_names = c('eid'))

In [None]:
# Stage the exome .fam file locally and read it as space-delimited with no
# header row. Only the first column is given a name here ('eid').
system(str_glue('cp {EXOME_EIDS} .'), intern = TRUE)
exome_eids <- EXOME_EIDS %>%
    basename() %>%
    read_delim(delim = ' ', col_names = c('eid'))

In [None]:
# Stage the PCA eigenvectors locally and read them as a TSV with a header row.
# This matches how the same file is re-read in the "Normalize lipids values"
# section, where the IID column is referenced by name.
system(str_glue('cp {PCS} .'), intern=TRUE)
pcs <- read_tsv(basename(PCS))

In [None]:
head(exome_eids)

In [None]:
# Stage the phenotype extract in the working directory, then parse the local copy.
system(str_glue('cp {PHENO_DATA} .'), intern = TRUE)
raw_pheno <- PHENO_DATA %>%
    basename() %>%
    read_csv()

In [None]:
spec(raw_pheno)

In [None]:
#skim(raw_pheno)

At this time, we are only interested in the first instance.

In [None]:
# Keep the participant id, sex, and every column whose name contains '_i0_'.
pheno <- select(raw_pheno, eid, p31_Sex, contains('_i0_'))

In [None]:
skim(pheno)

## Omit participants who have withdrawn from the study

In [None]:
nrow(pheno)

In [None]:
# Start the exclusion log with every participant on the withdrawal list.
excluded <- raw_pheno %>%
    filter(eid %in% withdrawn_eids$eid) %>%
    transmute(eid, exclusion_reason = 'withdrawn from study')

# Drop every logged participant from the working phenotype table.
pheno <- pheno %>%
    anti_join(excluded, by = 'eid')

In [None]:
nrow(pheno)

# Examine the lipids data 

In [None]:
table(pheno$p30692_i0_Cholesterol_aliquot, useNA = 'always')

In [None]:
# Count participants by cholesterol correction level and by whether the
# cholesterol value itself is missing. The grouping column is referenced by its
# bare name so it is taken from the piped data, as in the sibling cells below.
pheno %>%
    mutate(
        missing = is.na(p30690_i0_Cholesterol_mmol_L)
    ) %>%
    group_by(p30693_i0_Cholesterol_correction_level, missing) %>%
    summarize(count = n())

In [None]:
# Count participants by cholesterol correction reason and by whether the
# cholesterol value itself is missing.
pheno %>%
    group_by(
        p30694_i0_Cholesterol_correction_reason,
        missing = is.na(p30690_i0_Cholesterol_mmol_L)
    ) %>%
    summarize(count = n())

In [None]:
# Count participants by the recorded missing reason and by whether the
# cholesterol value itself is missing.
pheno %>%
    group_by(
        p30695_i0_Cholesterol_missing_reason,
        missing = is.na(p30690_i0_Cholesterol_mmol_L)
    ) %>%
    summarize(count = n())

## Plot the lipids data 

In [None]:
options(repr.plot.width = 16, repr.plot.height = 12)

# Boxplots of instance 0 total cholesterol (mmol/L) per 10-year age bin,
# each labelled with its sample size.
pheno %>%
    filter(!is.na(p21003_i0_Age_when_attended_assessment_centre_years)) %>%
    mutate(
        age_bin = cut_width(p21003_i0_Age_when_attended_assessment_centre_years, width = 10, boundary = 0)
    ) %>%
    ggplot(aes(x = age_bin, y = p30690_i0_Cholesterol_mmol_L)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 4,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    coord_flip() +
    labs(
        x = 'age',
        title = str_glue('Instance 0 measurement per person, by age'),
        caption = 'Source: UK Biobank data'
    )

In [None]:
#' Pivot one family of per-lipid columns into long format.
#'
#' @param df A data frame holding `eid`, the instance 0 age column, and columns
#'   named `p<field>_<instance>_<measurement>_<suffix>`.
#' @param suffix Column-name suffix selecting the family (e.g. 'mmol_L').
#' @param values_to Name of the value column in the result. Defaults to `suffix`.
#' @return A long data frame with columns `eid`, age, `instance`, `measurement`
#'   and the value column.
pivot_lipid_field <- function(df, suffix, values_to = suffix) {
    df %>%
        select(eid, p21003_i0_Age_when_attended_assessment_centre_years, ends_with(suffix)) %>%
        pivot_longer(
            cols = ends_with(suffix),
            names_to = c('instance', 'measurement'),
            names_pattern = str_c('p\\d+_(i\\d+)_(.*)_', suffix),
            values_to = values_to)
}

# Columns shared by every long table. They are passed to the joins explicitly
# rather than relying on an implicit natural join.
lipid_join_keys <- c('eid', 'p21003_i0_Age_when_attended_assessment_centre_years',
                     'instance', 'measurement')

# Per-measurement annotation families to attach, in output column order.
lipid_annotation_suffixes <- c('correction_reason', 'missing_reason', 'correction_level',
                               'aliquot', 'reportability')

lipid_values_long <- pheno %>%
    pivot_lipid_field('mmol_L') %>%
    # Convert to units used by AoU measurements.
    # Formula is from https://www.ncbi.nlm.nih.gov/books/NBK83505/
    mutate(
        mg_dl = case_when(
            measurement == 'HDL_cholesterol' ~ mmol_L * 38.67,
            measurement == 'Cholesterol' ~ mmol_L * 38.67,
            measurement == 'LDL_direct' ~ mmol_L * 38.67,
            measurement == 'Triglycerides' ~ mmol_L * 88.57,
            TRUE ~ NA_real_
        )
    )

# Attach each annotation family in turn, one inner join per family.
pheno_check_long <- reduce(
    lipid_annotation_suffixes,
    function(joined, suffix) {
        inner_join(joined, pivot_lipid_field(pheno, suffix), by = lipid_join_keys)
    },
    .init = lipid_values_long)

In [None]:
# Sanity-check the join: show the actual dimensions and the expected row count
# (four rows per participant), then fail if the row counts differ.
dim(pheno_check_long)
4 * nrow(pheno)
stopifnot(nrow(pheno_check_long) == 4 * nrow(pheno))

In [None]:
# Uncomment the line below to see row level data.
#head(pheno_check_long)

In [None]:
# Among rows that do have a value, tabulate the recorded missing reason.
pheno_check_long %>%
    filter(!is.na(mg_dl)) %>%
    count(missing_reason, name = 'count')

In [None]:
# Per-measurement row count, missing count, and summary statistics in mg/dL.
summarize(
    group_by(pheno_check_long, measurement),
    count = n(),
    missing = sum(is.na(mg_dl)),
    median = median(mg_dl, na.rm = TRUE),
    mean = mean(mg_dl, na.rm = TRUE),
    stddev = sd(mg_dl, na.rm = TRUE)
)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 18)

# Boxplots of each measurement (mg/dL) per 15-year age bin, one facet row per
# measurement, each box labelled with its sample size.
pheno_check_long %>%
    filter(!is.na(p21003_i0_Age_when_attended_assessment_centre_years)) %>%
    mutate(
        age_bin = cut_width(p21003_i0_Age_when_attended_assessment_centre_years, width = 15, boundary = 0)
    ) %>%
    ggplot(aes(x = age_bin, y = mg_dl)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 4,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
#    scale_y_log10(breaks = scales::pretty_breaks(n = 10)) +  # Uncomment if the data looks skewed.
    coord_flip() +
    facet_wrap(~ measurement, nrow = n_distinct(pheno_check_long$measurement), scales = 'free_x') +
    labs(
        x = 'age',
        title = str_glue('Instance 0 measurement per person, by age'),
        caption = 'Source: UK Biobank data'
    )

# Determine statin use 

In [None]:
# Peek at the first few medication entries that mention 'statin'.
pheno %>%
    filter(str_detect(p20003_i0_Treatment_medication_code, 'statin')) %>%
    select(p20003_i0_Treatment_medication_code) %>%
    head()


In [None]:
statin_drugs

In [None]:
# Build one alternation pattern from all drug names in the mapping, then display it.
statin_regex <- str_c(statin_drugs$drug_name, collapse = '|')
statin_regex

In [None]:
str_detect(pheno$p20003_i0_Treatment_medication_code[100], statin_regex)

In [None]:
# Flag statin use: 1 when the medication field matches any statin name, 0
# otherwise. A missing medication field does not match, so it is coded 0.
pheno <- pheno %>%
    mutate(
        statin_use = if_else(
            str_detect(p20003_i0_Treatment_medication_code, statin_regex) %in% TRUE,
            1,
            0
        )
    )

table(pheno$statin_use, useNA = 'always')

# Age covariate

In [None]:
table(pheno$p21003_i0_Age_when_attended_assessment_centre_years, useNA='always')

In [None]:
nrow(pheno)

In [None]:
# Log participants with no instance 0 age, appended after the existing exclusions.
excluded <- raw_pheno %>%
    filter(is.na(p21003_i0_Age_when_attended_assessment_centre_years)) %>%
    transmute(eid, exclusion_reason = 'null age') %>%
    bind_rows(excluded, .)

# Drop every logged participant, then derive the age and age-squared covariates.
pheno <- pheno %>%
    anti_join(excluded, by = 'eid') %>%
    mutate(
        age = p21003_i0_Age_when_attended_assessment_centre_years,
        age2 = age^2
    )

In [None]:
nrow(pheno)

In [None]:
table(pheno$age, useNA='always')

In [None]:
table(pheno$age2, useNA='always')

# Sex covariate

In [None]:
table(pheno$p31_Sex, useNA='always')

In [None]:
nrow(pheno)

In [None]:
# Log participants with no recorded sex, appended after the existing exclusions.
excluded <- raw_pheno %>%
    filter(is.na(p31_Sex)) %>%
    transmute(eid, exclusion_reason = 'null sex') %>%
    bind_rows(excluded, .)

# Drop every logged participant from the working phenotype table.
pheno <- pheno %>%
    anti_join(excluded, by = 'eid')

In [None]:
nrow(pheno)

In [None]:
table(pheno$p31_Sex, useNA='always')

## Lipid phenotypes

In [None]:
nrow(pheno)

Retain only participants with values for all four lipids.

In [None]:
# Log participants missing any of the four instance 0 lipid values, appended
# after the existing exclusions.
excluded <- raw_pheno %>%
    filter(!complete.cases(p30690_i0_Cholesterol_mmol_L,
                           p30780_i0_LDL_direct_mmol_L,
                           p30760_i0_HDL_cholesterol_mmol_L,
                           p30870_i0_Triglycerides_mmol_L)) %>%
    transmute(eid, exclusion_reason = 'null lipid') %>%
    bind_rows(excluded, .)

# Drop every logged participant from the working phenotype table.
pheno <- pheno %>%
    anti_join(excluded, by = 'eid')

In [None]:
nrow(pheno)

Convert from mmol/L to mg/dL. Formula is from https://www.ncbi.nlm.nih.gov/books/NBK83505/

In [None]:
# Add mg/dL versions of the four lipids: the three cholesterol measures are
# multiplied by 38.67 and triglycerides by 88.57.
pheno <- mutate(
    pheno,
    Total_cholesterol_mg_dl = 38.67 * p30690_i0_Cholesterol_mmol_L,
    LDL_mg_dl = 38.67 * p30780_i0_LDL_direct_mmol_L,
    HDL_cholesterol_mg_dl = 38.67 * p30760_i0_HDL_cholesterol_mmol_L,
    Triglycerides_mg_dl = 88.57 * p30870_i0_Triglycerides_mmol_L
)

1.	LDL adjustment based on TG/LDL values 
  1.	`If TG > 400, then LDL = NA`
  2.	`If LDL < 10, then LDL = NA`
2.	LDL and TC adjustment based on Statin (Lipid lowering medication)
  1.	`If STATIN is used, LDL_ADJ = LDL/0.7`
  2.	`If STATIN is used, TOTAL_ADJ = TC/0.8`
3.	TG adjustment
  1.	`TG_LOG = log(TG)`

In [None]:
# Apply the adjustments listed above:
#  - LDL is set to NA when TG > 400 or LDL < 10, otherwise divided by 0.7 for statin users.
#  - Total cholesterol is divided by 0.8 for statin users.
#  - Triglycerides are log-transformed.
pheno <- pheno %>%
    mutate(
        LDL_adj_mg_dl = case_when(
            Triglycerides_mg_dl > 400 | LDL_mg_dl < 10 ~ NA_real_,
            statin_use == 1 ~ LDL_mg_dl / 0.7,
            TRUE ~ LDL_mg_dl
        ),
        Total_cholesterol_adj_mg_dl = if_else(
            statin_use == 1,
            Total_cholesterol_mg_dl / 0.8,
            Total_cholesterol_mg_dl
        ),
        Triglycerides_log_mg_dl = log(Triglycerides_mg_dl)
    )

In [None]:
skim(pheno)

In [None]:
# Reshape every '*_mg_dl' column into long format: one row per participant and
# measurement, with the measurement name taken from the column-name prefix.
pheno_long <- pivot_longer(
    select(pheno, eid, age, age2, p31_Sex, ends_with('mg_dl')),
    cols = ends_with('mg_dl'),
    names_to = 'measurement',
    names_pattern = '(.*)_mg_dl',
    values_to = 'mg_dl'
)

In [None]:
# Per-measurement row count, missing count, and summary statistics.
summarize(
    group_by(pheno_long, measurement),
    count = n(),
    missing = sum(is.na(mg_dl)),
    median = median(mg_dl, na.rm = TRUE),
    mean = mean(mg_dl, na.rm = TRUE),
    stddev = sd(mg_dl, na.rm = TRUE)
)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 24)

# Boxplots of each prepared measurement per 15-year age bin, one facet row per
# measurement, each box labelled with its sample size.
pheno_long %>%
    mutate(age_bin = cut_width(age, width = 15, boundary = 0)) %>%
    ggplot(aes(x = age_bin, y = mg_dl)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 4,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
#    scale_y_log10(breaks = scales::pretty_breaks(n = 10)) +  # Uncomment if the data looks skewed.
    coord_flip() +
    facet_wrap(~ measurement, nrow = n_distinct(pheno_long$measurement), scales = 'free_x') +
    labs(
        x = 'age',
        title = str_glue('Instance 0 measurement per person, by age'),
        caption = 'Source: UK Biobank data'
    )

## Retain only participants with data in the 200k WES release

In [None]:
nrow(pheno)

In [None]:
# Keep only participants whose eid appears in the exome .fam file.
pheno <- pheno %>%
    filter(is.element(eid, exome_eids$eid))

In [None]:
nrow(pheno)

In [None]:
# Count participants per combination of exclusion reasons.
# Collapse to ONE row per participant first, so someone excluded for several
# reasons is counted once rather than once per reason.
excluded %>%
    group_by(eid) %>%
    summarize(
        exclusion_reasons = str_c(exclusion_reason, collapse = ', ')
    ) %>%
    count(exclusion_reasons, name = 'count')

## Normalize lipids values

4.	Calculation of residuals – residuals calculated by adjusting for covariates 
  1.	residual calculation Example for LDL: `tmp.ldl$LDL_ADJ.resid <- resid(lm(LDL_ADJ ~ sex+age+age2+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+PC11, data = tmp.ldl))`
5.	normalization Example for LDL: `tmp.ldl$LDL_ADJ.norm <- sd(tmp.ldl$LDL_ADJ)*scale(qnorm((rank(tmp.ldl$LDL_ADJ.resid,na.last="keep")-0.5)/length(tmp.ldl$LDL_ADJ.resid)))`

In [None]:
# TODO add PCs

In [None]:
# Stage the PCA eigenvectors locally and read them as a TSV with a header row.
system(str_glue('cp {PCS} .'), intern = TRUE)
pcs <- PCS %>%
    basename() %>%
    read_tsv()

In [None]:
head(pcs)

In [None]:
# Confirm that there are no EIDs in the PCs that are not also in the raw phenotypes (not including the negative EIDs for withdrawn samples).
# setdiff() yields the IIDs absent from raw_pheno; only the positive ones are
# counted, and the check fails if that count is not zero.
stopifnot(sum(setdiff(pcs$IID, raw_pheno$eid) > 0) == 0)

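Open question: the reference residual formula above includes PC11. Confirm whether eleven principal components (PC1–PC11) should be used as covariates.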

# Provenance 

In [None]:
devtools::session_info()