# Prepare a lipids phenotype

In this notebook, we use the UK Biobank data to prepare a lipids phenotype adjusted for statin use.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project). Specifically this is for the portion of the project that is the **pooled** analysis since this data will be combined with *All of Us* data in the next notebook, `aou_workbench_pooled_analyses/02_merge_lipids_phenotypes.ipynb`.

Also note that the UK Biobank data we read from BigQuery is not the raw data from UK Biobank. Instead, it has had some previous processing performed on it by the Natarajan lab, such as determining whether or not each UKB participant uses statin medication. If you want to see what the UKB phenotype wrangling looks like when starting from the raw UK Biobank data, see notebook `ukb_rap_siloed_analyses/5_ukb_lipids_gwas_phenotype.ipynb`.


<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook only takes a few minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install any required package that is not already installed.
# installed.packages() returns a character matrix, so membership is tested against
# its row names (the package names) rather than against every cell of the matrix.
# invisible() keeps the cell from printing the list of NULLs returned by lapply().
invisible(lapply(c('skimr'), function(pkg_name) {
    if (!pkg_name %in% rownames(installed.packages())) {
        install.packages(pkg_name)
    }
}))

In [None]:
library(bigrquery)
library(lubridate)
library(skimr)
library(tidyverse)

<div class="alert alert-block alert-info">
<b>Note:</b> The <a href='https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#with_clause'>WITH clause</a> allows you to logically sequence your code. It does this by allowing you to emulate temporary table names that are usable by your main SQL statement so that you can break your code into smaller and easier to understand queries that refer to one another.
</div>

In [None]:
# Assemble several named subqueries into one WITH-clause query, run it in BigQuery,
# and download the rows of the chosen subquery.
#
# Args:
#   subqueries: character vector of subquery definitions; they are joined with commas
#               and placed after a single WITH keyword.
#   final_tbl:  name of the subquery that the final SELECT * reads from.
# Returns:
#   The downloaded query results.
formulate_and_run_multipart_query <- function(subqueries, final_tbl) {
    with_clause <- str_c(subqueries, collapse = ',\n\n')
    final_select <- str_glue('\n\n\nSELECT * FROM {final_tbl}')
    query <- str_c('WITH\n', with_clause, final_select)
    # Echo the full SQL so it is captured in the notebook output.
    message(query)

    # The dataset and billing project are read from the environment.
    query_destination <- bq_dataset_query(
        Sys.getenv('WORKSPACE_CDR'),
        query,
        billing = Sys.getenv('GOOGLE_PROJECT')
    )
    results <- bq_table_download(query_destination)
    message(str_glue('Dimensions of result: num_rows={nrow(results)} num_cols={ncol(results)}'))
    results
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html
# Every value in this cell can be overridden when the notebook is run via papermill.

#---[ Inputs ]---
# Provided by UKB.
# Tab-delimited file in Cloud Storage; the columns eid_7089 and eid_31063 are used later in this notebook.
UKB_EID_TO_SAMPLE_ID <- 'gs://uk-biobank-sek-data-us-east1/sample-info/bridge_7089_31063.tsv'
# Provided by UKB.
# BigQuery table; participants listed here are excluded by the queries below.
WITHDRAWN_PARTICIPANTS <- 'uk-biobank-sek-data.exclusions.w7089_20220222'
# Created via notebook aou_workbench_pooled_analyses/matrix_table_creation/get_sample_list_from_matrix_table.ipynb
# Tab-delimited file in Cloud Storage; the column `s` holds the sample ids.
UKB_200K_EXOME_SAMPLE_IDS <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/ukb/ukb_200k_exome_sample_ids.tsv'
# A BigQuery table of raw UKB application 7089 data.
UKB_INSTANCED_DATA <- 'uk-biobank-sek-data.pivoted_phenotypes.instance_values_table_ukb9222_20210111'
# A BigQuery table of data prepared by the Natarajan lab.
NATARAJAN_DEMOGRAPHICS <- 'uk-biobank-sek-data.raw_phenotypes.lipids_pheno_raw'
# A BigQuery table of data prepared by the Natarajan lab.
NATARAJAN_LIPIDS <- 'uk-biobank-sek-data.phenotypes.lipids'
# Data from https://biobank.ndph.ox.ac.uk/ukb/schema.cgi?id=11
UKB_HIERARCHICAL_INT_CODINGS <- 'uk-biobank-sek-data.metadata.ehierint'

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP <- strftime(now(), '%Y%m%d')
# Folder under the workspace bucket (from the WORKSPACE_BUCKET environment variable)
# that the phenotype file is copied to.
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/ukb/pheno/{DATESTAMP}/')
# Name of the CSV file written locally before it is copied to DESTINATION.
UKB_PHENOTYPE_FILENAME <- 'ukb_lipids_phenotype.csv'

# UKB 200k exome sample ids

In [None]:
# Stream the exome sample-id TSV straight from Cloud Storage via `gsutil cat`.
ukb_exome_sample_ids <- str_glue('gsutil cat {UKB_200K_EXOME_SAMPLE_IDS}') %>%
    pipe() %>%
    read_tsv()

# Show the size and the first rows as a sanity check.
dim(ukb_exome_sample_ids)
head(ukb_exome_sample_ids)

# eid to sample id mapping

In [None]:
# Stream the tab-delimited eid-to-sample-id bridge file from Cloud Storage via `gsutil cat`.
ukb_eid_to_sample_id <- str_glue('gsutil cat {UKB_EID_TO_SAMPLE_ID}') %>%
    pipe() %>%
    read_delim(delim = '\t')

# Show the size and the first rows as a sanity check.
dim(ukb_eid_to_sample_id)
head(ukb_eid_to_sample_id)

## Limit the mapping to participants with genomic data

In [None]:
# Keep only the mapping rows whose eid_31063 appears in the exome sample id list.
ukb_eid_to_sample_id <- filter(
    ukb_eid_to_sample_id,
    eid_31063 %in% ukb_exome_sample_ids[['s']]
)

# Number of mapping rows remaining after the filter.
dim(ukb_eid_to_sample_id)

# Demographics

See also:
* https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=21000
* https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001

In [None]:
# SQL fragment defining two WITH-clause subqueries (already comma-separated):
#   base_ethnic_background_coding_tbl - value/meaning pairs for encoding_id 1001, plus the
#       top-level value each code rolls up to (parent_id when non-zero, else the code itself).
#   ethnic_background_coding_tbl - the same rows with the top-level meaning attached via a self join.
# Meant to be passed to formulate_and_run_multipart_query() ahead of any subquery that uses these tables.
ETHNIC_BACKGROUND_CODING_QUERY <- str_glue ('
--- Return the base level code meanings for ethnic background
base_ethnic_background_coding_tbl AS (
    SELECT
      value,
      meaning,
      if(parent_id != 0, parent_id, value) AS top_level_value
    FROM
        `{UKB_HIERARCHICAL_INT_CODINGS}`
    WHERE
        encoding_id = 1001
),

--- And get the meaning for the top_level_value too.
ethnic_background_coding_tbl AS (
    SELECT
        base.value,
        base.meaning,
        base.top_level_value,
        top_level.meaning AS top_level_meaning
    FROM base_ethnic_background_coding_tbl AS base
    LEFT JOIN base_ethnic_background_coding_tbl AS top_level
    ON base.top_level_value = top_level.value
)')

In [None]:
# SQL fragment defining the demographics_tbl subquery: one row per participant in the
# demographics table with age, sex, and the ethnic background code (instance 0) plus its
# decoded meanings. Participants listed in the withdrawn-participants table are excluded.
# It reads from ethnic_background_coding_tbl, so that subquery must be defined in the same
# WITH clause.
# All interpolated table names are backtick-quoted so that project ids containing dashes
# are always parsed as a single table path.
DEMOGRAPHICS_QUERY <- str_glue('
-- Return row level demographics data for UKB participants.
demographics_tbl AS (
    SELECT
        id AS eid,
        f_21000_Ethnic_background,
        meaning AS ethnic_background,
        top_level_meaning AS top_level_ethnic_background,
        age,
        sex AS sex_at_birth
    FROM
        `{NATARAJAN_DEMOGRAPHICS}`
    LEFT JOIN (
        SELECT
            eid,
            f_21000_Ethnic_background
        FROM
          `{UKB_INSTANCED_DATA}`
        WHERE
          instanceId = 0
    ) ON id = eid
    LEFT JOIN ethnic_background_coding_tbl
    ON f_21000_Ethnic_background = value
    WHERE
        id NOT IN (SELECT * FROM `{WITHDRAWN_PARTICIPANTS}`)
)')

In [None]:
# Run the coding subqueries followed by the demographics subquery, selecting from demographics_tbl.
demographics <- formulate_and_run_multipart_query(
    subqueries = c(ETHNIC_BACKGROUND_CODING_QUERY, DEMOGRAPHICS_QUERY),
    final_tbl = 'demographics_tbl'
)
# Preview the first rows and the overall size.
head(demographics)
dim(demographics)

In [None]:
# Summarize every column of the demographics data (types, missingness, distributions).
skim(demographics)

In [None]:
# Count participants per age value, with a separate count for missing values.
table(demographics[['age']], useNA = 'always')

In [None]:
# Count participants per sex_at_birth value, with a separate count for missing values.
table(demographics[['sex_at_birth']], useNA = 'always')

In [None]:
# Count participants per ethnic background code, with a separate count for missing values.
table(demographics[['f_21000_Ethnic_background']], useNA = 'always')

In [None]:
# Count participants per decoded ethnic background, with a separate count for missing values.
table(demographics[['ethnic_background']], useNA = 'always')

In [None]:
# Count participants per top-level ethnic background, with a separate count for missing values.
table(demographics[['top_level_ethnic_background']], useNA = 'always')

# Lipids and statin use data

In [None]:
# SQL fragment defining the lipids_tbl subquery: one row per participant with the four
# lipid values (renamed to LDL, HDL, TC, TG) and the statin0 flag (renamed to statin_use).
# Participants listed in the withdrawn-participants table are excluded, and rows are kept
# when at least one of the four lipid values is non-null.
# The withdrawn-participants table is backtick-quoted, like every other table reference,
# so that a project id containing dashes is always parsed as a single table path.
LIPIDS_QUERY <- str_glue('
lipids_tbl AS (
-- Return row level data for lipids and statin use from the initial study visit for UKB participants.
-- Limit to the participants with a non-null value for at least one of the four lipids.
    SELECT
        eid,
        ldl AS LDL,
        hdl AS HDL,
        chol AS TC,
        trig AS TG,
        statin0 AS statin_use
    FROM
        `{NATARAJAN_LIPIDS}`
    WHERE
        eid NOT IN (SELECT * FROM `{WITHDRAWN_PARTICIPANTS}`)
        -- At least one non-null lipid
        AND (ldl IS NOT NULL
          OR hdl IS NOT NULL
          OR chol IS NOT NULL
          OR trig IS NOT NULL)
)')

In [None]:
# Run the lipids subquery on its own; the result has one row per participant (wide format).
wide_lipids_and_statin_use <- formulate_and_run_multipart_query(
    subqueries = c(LIPIDS_QUERY),
    final_tbl = 'lipids_tbl'
)

In [None]:
# Overall size of the wide lipids table, then its first six rows.
dim(wide_lipids_and_statin_use)
head(wide_lipids_and_statin_use, n = 6)

In [None]:
# Summarize every column of the wide lipids data (types, missingness, distributions).
skim(wide_lipids_and_statin_use)

In [None]:
# Count participants per statin_use value, with a separate count for missing values.
table(wide_lipids_and_statin_use[['statin_use']], useNA = 'always')

## Pivot lipids measurements

In [None]:
# Reshape to long format: one row per participant per lipid, with the lipid name in
# `lipid_type` and its value in `mg_dl`.
lipids_and_statin_use <- pivot_longer(
    wide_lipids_and_statin_use,
    cols = c(LDL, HDL, TC, TG),
    names_to = 'lipid_type',
    values_to = 'mg_dl'
)

In [None]:
# Per-lipid sanity check: participant and row counts, number of missing values,
# and basic statistics computed over the non-missing values.
group_by(lipids_and_statin_use, lipid_type) %>%
    summarise(
        num_persons = n_distinct(eid),
        num_measures = n(),
        missing = sum(is.na(mg_dl)),
        median = median(mg_dl, na.rm = TRUE),
        mean = mean(mg_dl, na.rm = TRUE),
        stddev = sd(mg_dl, na.rm = TRUE)
    ) %>%
    arrange(desc(num_persons))

# Join all the data together

Inner join demographics with lipids and statin use data. Note that the demographics dataframe was filtered to remove participants withdrawn from the study.

Then inner join the eid to sample id map. Note that the mapping was filtered to just those participants with exome data.

In [None]:
# Keep only participants present in all three sources: demographics, lipids/statin use,
# and the eid-to-sample-id mapping (matched on its eid_7089 column).
lipid_phenotype <- demographics %>%
    inner_join(lipids_and_statin_use, by = 'eid') %>%
    inner_join(ukb_eid_to_sample_id, by = c('eid' = 'eid_7089'))

In [None]:
# Overall size of the joined phenotype table, then its first six rows.
dim(lipid_phenotype)
head(lipid_phenotype, n = 6)

# Add age^2 covariate

In [None]:
# Add the squared-age covariate alongside the existing `age` column.
lipid_phenotype <- mutate(lipid_phenotype, age2 = age^2)

# Adjust lipids for statin use

## Adjust LDL for statin use

In [None]:
# LDL adjustment based on TG and LDL values
# If TG > 400, then LDL = NA
# If LDL < 10, then LDL = NA
# If STATIN is used, LDL_ADJ = LDL/0.7
#
# The result has one row per LDL row, relabelled as lipid_type 'LDL_adjusted', and carries
# an extra TG column holding the participant's triglyceride value.

ldl_adjusted_phenotype <- lipid_phenotype %>%
    filter(lipid_type == 'LDL') %>%
    # Attach each participant's TG value. The join key is given explicitly so the join
    # cannot silently pick up other shared columns (and does not emit a "Joining, by" message).
    left_join(
        lipid_phenotype %>%
        filter(lipid_type == 'TG') %>%
        select(eid, TG = mg_dl),
        by = 'eid'
    ) %>%
    mutate(
        lipid_type = 'LDL_adjusted',
        # Blank out LDL values that fail the thresholds above. Rows where LDL or TG is
        # missing fall through to the default and keep their (possibly missing) LDL value.
        mg_dl = case_when(
            mg_dl < 10 ~ NA_real_,
            TG > 400 ~ NA_real_,
            TRUE ~ mg_dl
        )
    ) %>%
    mutate(
        # Scale up LDL for statin users; rows with any other (or missing) statin_use are unchanged.
        mg_dl = case_when(
            statin_use == 1 ~ mg_dl / 0.7,
            TRUE ~ mg_dl
        )
    )

In [None]:
# Append the LDL_adjusted rows to the phenotype table. The helper TG column is dropped
# first so both tables have the same columns.
lipid_phenotype <- rbind(lipid_phenotype, select(ldl_adjusted_phenotype, -TG))

## Adjust total cholesterol for statin use

In [None]:
# Total cholesterol adjustment:
#   statin users get TC / 0.8; all other rows keep their original TC value.
# The rows are relabelled as lipid_type 'TC_adjusted'.

tc_adjusted_phenotype <- filter(lipid_phenotype, lipid_type == 'TC') %>%
    mutate(
        # `missing = mg_dl` leaves the value unchanged when statin_use is NA.
        mg_dl = if_else(statin_use == 1, mg_dl / 0.8, mg_dl, missing = mg_dl),
        lipid_type = 'TC_adjusted'
    )

In [None]:
# Append the TC_adjusted rows to the phenotype table.
lipid_phenotype <- lipid_phenotype %>%
    rbind(tc_adjusted_phenotype)

## Adjust triglycerides

In [None]:
# Triglyceride adjustment: TG_adjusted is the natural log of TG.
# The rows are relabelled as lipid_type 'TG_adjusted'.

tg_adjusted_phenotype <- filter(lipid_phenotype, lipid_type == 'TG') %>%
    mutate(
        mg_dl = log(mg_dl),
        lipid_type = 'TG_adjusted'
    )

In [None]:
# Append the TG_adjusted rows to the phenotype table.
lipid_phenotype <- lipid_phenotype %>%
    rbind(tg_adjusted_phenotype)

## Check adjusted lipids values

In [None]:
# Per-lipid-type sanity check covering both raw and adjusted values: participant and row
# counts, number of missing values, and basic statistics over the non-missing values.
group_by(lipid_phenotype, lipid_type) %>%
    summarise(
        num_persons = n_distinct(eid),
        num_measures = n(),
        missing = sum(is.na(mg_dl)),
        median = median(mg_dl, na.rm = TRUE),
        mean = mean(mg_dl, na.rm = TRUE),
        stddev = sd(mg_dl, na.rm = TRUE)
    ) %>%
    arrange(desc(lipid_type))

# Write phenotypes to workspace bucket

In [None]:
# Number of distinct participants in the final phenotype table.
n_distinct(lipid_phenotype$eid)

In [None]:
# Write the phenotype dataframe to a CSV file named by UKB_PHENOTYPE_FILENAME
# (a relative path, so it lands in the current working directory).
write_csv(lipid_phenotype, UKB_PHENOTYPE_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
copy_output <- system(str_glue('gsutil cp {UKB_PHENOTYPE_FILENAME} {DESTINATION}'), intern = TRUE)
# With intern = TRUE a non-zero exit only raises a warning and records the exit code in
# the 'status' attribute, so fail loudly here rather than continue without the upload.
if (!is.null(attr(copy_output, 'status'))) {
    stop('gsutil cp failed with exit status ', attr(copy_output, 'status'), call. = FALSE)
}
copy_output

In [None]:
# Check the destination by listing its contents.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
system(str_glue('gsutil ls -lh {DESTINATION}'), intern = TRUE)

# Provenance

In [None]:
# Record the R version and loaded package versions used for this run.
devtools::session_info()