# Prepare a lipids phenotype for a GWAS study

In this notebook we use the *All of Us* data to prepare a lipids phenotype for a GWAS study.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically this is for the portion of the project that is the **siloed** analysis.

Much of the code here is SQL to take advantage of the scalability afforded by BigQuery when you use it not only to retrieve data but also to perform some of the analysis using many of its [analytical functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators).

TODO update project description link to biorxiv paper after it is posted.

# Setup 

In [None]:
library(bigrquery)
library(lubridate)
library(readxl)
library(tidyverse)

<div class="alert alert-block alert-info">
<b>Note:</b> The <a href='https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#with_clause'>WITH clause</a> allows you to logically sequence your code. It does this by allowing you to emulate temporary table names that are usable by your main SQL statement so that you can break your code into smaller and easier to understand queries that refer to one another.
</div>

In [None]:
# Assemble several named subqueries into one WITH statement, run it, and download the result.
#
# subqueries: character vector of SQL fragments, each of the form `name AS ( ... )`.
# final_tbl:  name of the subquery whose rows are returned via `SELECT * FROM {final_tbl}`.
#
# Reads the dataset from env var WORKSPACE_CDR and the billing project from env var GOOGLE_PROJECT.
# Emits the full query text and the result dimensions as messages. Returns the downloaded table.
formulate_and_run_multipart_query <- function(subqueries, final_tbl) {
    with_clause <- str_c(subqueries, collapse = ',\n\n')
    final_select <- str_glue('\n\n\nSELECT * FROM {final_tbl}')
    query <- str_c('WITH\n', with_clause, final_select)
    message(query)

    query_job <- bq_dataset_query(Sys.getenv('WORKSPACE_CDR'),
                                  query,
                                  billing = Sys.getenv('GOOGLE_PROJECT'))
    # 64-bit integer columns are downloaded as integer64.
    results <- bq_table_download(query_job, bigint = 'integer64')

    message(str_glue('Dimensions of result: num_rows={nrow(results)} num_cols={ncol(results)}'))
    results
}

# Cohort

In [None]:
# SQL fragment defining `cohort_tbl`: a single `person_id` column selected from the `person` table.
# Other fragments in this notebook restrict their rows with `IN (SELECT * FROM cohort_tbl)`.
COHORT_QUERY <- '
-- This query is the entire AoU cohort.
cohort_tbl AS (
    SELECT
        person.PERSON_ID as person_id,
    FROM
        `person` person
)'

In [None]:
# Run the cohort fragment on its own and preview the person ids it returns.
genomic_cohort <- formulate_and_run_multipart_query(
    subqueries = c(COHORT_QUERY),
    final_tbl = 'cohort_tbl')
head(genomic_cohort)

## TEMPORARY - work around lack of alpha3 cohort

In [None]:
ALPHA3_ID_LIST <- 'gs://fc-secure-440c511e-7fff-417c-9c86-f8ab51bfc618/data/potential_alpha3_wgs.xlsx'

In [None]:
system(str_glue('gsutil cp {ALPHA3_ID_LIST} .'), intern = TRUE)

In [None]:
alpha3_ids <- read_xlsx(basename(ALPHA3_ID_LIST))

dim(alpha3_ids)

# Demographics

In [None]:
# SQL fragment defining `demographics_tbl`: birth datetime plus the concept names for race, gender,
# ethnicity and sex at birth, one row per `person` row whose id is in `cohort_tbl`.
# The fragment contains no glue placeholders; it must be combined with COHORT_QUERY when run.
DEMOGRAPHICS_QUERY <- str_glue('
-- This query represents dataset "Demographics for AoU WGS cohort" for domain "person" and was
-- generated for All of Us Controlled Tier Dataset v5 alpha and then further edited.
demographics_tbl AS (
    SELECT
        person.BIRTH_DATETIME as date_of_birth,
        person.PERSON_ID as person_id,
        p_race_concept.concept_name as race,
        p_gender_concept.concept_name as gender,
        p_ethnicity_concept.concept_name as ethnicity,
        p_sex_at_birth_concept.concept_name as sex_at_birth
    FROM
        `person` person
    LEFT JOIN `concept` p_race_concept on person.race_concept_id = p_race_concept.CONCEPT_ID
    LEFT JOIN `concept` p_gender_concept on person.gender_concept_id = p_gender_concept.CONCEPT_ID
    LEFT JOIN `concept` p_ethnicity_concept on person.ethnicity_concept_id = p_ethnicity_concept.CONCEPT_ID
    LEFT JOIN `concept` p_sex_at_birth_concept on person.sex_at_birth_concept_id = p_sex_at_birth_concept.CONCEPT_ID
    WHERE person.PERSON_ID IN (SELECT * FROM cohort_tbl)
)')

In [None]:
# Retrieve the demographics for the cohort and preview them.
demographics <- formulate_and_run_multipart_query(
    subqueries = c(COHORT_QUERY, DEMOGRAPHICS_QUERY),
    final_tbl = 'demographics_tbl')
head(demographics)

# Measurement data

<div class="alert alert-block alert-success">
    <b>Four steps:</b>
    <ol>
        <li>Look through all relevant measurements and figure out which ones we want to include.</li>
        <li>Use readable names to programmatically obtain the concept ids.</li>
        <li>Now, limit to the most recent measurement per person and check the data again to make sure we still want to include it.</li>
        <li>Finally group the measurements together into their higher level concepts (e.g., LDL, HDL, TC, TG) and again limit to only the most recent measurement per person.</li>
    </ol>
</div>


## Discover the lipids measurements we wish to use

In [None]:
# Regular expression for the measurement concept names we want to explore.
# The leading `(?i)` makes the whole pattern case insensitive.
MEASUREMENT_OF_INTEREST <- '(?i)cholesterol|hdl|ldl|triglyceride'

# SQL fragment defining `labs_of_interest_tbl`: the distinct (measurement concept, unit concept)
# pairs, with their names, found in `measurement` whose measurement concept name matches
# MEASUREMENT_OF_INTEREST. The pattern is interpolated as-is; it already carries the `(?i)` flag,
# so the flag is not repeated inside the SQL.
LABS_OF_INTEREST_QUERY <- str_glue('
-- Use a case insensitive string to search the measurement concept names of those
-- measurements we do have in the measurements table.
labs_of_interest_tbl AS (
    SELECT
        measurement_concept_id,
        measurement_concept.concept_name AS measurement_name,
        unit_concept_id,
        unit_concept.concept_name AS unit_name
    FROM
        `measurement`
    LEFT JOIN `concept` AS measurement_concept ON measurement_concept.concept_id = measurement_concept_id
    LEFT JOIN `concept` AS unit_concept ON unit_concept.concept_id = unit_concept_id
    WHERE
        REGEXP_CONTAINS(measurement_concept.concept_name, r"{MEASUREMENT_OF_INTEREST}")
    GROUP BY
        measurement_concept_id,
        unit_concept_id,
        measurement_concept.concept_name,
        unit_concept.concept_name
)')

In [None]:
# SQL fragment defining `measurements_of_interest_summary_tbl`: per measurement concept, unit and
# source (PPI vs EHR, derived from `measurement_ext.src_id`), counts and summary statistics of
# `value_as_number` for cohort members. Depends on `cohort_tbl` and `labs_of_interest_tbl`.
MEASUREMENTS_OF_INTEREST_SUMMARY_QUERY <- '
-- Summarize the information about each measurement concept of interest that the prior query identified.
measurements_of_interest_summary_tbl AS (
    SELECT
        measurement_name AS measurement,
        IFNULL(unit_name, "NA") AS unit,
        COUNT(1) AS N,
        COUNTIF(value_as_number IS NULL
            AND (value_as_concept_id IS NULL
                OR value_as_concept_id = 0)) AS missing,
        MIN(value_as_number) AS min,
        MAX(value_as_number) AS max,
        AVG(value_as_number) AS avg,
        STDDEV(value_as_number) AS stddev,
        APPROX_QUANTILES(value_as_number, 4) AS quantiles,
        COUNTIF(value_as_number IS NOT NULL) AS num_numeric_values,
        COUNTIF(value_as_concept_id IS NOT NULL
                AND value_as_concept_id != 0) AS num_concept_values,
        COUNTIF(operator_concept_id IS NOT NULL) AS num_operators,
        IF(src_id = "PPI/PM", "PPI", "EHR") AS measurement_source,
        measurement_concept_id,
        unit_concept_id
    FROM
        `measurement`
    INNER JOIN
        labs_of_interest_tbl USING(measurement_concept_id, unit_concept_id)
    LEFT JOIN
        `measurement_ext` USING(measurement_id)
    WHERE
        person_id IN (SELECT * FROM cohort_tbl)
    GROUP BY
        measurement_concept_id,
        measurement_name,
        measurement_source,
        unit_concept_id,
        unit_name
    ORDER BY
        N DESC
)'    

In [None]:
# Summarize every candidate lipid measurement found for the cohort.
measurements_of_interest_summary <- formulate_and_run_multipart_query(
    subqueries = c(COHORT_QUERY, LABS_OF_INTEREST_QUERY, MEASUREMENTS_OF_INTEREST_SUMMARY_QUERY),
    final_tbl = 'measurements_of_interest_summary_tbl')

In [None]:
head(measurements_of_interest_summary, 25)

<div class="alert alert-block alert-warning">
You can see above that much of the 'No matching concept' data does appear to be in units of milligram per deciliter, judging from the average, standard deviation, and quantiles.
</div>


## Retrieve concept ids by name

<div class="alert alert-block alert-success">
From the table above, identify which measures and units we want to include. Then use the readable names to programmatically obtain the concept ids.
</div>


In [None]:
# Unit names to retain when filtering the measurement summary.
UNIT_NAMES <- c('milligram per deciliter', 'No matching concept', 'mg/dL')

# HDL cholesterol
HDL_MEASURE_NAMES <- c(
    'Cholesterol in HDL [Mass/volume] in Serum or Plasma',
    'Cholesterol in HDL [Mass/volume] in Serum or Plasma by Electrophoresis',
    'Cholesterol in HDL [Mass/volume] in Serum or Plasma ultracentrifugate'
)

# LDL cholesterol
LDL_MEASURE_NAMES <- c(
    'Cholesterol in LDL [Mass/volume] in Serum or Plasma by calculation',
    'Cholesterol in LDL [Mass/volume] in Serum or Plasma',
    'Cholesterol in LDL [Mass/volume] in Serum or Plasma ultracentrifugate',
    'Cholesterol in LDL [Mass/volume] in Serum or Plasma by Direct assay',
    'Cholesterol in LDL [Mass/volume] in Serum or Plasma by Electrophoresis'
)

# Total cholesterol
TC_MEASURE_NAMES <- c('Cholesterol [Mass/volume] in Serum or Plasma')

# Triglycerides
TG_MEASURE_NAMES <- c(
    'Triglyceride [Mass/volume] in Serum or Plasma',
    'Triglyceride [Mass/volume] in Blood',
    'Triglyceride [Mass/volume] in Serum or Plasma --fasting',
    'Triglyceride [Mass/volume] in Serum or Plasma by calculation'
)

all_lipids_measure_names <- c(HDL_MEASURE_NAMES, LDL_MEASURE_NAMES, TC_MEASURE_NAMES, TG_MEASURE_NAMES)

# Keep only the chosen measurement/unit combinations and label each row with its lipid type.
lipids_measurement_summary <- measurements_of_interest_summary %>%
    filter(
        measurement %in% all_lipids_measure_names,
        unit %in% UNIT_NAMES
    ) %>%
    mutate(lipid_type = case_when(
        measurement %in% HDL_MEASURE_NAMES ~ 'HDL',
        measurement %in% LDL_MEASURE_NAMES ~ 'LDL',
        measurement %in% TC_MEASURE_NAMES ~ 'TC',
        measurement %in% TG_MEASURE_NAMES ~ 'TG',
        # No row should reach this fallback; it exists so that the value is never NA.
        TRUE ~ 'FIXME'
    ))

In [None]:
lipids_measurement_summary

### Create the vectors of concept ids we will use in queries

In [None]:
# Guard against typos: every measurement name we listed must appear in the filtered summary.
stopifnot(length(unique(lipids_measurement_summary$measurement)) == length(all_lipids_measure_names))
lipids_measurement_concept_ids <- lipids_measurement_summary %>%
    pull(measurement_concept_id) %>%
    unique() %>%
    sort()
lipids_measurement_concept_ids

# Guard against typos: every unit name we listed must appear in the filtered summary.
stopifnot(length(unique(lipids_measurement_summary$unit)) == length(UNIT_NAMES))
lipids_unit_concept_ids <- lipids_measurement_summary %>%
    pull(unit_concept_id) %>%
    unique() %>%
    sort()
lipids_unit_concept_ids

In [None]:
# Sorted, de-duplicated measurement concept ids labelled as LDL.
ldl_measurement_concept_ids <- lipids_measurement_summary %>%
    filter(lipid_type == 'LDL') %>%
    pull(measurement_concept_id) %>%
    unique() %>%
    sort()
ldl_measurement_concept_ids

# Show the summary rows behind those ids for a visual check.
lipids_measurement_summary %>%
    filter(measurement_concept_id %in% ldl_measurement_concept_ids) %>%
    arrange(measurement_concept_id, desc(unit_concept_id)) %>%
    select(lipid_type, measurement, unit, measurement_concept_id)

In [None]:
# Sorted, de-duplicated measurement concept ids labelled as HDL.
hdl_measurement_concept_ids <- lipids_measurement_summary %>%
    filter(lipid_type == 'HDL') %>%
    pull(measurement_concept_id) %>%
    unique() %>%
    sort()
hdl_measurement_concept_ids

# Show the summary rows behind those ids for a visual check.
lipids_measurement_summary %>%
    filter(measurement_concept_id %in% hdl_measurement_concept_ids) %>%
    arrange(measurement_concept_id, desc(unit_concept_id)) %>%
    select(lipid_type, measurement, unit, measurement_concept_id)

In [None]:
# Sorted, de-duplicated measurement concept ids labelled as total cholesterol.
tc_measurement_concept_ids <- lipids_measurement_summary %>%
    filter(lipid_type == 'TC') %>%
    pull(measurement_concept_id) %>%
    unique() %>%
    sort()
tc_measurement_concept_ids

# Show the summary rows behind those ids for a visual check.
lipids_measurement_summary %>%
    filter(measurement_concept_id %in% tc_measurement_concept_ids) %>%
    arrange(measurement_concept_id, desc(unit_concept_id)) %>%
    select(lipid_type, measurement, unit, measurement_concept_id)

In [None]:
# Sorted, de-duplicated measurement concept ids labelled as triglycerides.
tg_measurement_concept_ids <- lipids_measurement_summary %>%
    filter(lipid_type == 'TG') %>%
    pull(measurement_concept_id) %>%
    unique() %>%
    sort()
tg_measurement_concept_ids

# Show the summary rows behind those ids for a visual check.
lipids_measurement_summary %>%
    filter(measurement_concept_id %in% tg_measurement_concept_ids) %>%
    arrange(measurement_concept_id, desc(unit_concept_id)) %>%
    select(lipid_type, measurement, unit, measurement_concept_id)

## Most recent measurement per person by measurement concept

In [None]:
# SQL fragment defining `measurements_tbl`: row-level lipid measurements for cohort members.
# The concept id vectors computed earlier in the notebook are interpolated as comma-separated
# lists. Rows are kept only when `value_as_number` is non-null and positive and both the
# measurement concept and the unit concept are among the selected ids. Each row is labelled with
# a `lipid_type` according to the vector its measurement concept id belongs to.
MEASUREMENTS_QUERY <- str_glue('
measurements_tbl AS (
-- Return row level data for certain measurements for members of the cohort.
    SELECT
        person_id,
        CONCAT(measurement_concept.concept_name, " [", unit_concept.concept_name, "]") AS title,
        measurement_id,
        measurement_concept_id,
        unit_concept_id,
        measurement_date,
        measurement_datetime,
        value_as_number,
        CASE
            WHEN measurement_concept_id IN ({str_c(ldl_measurement_concept_ids, collapse = ", ")}) THEN "LDL"
            WHEN measurement_concept_id IN ({str_c(hdl_measurement_concept_ids, collapse = ", ")}) THEN "HDL"
            WHEN measurement_concept_id IN ({str_c(tc_measurement_concept_ids, collapse = ", ")}) THEN "TC"
            WHEN measurement_concept_id IN ({str_c(tg_measurement_concept_ids, collapse = ", ")}) THEN "TG"
            ELSE "FIXME" -- We should not have any of these.
        END AS lipid_type
    FROM
        `measurement`
    LEFT JOIN `concept` AS measurement_concept
        ON measurement_concept.concept_id = measurement_concept_id
    LEFT JOIN `concept` AS unit_concept
        ON unit_concept.concept_id = unit_concept_id
    WHERE
        value_as_number IS NOT NULL
        AND value_as_number > 0
        AND measurement_concept_id IN ({str_c(lipids_measurement_concept_ids, collapse = ", ")})
        AND unit_concept_id IN ({str_c(lipids_unit_concept_ids, collapse = ", ")})
        AND person_id IN (SELECT * FROM cohort_tbl)
)')

In [None]:
# SQL fragment defining `ranked_measurements_tbl`: every row of `measurements_tbl` plus two
# recency ranks (1 = most recent), ordered by measurement date, then datetime, then measurement id:
#   - recency_rank_by_concept: within person, measurement concept and unit concept
#   - recency_rank_by_lipid_type: within person and lipid type
# NOTE(review): the constant name is misspelled ("RANNKED"); it is referenced under this spelling
# by the cells that run the queries, so any rename must be applied to all of them together.
RANNKED_MEASUREMENTS_QUERY <- str_glue('
ranked_measurements_tbl AS (
-- Add ranks to the row level data for certain measurements for members of the cohort.
    SELECT
        measurements_tbl.*,
        ROW_NUMBER() OVER (PARTITION BY person_id, measurement_concept_id, unit_concept_id
                           ORDER BY measurement_date DESC,
                                    measurement_datetime DESC,
                                    measurement_id DESC) AS recency_rank_by_concept,
        ROW_NUMBER() OVER (PARTITION BY person_id, lipid_type
                           ORDER BY measurement_date DESC,
                                    measurement_datetime DESC,
                                    measurement_id DESC) AS recency_rank_by_lipid_type

    FROM
        measurements_tbl
)')

In [None]:
# SQL fragment that keeps only the rows of `ranked_measurements_tbl` ranked first by
# `recency_rank_by_concept`, i.e. the most recent row per person, measurement concept and unit.
MOST_RECENT_MEASUREMENT_PER_PERSON_BY_MEASUREMENT_CONCEPT_QUERY <- str_glue('
most_recent_measurement_per_person_by_measurement_concept_tbl AS (
-- Return row level data for certain measurements, limited to only the most recent result per concept
-- and per person in our cohort.
    SELECT
        *
    FROM
        ranked_measurements_tbl
    WHERE
        recency_rank_by_concept = 1
)')

In [None]:
# Chain cohort -> measurements -> ranks -> most recent per concept, and download the result.
most_recent_measurement_per_person_by_measurement_concept <- formulate_and_run_multipart_query(
    subqueries = c(
        COHORT_QUERY,
        MEASUREMENTS_QUERY,
        RANNKED_MEASUREMENTS_QUERY,
        MOST_RECENT_MEASUREMENT_PER_PERSON_BY_MEASUREMENT_CONCEPT_QUERY
    ),
    final_tbl = 'most_recent_measurement_per_person_by_measurement_concept_tbl')

In [None]:
head(most_recent_measurement_per_person_by_measurement_concept)

In [None]:
# Per measurement title: person and row counts plus basic statistics of the measured value.
most_recent_measurement_per_person_by_measurement_concept %>%
    group_by(title) %>%
    summarise(
        lipid_type = unique(lipid_type),
        num_persons = n_distinct(person_id),
        num_measures = n(),
        missing = sum(is.na(value_as_number)),
        median = median(value_as_number, na.rm = TRUE),
        mean = mean(value_as_number, na.rm = TRUE),
        stddev = sd(value_as_number, na.rm = TRUE)
    ) %>%
    arrange(lipid_type, desc(num_persons))

In [None]:
# Per lipid type: person and row counts plus basic statistics of the measured value.
most_recent_measurement_per_person_by_measurement_concept %>%
    group_by(lipid_type) %>%
    summarise(
        num_persons = n_distinct(person_id),
        num_measures = n(),
        missing = sum(is.na(value_as_number)),
        median = median(value_as_number, na.rm = TRUE),
        mean = mean(value_as_number, na.rm = TRUE),
        stddev = sd(value_as_number, na.rm = TRUE)
    ) %>%
    arrange(desc(num_persons))

## Most recent measurement per person by lipid type

In [None]:
# SQL fragment that keeps only the rows of `ranked_measurements_tbl` ranked first by
# `recency_rank_by_lipid_type`, i.e. the most recent row per person and lipid type.
MOST_RECENT_MEASUREMENT_PER_PERSON_BY_LIPID_TYPE_QUERY <- '
most_recent_measurement_per_person_by_lipid_type_tbl AS (
-- Return row level data for certain measurements, limited to only the most recent result per lipid type
-- and per person in our cohort.
    SELECT
        *
    FROM
        ranked_measurements_tbl
    WHERE
        recency_rank_by_lipid_type = 1
)'

In [None]:
# Chain cohort -> measurements -> ranks -> most recent per lipid type, and download the result.
most_recent_measurement_per_person_by_lipid_type <- formulate_and_run_multipart_query(
    subqueries = c(
        COHORT_QUERY,
        MEASUREMENTS_QUERY,
        RANNKED_MEASUREMENTS_QUERY,
        MOST_RECENT_MEASUREMENT_PER_PERSON_BY_LIPID_TYPE_QUERY
    ),
    final_tbl = 'most_recent_measurement_per_person_by_lipid_type_tbl')

In [None]:
head(most_recent_measurement_per_person_by_lipid_type)

In [None]:
# Per lipid type: person and row counts plus basic statistics of the measured value.
most_recent_measurement_per_person_by_lipid_type %>%
    group_by(lipid_type) %>%
    summarise(
        num_persons = n_distinct(person_id),
        num_measures = n(),
        missing = sum(is.na(value_as_number)),
        median = median(value_as_number, na.rm = TRUE),
        mean = mean(value_as_number, na.rm = TRUE),
        stddev = sd(value_as_number, na.rm = TRUE)
    ) %>%
    arrange(desc(num_persons))

# Statin Use

<div class="alert alert-block alert-success">
    <b>Three steps:</b>
    <ol>
        <li>Obtain the statin drug exposures for the members of our cohort.</li>
        <li>Many of the drug exposures do not have an end time. Fill those NAs by using the start time.</li>
        <li>Determine the outer bounds of the time interval over which the person has taken a statin. There could of course be gaps in the middle, but this is a reasonable estimate.</li>
    </ol>
</div>

In [None]:
# SQL fragment defining `statin_drug_exposures_tbl`: one row per `drug_exposure` record of a
# cohort member whose drug concept descends (via `cb_criteria` / `cb_criteria_ancestor`) from one
# of the eight hard-coded concept ids below, joined to `concept` for the drug's code and name.
# NOTE(review): the eight concept ids are presumably the statin ingredients - confirm against the
# dataset builder export this query was generated from.
STATIN_DRUG_EXPOSURES_QUERY <- '
-- This query represents dataset "Statin use for AoU WGS cohort" for domain "drug" and was generated 
-- for All of Us Controlled Tier Dataset v5 alpha and then further modified.
statin_drug_exposures_tbl AS (
  SELECT
    d_exposure.PERSON_ID AS person_id,
    d_exposure.DRUG_CONCEPT_ID AS drug_concept_id,
    d_exposure.DRUG_EXPOSURE_START_DATETIME AS exposure_start_datetime,
    d_exposure.DRUG_EXPOSURE_END_DATETIME AS exposure_end_datetime,
    d_standard_concept.concept_code AS standard_concept_code,
    d_standard_concept.concept_name AS standard_concept_name
  FROM (
    SELECT
        *
    FROM
        `drug_exposure` AS d_exposure
    WHERE
        ( drug_concept_id IN (
            SELECT
                DISTINCT ca.descendant_id
            FROM
                `cb_criteria_ancestor` AS ca
            JOIN (
                SELECT
                    DISTINCT c.concept_id
                FROM
                    `cb_criteria` c
                JOIN (
                    SELECT
                        CAST(cr.id AS string) AS id
                    FROM
                        `cb_criteria` cr
                    WHERE
                        domain_id = "DRUG"
                        AND is_standard = 1
                        AND concept_id IN ( 1545958, 1551860, 1549686, 1510813, 1592085, 40165636, 1539403, 1592180 )
                        AND is_selectable = 1
                        AND full_text LIKE "%[drug_rank1]%" ) a
                ON
                    ( c.path LIKE CONCAT("%.", a.id, ".%")
                        OR c.path LIKE CONCAT("%.", a.id)
                        OR c.path LIKE CONCAT(a.id, ".%")
                        OR c.path = a.id)
                WHERE
                    domain_id = "DRUG"
                    AND is_standard = 1
                    AND is_selectable = 1 ) b
            ON
                ( ca.ancestor_id = b.concept_id ) ) )
        AND ( d_exposure.PERSON_ID IN (
            SELECT
                person_id
            FROM
                `cb_search_person` AS cb_search_person
            WHERE
                cb_search_person.person_id IN ( SELECT * FROM cohort_tbl ) ) ) ) d_exposure
  LEFT JOIN
    `concept` d_standard_concept
  ON
    d_exposure.DRUG_CONCEPT_ID = d_standard_concept.CONCEPT_ID
)'

In [None]:
# SQL fragment defining `statin_drug_exposures_with_end_time_filled_tbl`: the same columns as
# `statin_drug_exposures_tbl`, with a NULL exposure end datetime replaced by the start datetime.
STATIN_DRUG_EXPOSURES_WITH_END_TIME_FILLED_QUERY <- '
-- Many drug exposure records do not have an end time. When that is the case, use the start time as the end time.
statin_drug_exposures_with_end_time_filled_tbl AS (
    SELECT
        person_id,
        drug_concept_id,
        exposure_start_datetime,
        IFNULL(exposure_end_datetime, exposure_start_datetime) AS exposure_end_datetime,
        standard_concept_code,
        standard_concept_name
    FROM
        statin_drug_exposures_tbl
)'

In [None]:
# SQL fragment defining `statin_exposure_summary_per_person_tbl`: one row per person with the
# earliest exposure start, the latest (filled) exposure end, the number of exposure records, the
# number of distinct drug concept codes, and a comma-separated list of distinct drug names.
STATIN_DRUG_EXPOSURE_SUMMARY_PER_PERSON_QUERY <- '
-- Determine the outer bounds of the time interval over which the person has taken a statin. There could of course
-- be gaps in the middle, but this is a reasonable estimate.
statin_exposure_summary_per_person_tbl AS (
    SELECT
        person_id,
        MIN(exposure_start_datetime) AS statin_first_use_datetime,
        MAX(exposure_end_datetime) AS statin_last_use_datetime,
        COUNT(1) AS statin_drug_rx_count,
        COUNT(DISTINCT standard_concept_code) AS statin_drug_count,
        STRING_AGG(DISTINCT standard_concept_name, ", ") AS statin_drugs
    FROM 
        statin_drug_exposures_with_end_time_filled_tbl
    GROUP BY
        person_id
)'

In [None]:
# Chain cohort -> statin exposures -> end-time fill -> per-person summary, and download the result.
statin_exposure_summary_per_person <- formulate_and_run_multipart_query(
    subqueries = c(
        COHORT_QUERY,
        STATIN_DRUG_EXPOSURES_QUERY,
        STATIN_DRUG_EXPOSURES_WITH_END_TIME_FILLED_QUERY,
        STATIN_DRUG_EXPOSURE_SUMMARY_PER_PERSON_QUERY
    ),
    final_tbl = 'statin_exposure_summary_per_person_tbl')

In [None]:
statin_exposure_summary_per_person %>%
    arrange(desc(statin_drug_count)) %>%
    head(n = 20)

In [None]:
statin_exposure_summary_per_person %>%
    arrange(desc(statin_drug_count)) %>%
    tail(n = 20)

In [None]:
statin_exposure_summary_per_person %>%
    arrange(desc(statin_drug_rx_count)) %>%
    tail(n = 20)

# Now - bring all this data together in one query!

<div class="alert alert-block alert-success">
<p>Join our demographics, lipids measurements, and statin drug exposures together via a single query for scalability. <b>Note that</b>:</p>
<ul>
    <li>We keep the measurements on the left side of the join to allow for the null statin drug exposures we expect for the members of the cohort who do not take statins.</li>
    <li>The JOIN criteria for the lipids measurements and the statin drug intervals is whether the measurement time overlaps the drug exposure time interval.</li>
    <li>To do this same join in R, one can use R package <a href='https://cran.r-project.org/web/packages/fuzzyjoin/'>fuzzyjoin</a> but it tends to crash the kernel as data sizes increase.</li>
    </ul>
</div>


In [None]:
# SQL fragment defining `lipid_phenotype_tbl`: the most recent measurement per person and lipid
# type, left-joined to demographics on person id and to the per-person statin summary on person id
# AND the measurement datetime falling within [statin_first_use_datetime, statin_last_use_datetime].
# Measurements outside that interval (or for people with no statin records) get NULL statin columns.
LIPID_PHENOTYPE_QUERY <- '
lipid_phenotype_tbl AS (
    SELECT
        demog.*,
        meas.* EXCEPT(person_id),
        statin_use.* EXCEPT(person_id)
    FROM
        most_recent_measurement_per_person_by_lipid_type_tbl AS meas
    LEFT JOIN
        demographics_tbl AS demog
    USING(person_id) 
    LEFT JOIN
        statin_exposure_summary_per_person_tbl AS statin_use
    ON
        meas.person_id = statin_use.person_id
        AND meas.measurement_datetime >= statin_first_use_datetime
        AND meas.measurement_datetime <= statin_last_use_datetime
)'

In [None]:
# Run every fragment together so that the final join happens inside BigQuery.
lipid_phenotype <- formulate_and_run_multipart_query(
    subqueries = c(
        COHORT_QUERY,
        MEASUREMENTS_QUERY,
        RANNKED_MEASUREMENTS_QUERY,
        MOST_RECENT_MEASUREMENT_PER_PERSON_BY_LIPID_TYPE_QUERY,
        DEMOGRAPHICS_QUERY,
        STATIN_DRUG_EXPOSURES_QUERY,
        STATIN_DRUG_EXPOSURES_WITH_END_TIME_FILLED_QUERY,
        STATIN_DRUG_EXPOSURE_SUMMARY_PER_PERSON_QUERY,
        LIPID_PHENOTYPE_QUERY
    ),
    final_tbl = 'lipid_phenotype_tbl')

In [None]:
colnames(lipid_phenotype)

In [None]:
head(lipid_phenotype)

# Add statin indicator variable

In [None]:
# Add an indicator for statin use: TRUE when the row has a statin first-use datetime.
lipid_phenotype <- mutate(
    lipid_phenotype,
    statin_use = !is.na(statin_first_use_datetime)
)

# Add age covariates

In [None]:
# Age at time of measurement in whole years (rounded down), and its square.
lipid_phenotype <- lipid_phenotype %>%
    mutate(age = floor(interval(start = date_of_birth, end = measurement_date) / duration(num = 1, units = 'years'))) %>%
    mutate(age2 = age^2)

# Adjust lipids for statin use

## Adjust LDL for statin use

In [None]:
# LDL adjustment based on TG and LDL values
# If TG > 400, then LDL = NA
# If LDL < 10, then LDL = NA
# If STATIN is used, LDL_ADJ = LDL/0.7
#
# Builds a copy of the LDL rows, relabelled 'LDL_adjusted', with the rules above applied.
# A helper column TG (the same person's TG value, NA when they have none) is attached for the
# TG rule; callers drop it before appending the rows to lipid_phenotype.

ldl_adjusted_phenotype <- lipid_phenotype %>%
    filter(lipid_type == 'LDL') %>%
    left_join(
        lipid_phenotype %>%
            filter(lipid_type == 'TG') %>%
            select(person_id, TG = value_as_number),
        # Name the key explicitly instead of relying on the implicit natural join
        # (which also prints a "Joining, by" message).
        by = 'person_id'
    ) %>%
    mutate(
        lipid_type = 'LDL_adjusted',
        # Blank out implausibly low LDL and LDL measured alongside very high TG.
        # A missing TG falls through to the last branch and leaves LDL unchanged.
        value_as_number = case_when(
            value_as_number < 10 ~ NA_real_,
            TG > 400 ~ NA_real_,
            TRUE ~ value_as_number
        )
    ) %>%
    mutate(
        # Scale up LDL for measurements taken during statin use.
        value_as_number = case_when(
            statin_use ~ value_as_number / 0.7,
            TRUE ~ value_as_number
        )
    )

In [None]:
# Add this new lipid type to our lipid phenotype.
lipid_phenotype <- rbind(
    lipid_phenotype,
    ldl_adjusted_phenotype %>% select(-TG)
)

## Adjust total cholesterol for statin use

In [None]:
# TC adjustment
# If STATIN is used, TC_ADJ = TC/0.8

tc_adjusted_phenotype <- lipid_phenotype %>%
    filter(lipid_type == 'TC') %>%
    mutate(
        lipid_type = 'TC_adjusted',
        value_as_number = case_when(
            statin_use ~ value_as_number / 0.8,
            TRUE ~ value_as_number
        )
    )

In [None]:
# Add this new lipid type to our lipid phenotype.
lipid_phenotype <- rbind(
    lipid_phenotype,
    tc_adjusted_phenotype
)

## Adjust triglycerides

In [None]:
# Triglyceride adjustment
# TG_ADJ = log(TG)

tg_adjusted_phenotype <- lipid_phenotype %>%
    filter(lipid_type == 'TG') %>%
    mutate(
        lipid_type = 'TG_adjusted',
        value_as_number = log(value_as_number)
    )

In [None]:
# Add this new lipid type to our lipid phenotype.
lipid_phenotype <- rbind(
    lipid_phenotype,
    tg_adjusted_phenotype
)

# Write phenotypes to workspace bucket

In [None]:
head(lipid_phenotype)

In [None]:
# Create a timestamp for a folder of results generated today.
DATESTAMP <- strftime(now(), '%Y%m%d')
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/phenotypes/{DATESTAMP}/')

print(str_glue('Phenotype files will be written to folder {DESTINATION}.'))

In [None]:
# Write a dataframe to a CSV file in the current working directory and copy it to the bucket.
#
# my_dataframe:         the dataframe to write.
# destination_filename: file name to use both locally and under the destination folder.
#
# Reads the global DESTINATION for the bucket folder. Returns the captured output of
# `gsutil ls` on that folder so the caller can confirm the file arrived.
write_phenotype_to_bucket <- function(my_dataframe, destination_filename) {
    # Store the dataframe in current workspace.
    write_excel_csv(my_dataframe, destination_filename)

    # Copy the file from current workspace to the bucket. There must be no space between
    # './' and the file name, otherwise the directory itself is passed as an extra source.
    system(str_glue('gsutil cp ./{destination_filename} {DESTINATION}'), intern = TRUE)

    # Check if file is in the bucket.
    system(str_glue('gsutil ls {DESTINATION}'), intern = TRUE)
}

In [None]:
#write_phenotype_to_bucket(lipid_phenotype, 'AOU_lipids_phenotype.csv')

## TEMPORARY - work around lack of alpha3 cohort

In [None]:
# Keep only the rows for people in the alpha3 id list. Both id columns are converted to
# double before matching so that they compare as the same type.
lipid_phenotype_alpha3 <- filter(
    lipid_phenotype,
    as.double(person_id) %in% as.double(alpha3_ids$person_id)
)

dim(lipid_phenotype_alpha3)

In [None]:
length(unique(lipid_phenotype_alpha3$person_id))

In [None]:
write_phenotype_to_bucket(lipid_phenotype_alpha3, 'AOU_lipids_phenotype_alpha3.csv')

# Provenance

In [None]:
devtools::session_info()