# Antidepressant GWAS Phenotypes and Covariates

Datasets were assembled in the All of Us workspace and then exported as SQL queries.

In [2]:
library(tidyverse)
library(bigrquery)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


## Queries

Get workspace variables and copy the queries to the local workspace.

In [8]:
# Workspace settings are read from environment variables; an unset variable
# comes back as the empty string.
bucket  <- Sys.getenv("WORKSPACE_BUCKET", unset = "")
dataset <- Sys.getenv("WORKSPACE_CDR", unset = "")
billing <- Sys.getenv("GOOGLE_PROJECT", unset = "")
owner   <- Sys.getenv("OWNER_EMAIL", unset = "")

# Copy every exported .sql file from the bucket's queries/ folder into the
# current working directory.
str_glue("gsutil -m cp {bucket}/queries/*.sql .") |>
    system()

In [9]:
# Index the downloaded query files by query name (the file name up to its
# first ".").
# `pattern` is a regular expression, not a shell glob: "*.sql" would also match
# names that merely contain "sql" somewhere, so anchor on the literal extension.
sql_paths <- list.files(".", pattern = "\\.sql$")
# Vectorised replacement for sapply(str_split(...), first): drop everything
# from the first "." onwards.
names(sql_paths) <- sub("\\..*$", "", sql_paths)
sql_paths

Read SQL query from file, construct CSV output path and URI, perform query, and write out dataset.

In [5]:
# For each named query: read its SQL text, run it against the workspace
# dataset, and save the result table to the bucket as
# bq_exports/<owner>/<query name>.csv.
walk(names(sql_paths), \(name) {
    sql_text <- read_file(sql_paths[[name]])
    csv_uri <- str_glue("{bucket}/bq_exports/{owner}/{name}.csv")
    result_table <- bq_dataset_query(dataset, query = sql_text, billing = billing)
    bq_table_save(
        result_table,
        destination_uris = csv_uri,
        destination_format = "CSV"
    )
})

Copy saved CSV files into local workspace.

In [29]:
# Pull this owner's exported CSV files from the bucket into the working directory.
str_glue("gsutil -m cp {bucket}/bq_exports/{owner}/*.csv .") |>
    system()

In [30]:
# List the CSV files now present in the working directory.
# `pattern` is a regular expression, not a glob, so match the literal ".csv"
# extension at the end of the name.
list.files(".", pattern = "\\.csv$")

Read in datasets

In [31]:
# One table per exported query; column types are guessed by read_csv().
genotyped_ehr <- read_csv(file = "genotyped-ehr.csv")
atc_n06a      <- read_csv(file = "genotyped-atc-n06a.csv")
atc_n06aa     <- read_csv(file = "genotyped-atc-n06aa.csv")
atc_n06ab     <- read_csv(file = "genotyped-atc-n06ab.csv")

[1mRows: [22m[34m206173[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (2): gender, date_of_birth
[32mdbl[39m (1): person_id

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m2349104[39m [1mColumns: [22m[34m6[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): standard_concept_name, standard_vocabulary, drug_exposure_start_dat...
[32mdbl[39m (3): person_id, drug_concept_id, standard_concept_code

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m228304[

## Covariates

Get provided ancestry cluster PCs

In [26]:
# Copy the ancestry predictions file locally; `-u {billing}` names the project
# passed to gsutil for this request.
str_glue("gsutil -m -u {billing} cp gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv .") |>
    system()

In [27]:
# Load the tab-separated ancestry predictions copied in the previous step.
ancestry_preds <- read_tsv(file = "ancestry_preds.tsv")

[1mRows: [22m[34m245394[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): ancestry_pred, probabilities, pca_features, ancestry_pred_other
[32mdbl[39m (1): research_id

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Parse the PCs out of the `pca_features` column (a comma-separated list enclosed in brackets).

In [42]:
# Expand the bracketed, comma-separated `pca_features` string into one numeric
# column per component (PC1, PC2, ...), one row per research_id.
pcs_aux <- ancestry_preds |>
    # Drop the first and last characters (the enclosing brackets).
    transmute(research_id, pca_features = str_sub(pca_features, start = 2, end = -2)) |>
    # One row per component value.
    separate_longer_delim(pca_features, delim = ",") |>
    transmute(research_id, pc_value = as.numeric(pca_features)) |>
    # Number the components within each person. Per-verb `.by` grouping is used
    # instead of group_by() so the result is not left grouped by research_id.
    mutate(PC = row_number(), .by = research_id) |>
    pivot_wider(names_prefix = "PC", names_from = PC, values_from = pc_value)

Get PCs calculated from within each cluster (from `Ancestry cluster PCs.ipynb`):

In [32]:
# Copy the .sscore files from the bucket's pcs/ folder into the working directory.
str_glue("gsutil -m cp {bucket}/pcs/*.sscore .") |>
    system()

In [34]:
# Read every downloaded .sscore file and stack the rows into a single table.
# `pattern` is a regular expression, not a glob, so anchor on the literal
# ".sscore" extension; the stray trailing comma in the call is also removed.
sscore_paths <- list.files(".", pattern = "\\.sscore$")
sscores <- bind_rows(lapply(sscore_paths, read_table))


[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
cols(
  `#FID` = [32mcol_double()[39m,
  IID = [32mcol_double()[39m,
  ALLELE_CT = [32mcol_double()[39m,
  NAMED_ALLELE_DOSAGE_SUM = [32mcol_double()[39m,
  PC1_AVG = [32mcol_double()[39m,
  PC2_AVG = [32mcol_double()[39m,
  PC3_AVG = [32mcol_double()[39m,
  PC4_AVG = [32mcol_double()[39m,
  PC5_AVG = [32mcol_double()[39m,
  PC6_AVG = [32mcol_double()[39m,
  PC7_AVG = [32mcol_double()[39m,
  PC8_AVG = [32mcol_double()[39m,
  PC9_AVG = [32mcol_double()[39m,
  PC10_AVG = [32mcol_double()[39m
)


[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
cols(
  `#FID` = [32mcol_double()[39m,
  IID = [32mcol_double()[39m,
  ALLELE_CT = [32mcol_double()[39m,
  NAMED_ALLELE_DOSAGE_SUM = [32mcol_double()[39m,
  PC1_AVG = [32mcol_double()[39m,
  PC2_AVG = [32mcol_double()[39m,
  PC3_AVG = [32m

In [44]:
# Covariate table built from the provided ancestry PCs: FID/IID identifiers,
# gender, and the PC columns, written out as a TSV.
covariates_aux <- genotyped_ehr |>
    inner_join(pcs_aux, by = c("person_id" = "research_id")) |>
    select(
        FID = person_id,
        IID = person_id,
        gender,
        starts_with("PC")
    ) |>
    mutate(
        # Keep "Female"/"Male"; every other value becomes the level "Gender".
        gender = case_when(
            gender %in% c("Female", "Male") ~ gender,
            .default = "Gender"
        ),
        # Standardise PC1 through PC10; scale() returns a one-column matrix,
        # so take that column as a plain vector.
        across(PC1:PC10, \(pc) scale(pc)[, 1])
    )
covariates_aux |>
    write_tsv("atc_antidep_aux.covar")

In [45]:
# Covariate table built from the within-cluster PC scores: FID/IID identifiers,
# gender, and the PC*_AVG columns, written out as a TSV.
covariates <- genotyped_ehr |>
    inner_join(sscores, by = c("person_id" = "IID")) |>
    select(
        FID = person_id,
        IID = person_id,
        gender,
        starts_with("PC")
    ) |>
    mutate(
        # Keep "Female"/"Male"; every other value becomes the level "Gender".
        gender = case_when(
            gender %in% c("Female", "Male") ~ gender,
            .default = "Gender"
        ),
        # Standardise PC1_AVG through PC10_AVG; scale() returns a one-column
        # matrix, so take that column as a plain vector.
        across(PC1_AVG:PC10_AVG, \(pc) scale(pc)[, 1])
    )
covariates |>
    write_tsv("atc_antidep.covar")

In [48]:
# Number of participants in the provided-PC covariate table.
covariates_aux |> nrow()

## Phenotypes

In [10]:
# One row per person in genotyped_ehr with a 0/1 indicator per drug table:
# 1 when the person_id appears in that table, 0 otherwise.
phenotypes <- genotyped_ehr |>
    transmute(
        FID = person_id,
        IID = person_id,
        N06A = as.numeric(person_id %in% atc_n06a$person_id),
        N06AA = as.numeric(person_id %in% atc_n06aa$person_id),
        N06AB = as.numeric(person_id %in% atc_n06ab$person_id)
    )

In [11]:
# Tabulate how many people fall into each combination of the three indicators.
count(phenotypes, N06A, N06AA, N06AB)

N06A,N06AA,N06AB,n
<dbl>,<dbl>,<dbl>,<int>
0,0,0,139379
1,0,0,16130
1,0,1,33752
1,1,0,8024
1,1,1,8888


In [12]:
# Save the phenotype table as a tab-separated file.
phenotypes |>
    write_tsv(file = "atc_antidep.pheno")

In [46]:
# Upload the local files matching atc_antidep.* to the bucket's inputs/ folder.
str_glue("gsutil -m cp atc_antidep.* {bucket}/inputs/") |>
    system()