In [None]:
# Install any packages that are not yet available in this environment.
# `installed.packages()` returns a matrix, so membership is tested against its
# row names (the package names) rather than against every cell of the matrix.
cran_packages <- c('viridis', 'ggthemes', 'skimr', 'fuzzyjoin', 'pryr')
missing_cran <- setdiff(cran_packages, rownames(installed.packages()))
if (length(missing_cran) > 0) {
    install.packages(missing_cran)
}

# IRanges is installed through BiocManager; BiocManager itself is installed
# first when it is absent so that the call below cannot fail on a missing
# namespace.
bioc_packages <- c('IRanges')
missing_bioc <- setdiff(bioc_packages, rownames(installed.packages()))
if (length(missing_bioc) > 0) {
    if (!requireNamespace('BiocManager', quietly = TRUE)) {
        install.packages('BiocManager')
    }
    BiocManager::install(missing_bioc)
}

library(viridis)    # A nice color scheme for plots.
library(ggthemes)   # Common themes to change the look and feel of plots.
library(scales)     # Graphical scales map data to aesthetics in plots.
library(skimr)      # Better summaries of data.
library(lubridate)  # Date library from the tidyverse.
library(bigrquery)  # BigQuery R client.
library(tidyverse)  # Data wrangling packages.
library(fuzzyjoin)
library(lubridate)
library(pryr)       # For memory profiling.

In [None]:
## BigQuery setup.
# Value of the GOOGLE_PROJECT environment variable (empty string when unset).
BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT')
# Get the BigQuery curated dataset for the current workspace context.
CDR <- Sys.getenv('WORKSPACE_CDR')

# Workspace bucket URI; falls back to the hard-coded bucket below when the
# WORKSPACE_BUCKET environment variable is not set.
WORKSPACE_BUCKET <- Sys.getenv('WORKSPACE_BUCKET', unset = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb')

## Plot setup.
theme_set(theme_bw(base_size = 16)) # Default theme for plots.

#' Build a one-row annotation (position and count label) for ggplot boxplots.
#'
#' @param df The values to summarise. `max()` and `length()` are applied to it
#'   directly, so a plain vector yields its largest value and its element count.
#' @return A one-row data frame with column `y` (the maximum of `df`) and
#'   column `label` (the text `"N = <length of df>"`).
get_boxplot_fun_data <- function(df) {
  top_value <- max(df)
  count_label <- paste0('N = ', length(df))
  data.frame(y = top_value, label = count_label)
}

# AOU Data

## Retrieve Datasets

### Retrieve Alpha 2 Release data

In [None]:
# Stream the Alpha 2 release research IDs from the workspace bucket.
# `col_names` is supplied, so the first line of the file is read as data and
# the single column is called `person_id`.
participants_with_genomic_data <-
    str_glue('gsutil cat {WORKSPACE_BUCKET}/data/researchIDsAlpha2Release_04272021.txt') %>%
    pipe() %>%
    read_csv(col_names = 'person_id')

dim(participants_with_genomic_data)

### Retrieve most recent lipids measurements

In [None]:
# Stream the most recent lipid measurements CSV from the workspace bucket.
most_recent_lipids_measurements_df <-
    str_glue('gsutil cat {WORKSPACE_BUCKET}/data/most_recent_lipids_measurements.csv') %>%
    pipe() %>%
    read_csv()
dim(most_recent_lipids_measurements_df)

In [None]:
head(most_recent_lipids_measurements_df)
# Number of measurement rows whose title contains "HDL".
sum(grepl("HDL", most_recent_lipids_measurements_df$title))

### Retrieve statin drug exposures summarized per person

In [None]:
# Stream the per-person statin exposure summary from the workspace bucket.
# The first/last use columns are parsed as datetimes in 'Y/m/d H:M:S' format;
# the remaining column types are guessed from up to 25000 rows.
statin_use_summary_df <-
    str_glue('gsutil cat {WORKSPACE_BUCKET}/data/participants_with_statin_use_summary.csv') %>%
    pipe() %>%
    read_csv(
        guess_max = 25000,
        col_types = cols(
            statin_last_use = col_datetime(format = '%Y/%m/%d %H:%M:%S'),
            statin_first_use = col_datetime(format = '%Y/%m/%d %H:%M:%S')
        )
    )

dim(statin_use_summary_df)

In [None]:
dim(statin_use_summary_df)
head(statin_use_summary_df)

## Lipid Adjustment

### Create grouped dataframe

In [None]:
# Per-title summary of the raw measurements, restricted to the participants
# that have genomic data: participant and row counts, missing values, and the
# median / mean / standard deviation of the numeric value.
most_recent_lipids_measurements_df %>%
    filter(is.element(person_id, participants_with_genomic_data$person_id)) %>%
    group_by(title) %>%
    summarise(
        num_alpha2_participants = length(unique(person_id)),
        num_measures = n(),
        missing = sum(is.na(value_as_number)),
        median = median(value_as_number, na.rm = TRUE),
        mean = mean(value_as_number, na.rm = TRUE),
        stddev = sd(value_as_number, na.rm = TRUE)
    ) %>%
    # Most frequently measured titles first.
    arrange(desc(num_alpha2_participants))

In [None]:
# Start from a copy of the raw measurements; the titles are collapsed below.
most_recent_lipids_measurements_df_grouped <- most_recent_lipids_measurements_df
dim(most_recent_lipids_measurements_df_grouped)
head(most_recent_lipids_measurements_df_grouped)

In [None]:
most_recent_lipids_measurements_df_grouped <- most_recent_lipids_measurements_df

# Collapse the measurement titles into four lipid classes. The patterns are
# applied in this order on purpose: a title already rewritten to "HDL" or
# "LDL" no longer contains "Cholesterol", so only the remaining cholesterol
# titles end up as "TC".
# NOTE(review): the pattern "LDL" also matches any title containing "VLDL" —
# confirm no such titles are present in the input.
title_patterns <- c(HDL = "HDL", LDL = "LDL", TG = "Triglyceride", TC = "Cholesterol")
for (short_title in names(title_patterns)) {
    temp <- grep(title_patterns[[short_title]], most_recent_lipids_measurements_df_grouped$title)
    most_recent_lipids_measurements_df_grouped$title[temp] <- short_title
}

head(most_recent_lipids_measurements_df_grouped)

In [None]:
# Keep only the most recent measurement per participant and lipid class.
# Rows are sorted newest-first, the first row of each (person_id, title) group
# is kept, and the grouping is then dropped so that later verbs operate on the
# whole table rather than silently per group.
Lipid_data <- most_recent_lipids_measurements_df_grouped %>%
    group_by(person_id, title) %>%
    arrange(desc(measurement_date)) %>%
    filter(row_number() == 1) %>%
    ungroup()
most_recent_lipids_measurements_df_grouped <- Lipid_data

In [None]:
# Per-title summary of the collapsed measurement table, restricted to the
# participants that have genomic data.
most_recent_lipids_measurements_df_grouped %>%
    filter(is.element(person_id, participants_with_genomic_data$person_id)) %>%
    group_by(title) %>%
    summarise(
        num_alpha2_participants = length(unique(person_id)),
        num_measures = n(),
        missing = sum(is.na(value_as_number)),
        median = median(value_as_number, na.rm = TRUE),
        mean = mean(value_as_number, na.rm = TRUE),
        stddev = sd(value_as_number, na.rm = TRUE)
    ) %>%
    # Most frequently measured titles first.
    arrange(desc(num_alpha2_participants))

In [None]:
head(most_recent_lipids_measurements_df_grouped)
# Row counts per sex_at_birth value.
table(most_recent_lipids_measurements_df_grouped$sex_at_birth)

### Functions to build an individual dataframe for each lipid class and write it to a file

In [None]:
#' Save a data frame as a CSV file and copy it into the workspace bucket.
#'
#' The data frame is first written to the current working directory with
#' `write_excel_csv()`, then copied with `gsutil cp` into the `data/` folder of
#' the bucket named by the `WORKSPACE_BUCKET` environment variable.
#'
#' @param FileToSave Data frame to write.
#' @param FileName Name of the CSV file to create locally and in the bucket.
#' @return Character vector holding the output of `gsutil ls` for the CSV files
#'   in the bucket's `data/` folder.
writeFile <- function(FileToSave, FileName) {
    # Get the bucket name. An empty value would make gsutil copy to the local
    # path "/data/" instead of a bucket, so fail early with a clear message.
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
    if (!nzchar(my_bucket)) {
        stop("Environment variable 'WORKSPACE_BUCKET' is not set; cannot copy '",
             FileName, "' to the bucket.", call. = FALSE)
    }

    # Store the dataframe in the current workspace.
    write_excel_csv(FileToSave, FileName)

    # Copy the file from the current workspace to the bucket. Arguments are
    # quoted so that file names containing spaces or shell metacharacters are
    # passed to gsutil unchanged.
    copy_output <- system2(
        "gsutil",
        c("cp", shQuote(paste0("./", FileName)), shQuote(paste0(my_bucket, "/data/"))),
        stdout = TRUE
    )
    # A non-zero exit status is reported in the "status" attribute.
    copy_status <- attr(copy_output, "status")
    if (!is.null(copy_status) && copy_status != 0) {
        stop("gsutil cp failed with exit status ", copy_status, " while copying '",
             FileName, "' to ", my_bucket, "/data/", call. = FALSE)
    }

    # Check if the file is in the bucket: list the CSV files in the data folder.
    # gsutil expands the wildcard itself, so the quoted pattern is safe.
    system2("gsutil", c("ls", shQuote(paste0(my_bucket, "/data/*.csv"))), stdout = TRUE)
}

In [None]:
# http://adv-r.had.co.nz/memory.html
# Display the current memory usage as reported by mem_used().
mem_used()

In [None]:
# CHANGED function
#  Some of the transformations in the original function only need to be 
#  performed once on the input data. I moved that code out of your function.


most_recent_lipids_measurements_df_grouped <- most_recent_lipids_measurements_df_grouped %>%
    # Limit the input data frames to hold only the AoU participants with genomic data.
    filter(person_id %in% participants_with_genomic_data$person_id) %>%
    mutate(
        # Also convert datetimes to dates so that the join is faster.
        date = as_date(measurement_date),
        birthDate = as_date(birth_datetime),
        # Years elapsed as of today since birth and since the measurement.
        # These two columns are kept so the output schema stays the same.
        # A year is taken as 365.25 days (the previous divisor of 52.25 weeks
        # corresponds to 365.75 days and slightly understated every age).
        age_birth = as.numeric(difftime(Sys.Date(), birthDate, units = "days")) / 365.25,
        age_date = as.numeric(difftime(Sys.Date(), date, units = "days")) / 365.25,
        # Age at the time of the measurement, computed directly from the two
        # dates so that it does not depend on the day the notebook is run.
        age = as.numeric(difftime(date, birthDate, units = "days")) / 365.25,
        age2 = age^2
   )

# Restrict the statin summary to the AoU participants with genomic data.
statin_use_summary_df <- statin_use_summary_df %>%
    filter(is.element(PERSON_ID, participants_with_genomic_data$person_id))

# Derive date-only versions of the exposure window (faster to join on than
# datetimes) and a logical flag that is TRUE when a first-use datetime exists.
statin_use_summary_df <- statin_use_summary_df %>%
    mutate(
        statin_first_use_date = as_date(statin_first_use),
        statin_last_use_date = as_date(statin_last_use),
        statin_use = !is.na(statin_first_use)
    )

#' Build, save and return the per-lipid table annotated with statin use.
#'
#' Filters the global `most_recent_lipids_measurements_df_grouped` to one lipid
#' class and left-joins the global `statin_use_summary_df` on matching person
#' and on the measurement date falling inside the statin exposure window. The
#' result is written to `<TitleUsed>_ForAdjNor.20210714.csv` via `writeFile()`.
#' The dimensions of the result and the elapsed time are printed.
#'
#' @param TitleUsed Lipid class to select (value of the `title` column).
#' @return The joined data frame for that lipid class.
SummariseLipids <- function(TitleUsed) {
    start_time <- Sys.time()

    exposure_summary <- most_recent_lipids_measurements_df_grouped %>%
        filter(title == TitleUsed) %>%
        fuzzy_left_join(
            statin_use_summary_df,
            by = c('person_id' = 'PERSON_ID',
                   'date' = 'statin_first_use_date',
                   'date' = 'statin_last_use_date'),
            # Same person, and first use <= measurement date <= last use.
            match_fun = list(`==`, `>=`, `<=`)
        ) %>%
        # Measurements with no matching exposure window come out of the left
        # join with NA in every statin column. Record them explicitly as
        # "no statin use" so that later NA filtering does not discard them.
        mutate(statin_use = coalesce(statin_use, FALSE))

    print(dim(exposure_summary))

    end_time <- Sys.time()
    print(end_time - start_time)

    writeFile(exposure_summary, paste0(TitleUsed, "_ForAdjNor.20210714.csv"))

    return(exposure_summary)
}

In [None]:
# Display the current memory usage as reported by mem_used().
mem_used()

In [None]:
# Preview the first rows of the lipid measurement table.
head(most_recent_lipids_measurements_df_grouped)

### Get each Lipid dataset separately and write it as .csv for future use

In [None]:
# Carry out this step once so that the file is saved as .csv
# mem_used() is called before and after each call so the memory usage around
# the join can be compared.
mem_used()
LDL <- SummariseLipids('LDL')
mem_used()

In [None]:
# Inspect the LDL result.
dim(LDL)
head(LDL)

In [None]:
# Same step for HDL.
mem_used()
HDL <- SummariseLipids('HDL')
mem_used()

In [None]:
# Same step for total cholesterol.
mem_used()
TC <- SummariseLipids('TC')
mem_used()

In [None]:
# Same step for triglycerides.
mem_used()
TG <- SummariseLipids('TG')
mem_used()

# UKB data

## Retrieve Datasets

### Retrieve eid to sample id mapping

In [None]:
# TODO(margaret) replace this path with the path to the correct file for the WES data. --- Done (margaret)
# Bucket location of the tab-separated file linking the two UKB id columns.
UKB_EID_TO_SAMPLE_ID <- 'gs://uk-biobank-sek-data-us-east1/sample-info/bridge_7089_31063.tsv'

In [None]:
# Peek at the first lines of the linker file.
system(str_glue('gsutil cat {UKB_EID_TO_SAMPLE_ID} | head'), intern = TRUE)

In [None]:
# Stream the whole linker file into a data frame.
ukb_eid_to_sample_id <-
    str_glue('gsutil cat {UKB_EID_TO_SAMPLE_ID}') %>%
    pipe() %>%
    read_delim(delim = '\t')

dim(ukb_eid_to_sample_id)
head(ukb_eid_to_sample_id)

### Retrieve Raw data with multiple phenotype related columns

In [None]:
# Create a 'virtual dataframe' backed by a BigQuery table.
# Table: project 'uk-biobank-sek-data', dataset 'raw_phenotypes', table
# 'lipids_pheno_raw'; queries are billed to BILLING_PROJECT_ID.
natarajan_pheno_raw_tbl <- dplyr::tbl(
    bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                            dataset = 'raw_phenotypes',
                            billing = BILLING_PROJECT_ID),
    'lipids_pheno_raw')
# NOTE(review): for a lazily evaluated remote table the row count reported by
# dim() may be NA — confirm.
dim(natarajan_pheno_raw_tbl)
head(colnames(natarajan_pheno_raw_tbl))

### Retrieve Lipid data with adjustment and medication related columns

In [None]:
# Create a 'virtual dataframe' backed by a BigQuery table.
# Table: project 'uk-biobank-sek-data', dataset 'phenotypes', table 'lipids'.
natarajan_lipids_tbl <- dplyr::tbl(
    bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                            dataset = 'phenotypes',
                            billing = BILLING_PROJECT_ID),
    'lipids')
dim(natarajan_lipids_tbl)
head(colnames(natarajan_lipids_tbl))

In [None]:
# Full list of column names in the lipids table.
colnames(natarajan_lipids_tbl)

### Curate only needed columns

In [None]:
# Curate Raw table with required columns
# Display the column names at these positions.
# NOTE(review): presumably these are the positions of the columns selected by
# name below — confirm against the table schema.
colnames(natarajan_pheno_raw_tbl)[c(1,3,4,43,64)]
# Select the columns by name and download the result into a local data frame.
pheno_Raw <- natarajan_pheno_raw_tbl %>%
    select(id, Sex_numeric, age, genotyping_array, in_white_British_ancestry_subset) %>%
    collect()

dim(pheno_Raw)
head(as.data.frame(pheno_Raw))

In [None]:
# Curate Lipids table with required columns

# Display the column names at these positions.
# NOTE(review): presumably these are the positions of the columns selected by
# name below — confirm against the table schema.
colnames(natarajan_lipids_tbl)[c(1,2,3,4,5,10,37,38,39,40,41)]
# Select the columns by name and download the result into a local data frame.
pheno_Lipid <- natarajan_lipids_tbl %>%
    select(eid, ldl, hdl, trig, chol, statin0, ldladj, choladj, trigadj, hdladj, TG_LOG) %>%
    collect()

dim(pheno_Lipid)
head(as.data.frame(pheno_Lipid))

In [None]:
# Counts of each statin0 value.
table(pheno_Lipid$statin0)

In [None]:
# Reorder the raw phenotype rows so that row i corresponds to row i of the
# lipids table (matched on eid / id), then add the squared age.
Raw500K <- pheno_Raw[match(pheno_Lipid$eid, pheno_Raw$id), ]
Raw500K[["age2"]] <- Raw500K[["age"]]^2
dim(Raw500K)
head(Raw500K)

In [None]:
# Combine the Raw and Lipids tables column-wise.
Lipids500K <- cbind(pheno_Lipid, Raw500K) %>%
    as.data.frame()
dim(Lipids500K)
head(Lipids500K)

summary(Lipids500K[["hdl"]])
summary(Lipids500K[["trigadj"]])
summary(Lipids500K[["TG_LOG"]])


# Drop every row that has a missing value in any column.
Lipids500K_NAomitted <- Lipids500K %>%
    na.omit()
dim(Lipids500K_NAomitted)

### Match it to the linker file

In [None]:
# Look up, for every remaining participant, the id stored in the second column
# of the linker file (rows matched on eid_7089).
# NOTE(review): column 2 is presumably eid_31063 — confirm against the file.
MappedIds <- ukb_eid_to_sample_id[match(Lipids500K_NAomitted$eid, ukb_eid_to_sample_id$eid_7089), 2]
head(MappedIds)
dim(MappedIds)

# Attach the mapped id under the name "eid_WES". Assigning by name (instead of
# appending a column and renaming position 18) keeps this correct when the
# upstream column set changes and makes the cell safe to re-run.
Lipids500K_NAomitted$eid_WES <- MappedIds[[1]]
Lipids500K_NAomitted <- as.data.frame(Lipids500K_NAomitted)
dim(Lipids500K_NAomitted)

head(Lipids500K_NAomitted)

### Write the UKB NAomitted file for future use

In [None]:
# Save the NA-omitted UKB table as a CSV and copy it to the workspace bucket.
writeFile(Lipids500K_NAomitted, "UKB_NAomitted_Data.csv")

# Iteration 1 Data

In [None]:
# Below steps are carried out for generation of Iteration 1 - ***NOT USED***
# For Iteration2 - check AOU_UKB_phenotype_refined notebook

# Read all the files
# Each per-lipid CSV is streamed from the data folder of the workspace bucket
# and replaces the in-memory object of the same name.
LDL <- readr::read_csv(pipe(str_glue('gsutil cat {WORKSPACE_BUCKET}/data/LDL_ForAdjNor.20210714.csv')))
HDL <- readr::read_csv(pipe(str_glue('gsutil cat {WORKSPACE_BUCKET}/data/HDL_ForAdjNor.20210714.csv')))
TC <- readr::read_csv(pipe(str_glue('gsutil cat {WORKSPACE_BUCKET}/data/TC_ForAdjNor.20210714.csv')))
TG <- readr::read_csv(pipe(str_glue('gsutil cat {WORKSPACE_BUCKET}/data/TG_ForAdjNor.20210714.csv')))

### Retrieve the Common Samples with no NA values - Dataset1 - Iteration required to get more samples

In [None]:
# Common Samples
# person_ids that appear in all four lipid tables.
CommSamples <- Reduce(intersect, list(LDL$person_id,HDL$person_id,TC$person_id,TG$person_id))
length(CommSamples)

In [None]:
# Row index of each common participant within each lipid table (first match).
LDLmat <- match(CommSamples, LDL$person_id)
HDLmat <- match(CommSamples, HDL$person_id)
TCmat <- match(CommSamples, TC$person_id)
TGmat <- match(CommSamples, TG$person_id)

# Covariates plus title/value are taken from the LDL table; only title/value
# are taken from the other three tables.
# NOTE(review): the column positions are looked up in LDL only and then reused
# for HDL, TC and TG, which assumes all four tables share the same column
# order — confirm.
Colnames <- c("person_id", "gender", "age", "age2", "statin_use", "title", "value_as_number")
Colnames1 <- c("title", "value_as_number")
ColUsed <- match(Colnames, colnames(LDL))
ColUsed1 <- match(Colnames1, colnames(LDL))

CombinedMat <- cbind(LDL[LDLmat,ColUsed], HDL[HDLmat,ColUsed1], TC[TCmat,ColUsed1], TG[TGmat,ColUsed1])

dim(CombinedMat)
# Columns 7, 9, 11 and 13 are the four value_as_number columns, in the order
# LDL, HDL, TC, TG fixed by the cbind above; rename them after their lipid.
colnames(CombinedMat)[c(7,9,11,13)] <- c("LDL", "HDL", "TC", "TG")
CombinedMat[1:2, ]

In [None]:
# LDL adjustment based on TG/LDL values
# If TG > 400, then LDL = NA
# If LDL < 10, then LDL = NA
CombinedMat$LDL <- ifelse(CombinedMat$TG > 400, NA, CombinedMat$LDL)
CombinedMat$LDL <- ifelse(CombinedMat$LDL < 10, NA, CombinedMat$LDL)

# A missing statin_use value means the measurement did not fall inside any
# statin exposure window, i.e. the participant was not on a statin at that
# time. Recode it as FALSE: otherwise the adjusted values below become NA for
# every such participant and they are all dropped by the later na.omit().
# `%in%` never returns NA and matches both a logical TRUE and the text "TRUE".
CombinedMat$statin_use <- CombinedMat$statin_use %in% "TRUE"

# If STATIN is used, LDL_ADJ = LDL/0.7
CombinedMat$LDLadjusted <- ifelse(CombinedMat$statin_use, CombinedMat$LDL / 0.7, CombinedMat$LDL)
# If STATIN is used, TOTAL_ADJ = TC/0.8
CombinedMat$TCadjusted <- ifelse(CombinedMat$statin_use, CombinedMat$TC / 0.8, CombinedMat$TC)
# TG adjustment: natural log of the triglyceride value.
CombinedMat$TGadjusted <- log(CombinedMat$TG)

In [None]:
# Removing NA samples --- Iteration1
head(CombinedMat)
dim(CombinedMat)

# Keep only the rows without any missing value, then report the size of the
# resulting table.
CombinedMat_NAomitted <- CombinedMat %>%
    na.omit()
dim(CombinedMat_NAomitted)


In [None]:
# Creating separate phenofiles to resolve the issue of less sample numbers because of NAs --- Iteration2
# Every phenotype table shares the same covariate columns and adds its own
# raw (and, where available, adjusted) value columns.
Colnames_LDL <- c("person_id", "gender", "age", "age2", "statin_use", "LDL", "LDLadjusted")
Colnames_HDL <- c("person_id", "gender", "age", "age2", "statin_use", "HDL")
Colnames_TC <- c("person_id", "gender", "age", "age2", "statin_use", "TC", "TCadjusted")
Colnames_TG <- c("person_id", "gender", "age", "age2", "statin_use", "TG", "TGadjusted")

# Select the columns of each phenotype table by name.
CombinedMat_LDL <- CombinedMat[, Colnames_LDL]
CombinedMat_HDL <- CombinedMat[, Colnames_HDL]
CombinedMat_TC <- CombinedMat[, Colnames_TC]
CombinedMat_TG <- CombinedMat[, Colnames_TG]

# Most of the NA columns are present for LDL/TC phenotype
# Total row count, followed by the number of non-missing values per phenotype.
length(CombinedMat_LDL[["LDLadjusted"]])
length(na.omit(CombinedMat_LDL[["LDLadjusted"]]))
length(na.omit(CombinedMat_TC[["TCadjusted"]]))
length(na.omit(CombinedMat_TG[["TGadjusted"]]))
length(na.omit(CombinedMat_HDL[["HDL"]]))


head(CombinedMat_LDL)
head(CombinedMat_TG)

In [None]:
# Save the NA-omitted AoU table as a CSV and copy it to the workspace bucket.
writeFile(CombinedMat_NAomitted, "AOU_NAomitted_Data.csv")

# PCS - UKB & AOU

In [None]:
# Stream the merged principal-component scores (tab-separated) from the bucket.
# NOTE(review): the bucket is hard-coded here rather than taken from
# WORKSPACE_BUCKET as elsewhere in this notebook — confirm this is intentional.
raw_pcs <- readr::read_tsv(
    pipe(str_glue('gsutil cat gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210714/pcs.tsv'))
)
# Copy the sample id column `s` into a `person_id` column.
raw_pcs$person_id <- raw_pcs$s
head(raw_pcs)

In [None]:
# Split the bracketed, comma-separated `scores` text into ten PC columns.
pcs <- raw_pcs %>%
    # Keep the first run of characters that contains no square brackets.
    extract(col = scores,
            into = c('pcs'),
            regex = '([^\\[\\]]+)') %>%
    # convert = TRUE turns the split text into numeric columns; without it
    # pc1..pc10 stay character and cannot be used as numeric covariates until
    # they have been written to disk and read back.
    separate(col = pcs,
             into = c('pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10'),
             sep = ',',
             convert = TRUE)
head(pcs)

In [None]:
dim(pcs)
head(pcs)

In [None]:
# No.of AOU samples in the PC matrix
aou_pcs <- pcs %>% filter(cohort == 'aou')
# Overlap of the AoU PC sample ids with: the genomic-data participants, the
# combined lipid table, and the NA-omitted combined lipid table.
length(intersect(aou_pcs$s, participants_with_genomic_data$person_id))
length(intersect(aou_pcs$s, CombinedMat$person_id))
length(intersect(aou_pcs$s, CombinedMat_NAomitted$person_id))

In [None]:
# No.of UKB samples in the PC matrix
ukb_pcs <- pcs %>% filter(cohort == 'ukb')
# Rename the first column to "eid_WES".
# NOTE(review): this assumes the first column is the sample id `s` — confirm.
colnames(ukb_pcs)[1] <- "eid_WES"
head(ukb_pcs)
# The ukb_eid_to_sample_id dataframe is currently filled in with sample ids from the array data, not the WES data.
# Overlap of the UKB PC sample ids with each candidate id column / table.
length(intersect(ukb_pcs$person_id, ukb_eid_to_sample_id$eid_7089))
length(intersect(ukb_pcs$person_id, ukb_eid_to_sample_id$eid_31063))
length(intersect(ukb_pcs$person_id, pheno_Raw$id))
length(intersect(ukb_pcs$person_id, Lipids500K$eid))
length(intersect(ukb_pcs$person_id, Lipids500K_NAomitted$eid))

# Merge AOU and UKB data to Common PC matrix

## UKB/AOU PC merge

In [None]:
# Keep the PC rows of the samples present in the NA-omitted phenotype tables.
# match() gives the PC row for each phenotype sample; na.omit() drops the
# phenotype samples that have no PC row.
UKBpcs <- ukb_pcs[na.omit(match(Lipids500K_NAomitted$eid_WES, ukb_pcs$eid_WES)), ]
AOUpcs <- aou_pcs[na.omit(match(CombinedMat_NAomitted$person_id, aou_pcs$person_id)), ]


dim(UKBpcs)
dim(AOUpcs)

In [None]:
# Preview the PC tables and the phenotype tables they will be merged with.
head(UKBpcs)
head(AOUpcs)

head(Lipids500K_NAomitted)
head(CombinedMat_NAomitted)

### Combine AOU/UKB data with PCs

In [None]:
# Join phenotypes and PCs on the sample id. merge() keeps only the ids present
# in both tables by default.
UKBdata <- merge(Lipids500K_NAomitted, UKBpcs, by="eid_WES")
AOUdata <- merge(CombinedMat_NAomitted,AOUpcs, by="person_id")

In [None]:
# Inspect the merged tables.
dim(UKBdata)
dim(AOUdata)

colnames(UKBdata)
colnames(AOUdata)

head(UKBdata)
head(AOUdata)

### Write AOU/UKB data with PCs

In [None]:
# Save both merged tables as CSV files and copy them to the workspace bucket.
writeFile(UKBdata, "UKBdata.csv")
writeFile(AOUdata, "AOUdata.csv")

# Normalization

In [None]:
# Reload the merged phenotype + PC tables from the workspace bucket.
UKBdata <-
    str_glue('gsutil cat {WORKSPACE_BUCKET}/data/UKBdata.csv') %>%
    pipe() %>%
    readr::read_csv()
AOUdata <-
    str_glue('gsutil cat {WORKSPACE_BUCKET}/data/AOUdata.csv') %>%
    pipe() %>%
    readr::read_csv()

dim(UKBdata)
dim(AOUdata)

## UKB

In [None]:
# Normalized Phenotypes

# Residuals of `outcome` regressed on `covariates` within `data`.
# na.exclude pads the rows that lm() had to drop with NA, so the result always
# has one entry per row of `data` and can be stored as a column.
covariate_residuals <- function(data, outcome, covariates) {
    model <- lm(reformulate(covariates, response = outcome),
                data = data, na.action = na.exclude)
    resid(model)
}

# Rank-based inverse-normal transform of `residuals`, rescaled to the standard
# deviation of the `raw` phenotype. Missing residuals stay missing and are not
# counted in the rank denominator. The result is a plain numeric vector
# (scale() returns a one-column matrix).
inverse_normal <- function(residuals, raw) {
    n_observed <- sum(!is.na(residuals))
    z_scores <- qnorm((rank(residuals, na.last = "keep") - 0.5) / n_observed)
    sd(raw, na.rm = TRUE) * as.numeric(scale(z_scores))
}

ukb_covariates <- c("Sex_numeric", "age", "age2", paste0("pc", 1:10))

UKBdata$ldladj.resid <- covariate_residuals(UKBdata, "ldladj", ukb_covariates)
UKBdata$hdladj.resid <- covariate_residuals(UKBdata, "hdladj", ukb_covariates)
UKBdata$choladj.resid <- covariate_residuals(UKBdata, "choladj", ukb_covariates)
# Triglycerides are modelled on the log scale (TG_LOG).
UKBdata$trigadj.resid <- covariate_residuals(UKBdata, "TG_LOG", ukb_covariates)

UKBdata$ldladj.norm <- inverse_normal(UKBdata$ldladj.resid, UKBdata$ldladj)
UKBdata$hdladj.norm <- inverse_normal(UKBdata$hdladj.resid, UKBdata$hdladj)
UKBdata$choladj.norm <- inverse_normal(UKBdata$choladj.resid, UKBdata$choladj)
UKBdata$trigadj.norm <- inverse_normal(UKBdata$trigadj.resid, UKBdata$TG_LOG)

UKBdata$CohortName <- rep("UKB", nrow(UKBdata))

In [None]:
dim(UKBdata)
head(UKBdata)
colnames(UKBdata)

## AOU

In [None]:
# Normalized Phenotypes
# Same procedure as for UKB, using the AoU column names.
aou_covariates <- c("gender", "age", "age2", paste0("pc", 1:10))

AOUdata$ldladj.resid <- covariate_residuals(AOUdata, "LDLadjusted", aou_covariates)
AOUdata$hdladj.resid <- covariate_residuals(AOUdata, "HDL", aou_covariates)
AOUdata$choladj.resid <- covariate_residuals(AOUdata, "TCadjusted", aou_covariates)
AOUdata$trigadj.resid <- covariate_residuals(AOUdata, "TGadjusted", aou_covariates)

AOUdata$ldladj.norm <- inverse_normal(AOUdata$ldladj.resid, AOUdata$LDLadjusted)
AOUdata$hdladj.norm <- inverse_normal(AOUdata$hdladj.resid, AOUdata$HDL)
AOUdata$choladj.norm <- inverse_normal(AOUdata$choladj.resid, AOUdata$TCadjusted)
AOUdata$trigadj.norm <- inverse_normal(AOUdata$trigadj.resid, AOUdata$TGadjusted)

AOUdata$CohortName <- rep("AOU", nrow(AOUdata))

In [None]:
dim(AOUdata)
head(AOUdata)
colnames(AOUdata)

## Merge UKB & AOU together with required covariates

In [None]:
# AoU columns to export: id, raw lipids, normalized lipids, covariates, cohort.
AOUcolsrequired <- c("person_id", "LDL", "HDL", "TC", "TG",
                     "ldladj.norm", "hdladj.norm", "choladj.norm", "trigadj.norm",
                     "gender", "age", "age2", paste0("pc", 1:10), "CohortName")
AOU_Data_Iteration1_ForGWAS <- AOUdata[, AOUcolsrequired]
dim(AOU_Data_Iteration1_ForGWAS)
# Rename to the column names shared by both cohorts.
colnames(AOU_Data_Iteration1_ForGWAS) <- c("id",
                                           paste0(c("LDL", "HDL", "TC", "TG"), "_raw"),
                                           paste0(c("LDL", "HDL", "TC", "TG"), "_norm"),
                                           "gender", "age", "age2",
                                           paste0("pc", 1:10), "CohortName")


In [None]:
# UKB columns to export, listed in the same order as the AoU columns.
UKBcolsrequired <- c("eid", "ldl", "hdl", "chol", "trig",
                     "ldladj.norm", "hdladj.norm", "choladj.norm", "trigadj.norm",
                     "Sex_numeric", "age", "age2", paste0("pc", 1:10), "CohortName")
UKB_Data_Iteration1_ForGWAS <- UKBdata[, UKBcolsrequired]
dim(UKB_Data_Iteration1_ForGWAS)
# Rename to the column names shared by both cohorts.
colnames(UKB_Data_Iteration1_ForGWAS) <- c("id",
                                           paste0(c("LDL", "HDL", "TC", "TG"), "_raw"),
                                           paste0(c("LDL", "HDL", "TC", "TG"), "_norm"),
                                           "gender", "age", "age2",
                                           paste0("pc", 1:10), "CohortName")


In [None]:
# Stack the two cohorts: AoU rows first, then UKB rows.
FULL_Data_Iteration1_ForGWAS <- rbind(AOU_Data_Iteration1_ForGWAS, UKB_Data_Iteration1_ForGWAS)
dim(FULL_Data_Iteration1_ForGWAS)

In [None]:
# Rows per cohort and the distribution of two normalized phenotypes.
table(FULL_Data_Iteration1_ForGWAS$CohortName)
summary(FULL_Data_Iteration1_ForGWAS$HDL_norm)
summary(FULL_Data_Iteration1_ForGWAS$LDL_norm)


In [None]:
# Save the combined table as a CSV and copy it to the workspace bucket.
writeFile(FULL_Data_Iteration1_ForGWAS, "FULL_Data_Iteration1_ForGWAS.csv")

In [None]:
# Read the combined table back from the bucket.
FULL_Data_Iteration1_ForGWAS <- readr::read_csv(pipe(str_glue('gsutil cat {WORKSPACE_BUCKET}/data/FULL_Data_Iteration1_ForGWAS.csv')))


In [None]:
head(FULL_Data_Iteration1_ForGWAS)

# Compare the normalized and raw LDL distributions.
summary(FULL_Data_Iteration1_ForGWAS$LDL_norm)
summary(FULL_Data_Iteration1_ForGWAS$LDL_raw)