# Initial Setup

In [None]:
# Install any CRAN packages that are not yet available.
# The check is done against the installed package *names* (the row names of
# installed.packages()). Testing `pkg %in% installed.packages()` compares
# against every cell of the metadata matrix (Depends, Imports, ...), so a
# package could be reported as installed when it only appears in another
# package's metadata. invisible() keeps the list of NULLs out of the output.
invisible(lapply(
  c("viridis", "ggthemes", "skimr", "fuzzyjoin", "pryr"),
  function(pkg_name) {
    if (!pkg_name %in% rownames(installed.packages())) {
      install.packages(pkg_name)
    }
  }
))

# Bioconductor packages are installed through BiocManager, which itself has to
# be present before BiocManager::install() can be called.
invisible(lapply(
  c("IRanges"),
  function(pkg_name) {
    if (!pkg_name %in% rownames(installed.packages())) {
      if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager")
      }
      BiocManager::install(pkg_name)
    }
  }
))
library(plyr)
library(viridis)    # A nice color scheme for plots.
library(ggthemes)   # Common themes to change the look and feel of plots.
library(scales)     # Graphical scales map data to aesthetics in plots.
library(skimr)      # Better summaries of data.
library(lubridate)  # Date library from the tidyverse.
library(bigrquery)  # BigQuery R client.
library(tidyverse)  # Data wrangling packages.
library(fuzzyjoin)
library(lubridate)
library(pryr)       # For memory profiling.

In [None]:
## Plot setup.
# Black-and-white ggplot theme with a 16pt base font as the default for plots.
theme_set(theme_bw(base_size = 16))

## BigQuery setup.
# All three values are read from environment variables of the workspace.
BILLING_PROJECT_ID <- Sys.getenv("GOOGLE_PROJECT")
# The BigQuery curated dataset for the current workspace context.
CDR <- Sys.getenv("WORKSPACE_CDR")
# Bucket used below to read and write the CSV files under /data/.
WORKSPACE_BUCKET <- Sys.getenv("WORKSPACE_BUCKET")

#' Build the annotation row used to label a ggplot boxplot with its sample size.
#'
#' @param df The values to summarise. NOTE(review): earlier documentation
#'   described this as a data frame, but `max()` and `length()` are applied to
#'   it directly, so it is presumably the numeric vector handed over by
#'   `ggplot2::stat_summary(fun.data = ...)` -- confirm against the callers.
#' @return A data frame with column `y` holding `max(df)` and column `label`
#'   holding the string "N = <length(df)>".
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0("N = ", length(df)))
}

# Functions

In [None]:
#' Save a data frame as a CSV file and copy it to the workspace bucket.
#'
#' Writes `FileToSave` to `FileName` in the current working directory with
#' `readr::write_excel_csv()`, copies that file to `<WORKSPACE_BUCKET>/data/`
#' with `gsutil cp`, and finally lists the CSV files in that bucket folder.
#'
#' @param FileToSave Data frame to write.
#' @param FileName Name of the CSV file to create, relative to the working
#'   directory.
#' @return Character vector with the output lines of `gsutil ls` for
#'   `<WORKSPACE_BUCKET>/data/*.csv`.
writeFile <- function(FileToSave, FileName) {
  # Store the data frame in the current workspace first.
  readr::write_excel_csv(FileToSave, FileName)

  # Get the bucket name. With an empty value the copy below would target the
  # local path "/data/" instead of a bucket, so fail loudly instead.
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  if (!nzchar(my_bucket)) {
    stop("Environment variable WORKSPACE_BUCKET is not set.", call. = FALSE)
  }
  bucket_data_dir <- paste0(my_bucket, "/data/")

  # Copy the file from the current workspace to the bucket. shQuote() keeps
  # file names or bucket paths containing spaces or shell metacharacters from
  # being split or interpreted by the shell.
  system(
    paste("gsutil cp", shQuote(paste0("./", FileName)), shQuote(bucket_data_dir)),
    intern = TRUE
  )

  # List the CSV files in the bucket so the caller can check the upload; the
  # wildcard is expanded by gsutil itself, so quoting it is safe.
  system(
    paste("gsutil ls", shQuote(paste0(bucket_data_dir, "*.csv"))),
    intern = TRUE
  )
}

# AOU Data

## Read AOU files

In [None]:
# Read the four AOU lipid tables from the workspace bucket. These files were
# written by the "AOU_UKB_phenotypes" notebook. Each file is streamed through
# `gsutil cat` and parsed as CSV.

LDL <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/LDL_ForAdjNor.20210714.csv") %>%
  pipe() %>%
  readr::read_csv()

HDL <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/HDL_ForAdjNor.20210714.csv") %>%
  pipe() %>%
  readr::read_csv()

TC <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/TC_ForAdjNor.20210714.csv") %>%
  pipe() %>%
  readr::read_csv()

TG <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/TG_ForAdjNor.20210714.csv") %>%
  pipe() %>%
  readr::read_csv()


In [None]:
# Quick look at the LDL table: dimensions, column names, first rows,
# and the counts of each statin_use value.
dim(LDL)
colnames(LDL)
head(LDL)
table(LDL$statin_use)

## Lipid Adjustment 

In [None]:
# LDL filtering based on TG/LDL values:
#   If TG > 400, then LDL = NA
#   If LDL < 10, then LDL = NA
# LDL and TC adjustment for statin (lipid-lowering medication) use:
#   If a statin is used, LDL_ADJ = LDL / 0.7
#   If a statin is used, TOTAL_ADJ = TC / 0.8
# TG adjustment:
#   TG_LOG = log(TG)

In [None]:
# Keep the same seven columns from every lipid table: the sample id, the
# covariates, the statin flag, the measurement title and the measured value.
LDL_data <- select(LDL, person_id, gender, age, age2, statin_use, title, value_as_number)
HDL_data <- select(HDL, person_id, gender, age, age2, statin_use, title, value_as_number)
TC_data <- select(TC, person_id, gender, age, age2, statin_use, title, value_as_number)
TG_data <- select(TG, person_id, gender, age, age2, statin_use, title, value_as_number)

dim(LDL_data)
dim(HDL_data)
dim(TC_data)
dim(TG_data)

In [None]:
# Number of person_ids shared by each pair of lipid tables.
length(intersect(LDL_data$person_id, HDL_data$person_id))
length(intersect(LDL_data$person_id, TC_data$person_id))
length(intersect(LDL_data$person_id, TG_data$person_id))
length(intersect(HDL_data$person_id, TC_data$person_id))
length(intersect(HDL_data$person_id, TG_data$person_id))
length(intersect(TC_data$person_id, TG_data$person_id))

# Number of distinct person_ids across all four tables.
length(unique(c(LDL_data$person_id, HDL_data$person_id, TC_data$person_id, TG_data$person_id)))

In [None]:
# Left-join the four lipid tables on person_id with plyr::join_all().
# TG_data goes first because it holds most of the samples; with a left join
# only person_ids present in TG_data are kept in the result.
FullData <- plyr::join_all(
  dfs = list(TG_data, HDL_data, TC_data, LDL_data),
  by = "person_id",
  type = "left"
)
dim(FullData)
head(FullData)
tail(FullData)

In [None]:
# Keep the first five columns of the joined table plus every column whose name
# matches "value_as_number" (one per lipid table), then name the measurement
# columns after the lipids in the order the tables were joined.
set1 <- "value_as_number"
cols <- grep(set1, names(FullData))
AOUdata <- FullData[, c(1:5, cols)]
names(AOUdata)[6:9] <- c("TG", "HDL", "TC", "LDL")
dim(AOUdata)

# Treat a missing statin_use as "FALSE" so that the statin-based adjustment
# below never has to compare against NA.
AOUdata$statin_use[is.na(AOUdata$statin_use)] <- "FALSE"
head(AOUdata)

In [None]:
# LDL filtering based on TG/LDL values.
# ifelse() yields NA wherever its condition is NA, so a missing TG or LDL also
# results in a missing LDL.
#   TG > 400  ->  LDL = NA
AOUdata$LDL <- with(AOUdata, ifelse(TG > 400, NA, LDL))
#   LDL < 10  ->  LDL = NA
AOUdata$LDL <- with(AOUdata, ifelse(LDL < 10, NA, LDL))

# Statin adjustment: LDL is divided by 0.7 and TC by 0.8 for statin users.
AOUdata$LDLadjusted <- with(AOUdata, ifelse(statin_use == "TRUE", LDL / 0.7, LDL))
AOUdata$TCadjusted <- with(AOUdata, ifelse(statin_use == "TRUE", TC / 0.8, TC))

# TG adjustment: natural log of TG.
AOUdata$TGadjusted <- log(AOUdata[["TG"]])

In [None]:
# Size and first rows of the adjusted AOU table.
dim(AOUdata)
head(AOUdata)
dim(na.omit(AOUdata)) # Just checking to understand how many samples are lost

In [None]:
# Transliterate any non-ASCII characters in the sample ids to ASCII
# ("latin-ascii" transform) and rename the second column (gender) to "sex".
AOUdata[["person_id"]] <- stringi::stri_trans_general(
  AOUdata[["person_id"]],
  id = "latin-ascii"
)
names(AOUdata)[2L] <- "sex"
dim(AOUdata)
head(AOUdata)

## Write Iteration 2 data - with more samples 

In [None]:
# Save the adjusted AOU table locally and copy it to <WORKSPACE_BUCKET>/data/.
writeFile(AOUdata, "AOUdata_iteration2.csv")

# UKB data

## Read UKB files

In [None]:
# Read the adjusted UKB table from the workspace bucket. This file was written
# by the "AOU_UKB_phenotypes" notebook.

UKBdata <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/UKB_NAomitted_Data.csv") %>%
  pipe() %>%
  readr::read_csv()
dim(UKBdata)

In [None]:
# First rows of the UKB table.
head(UKBdata)

# Read PCs - UKB & AOU

In [None]:
# Raw PCs: one row per sample, read as TSV straight from the bucket.
# The path contains no placeholders, so it is passed to pipe() as a plain
# string.
raw_pcs <- readr::read_tsv(
  pipe("gsutil cat gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210714/pcs.tsv")
)
# Keep the sample id (column `s`) under the name used by the lipid tables.
raw_pcs$person_id <- raw_pcs$s

# Create dataframe: split the `scores` column into one column per PC.
#  * extract() keeps the first run of characters that are not square brackets
#    (presumably the comma-separated list inside "[...]" -- verify on the data).
#  * separate() splits that list on commas into pc1..pc10. convert = TRUE turns
#    the pieces into numeric columns right away instead of leaving them as
#    character strings that only become numbers after a CSV round trip.
pcs <- raw_pcs %>%
  extract(
    col = scores,
    into = c("pcs"),
    regex = "([^\\[\\]]+)"
  ) %>%
  separate(
    col = pcs,
    into = c("pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10"),
    sep = ",",
    convert = TRUE
  )
dim(pcs)
head(pcs)



## Subset PCs to AOU and UKB

In [None]:
# Split the PC table by cohort label and report how many samples each has.
aou_pcs <- filter(pcs, cohort == "aou")
ukb_pcs <- filter(pcs, cohort == "ukb")

dim(aou_pcs)
dim(ukb_pcs)

## Reorder PC matrix to Data matrix sample Ids

In [None]:
# AOU PC data
# Reorder the AOU PC rows to follow the row order of AOUdata. match() returns
# NA for a person_id with no entry in aou_pcs$s, which yields an all-NA row, so
# the result always has one row per row of AOUdata.
AOU_PCmatrix <- aou_pcs[match(AOUdata$person_id, aou_pcs$s), ]
dim(AOU_PCmatrix)

head(AOU_PCmatrix)
# Rows remaining after dropping any row that contains an NA.
dim(na.omit(AOU_PCmatrix))

In [None]:
# UKB PC data
# Reorder the UKB PC rows to follow the row order of UKBdata, matching on
# eid_WES. Samples without an entry in ukb_pcs$s become all-NA rows.
UKB_PCmatrix <- ukb_pcs[match(UKBdata$eid_WES, ukb_pcs$s), ]
dim(UKB_PCmatrix)

head(UKB_PCmatrix)
# Rows remaining after dropping any row that contains an NA.
dim(na.omit(UKB_PCmatrix))

# Merge Lipids data from UKB/AOU to respective PC matrix

In [None]:
# Full Merge
# cbind() pairs rows purely by position; this relies on the PC matrices having
# been reordered with match() to the row order of the lipid tables.
# NOTE(review): AOUdata and AOU_PCmatrix both carry a `person_id` column, so
# AOU_Full_Data ends up with that name twice. Later cells select
# `person_id_1`, presumably the name readr assigns to the second copy when the
# CSV is read back -- verify against the installed readr version.

UKB_Full_Data <- cbind(UKBdata, UKB_PCmatrix)
AOU_Full_Data <- cbind(AOUdata, AOU_PCmatrix)

dim(UKB_Full_Data)
dim(AOU_Full_Data)

In [None]:
# Column names of the merged tables.
colnames(UKB_Full_Data)
colnames(AOU_Full_Data)

In [None]:
# Rename column 11 of the UKB table to "TG_adjusted_log".
# NOTE(review): the rename is position-based; confirm that column 11 is the
# log-transformed TG column in the colnames() output.
colnames(UKB_Full_Data)[11] <- "TG_adjusted_log"

In [None]:
# Save both merged tables locally and copy them to <WORKSPACE_BUCKET>/data/.
writeFile(UKB_Full_Data, "UKB_Full_Data.csv")
writeFile(AOU_Full_Data, "AOU_Full_Data.csv")

# Normalization

## Read the Full_Data file for UKB and AOU

In [None]:
# Read the merged lipid + PC tables back from the workspace bucket.
UKBdata_ForNorm <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/UKB_Full_Data.csv") %>%
  pipe() %>%
  readr::read_csv()

AOUdata_ForNorm <- str_glue("gsutil cat {WORKSPACE_BUCKET}/data/AOU_Full_Data.csv") %>%
  pipe() %>%
  readr::read_csv()

dim(UKBdata_ForNorm)
dim(AOUdata_ForNorm)

## Normalization steps

### UKB

In [None]:
# Column names available for the UKB normalization cells.
colnames(UKBdata_ForNorm)

In [None]:
# Normalized phenotypes --- each lipid is processed separately so that rows
# with an NA are dropped per lipid rather than across all lipids.

# LDL
# 1. Keep ids, covariates, the 10 PCs, statin flag, raw and adjusted LDL;
#    drop every row with a missing value.
UKB_LDLmat <- UKBdata_ForNorm %>%
  select(
    eid, eid_WES, Sex_numeric, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin0, ldl, ldladj
  ) %>%
  na.omit()

# 2. Residuals of adjusted LDL after regressing on the covariates and PCs.
UKB_LDLmat$ldladj.resid <- residuals(
  lm(
    ldladj ~ Sex_numeric + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = UKB_LDLmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of adjusted LDL.
UKB_LDLmat$ldladj.norm <- with(
  UKB_LDLmat,
  sd(ldladj) *
    scale(qnorm((rank(ldladj.resid, na.last = "keep") - 0.5) / length(ldladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the AOU table.
UKB_LDLmat$CohortName <- rep("UKB", times = nrow(UKB_LDLmat))
names(UKB_LDLmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "LDLraw", "LDLadj", "LDLresid", "LDLnorm", "cohort"
)

head(UKB_LDLmat)
dim(UKB_LDLmat)

In [None]:
# HDL
# 1. Keep ids, covariates, the 10 PCs, statin flag, raw and adjusted HDL;
#    drop every row with a missing value.
UKB_HDLmat <- UKBdata_ForNorm %>%
  select(
    eid, eid_WES, Sex_numeric, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin0, hdl, hdladj
  ) %>%
  na.omit()

# 2. Residuals of adjusted HDL after regressing on the covariates and PCs.
UKB_HDLmat$hdladj.resid <- residuals(
  lm(
    hdladj ~ Sex_numeric + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = UKB_HDLmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of adjusted HDL.
UKB_HDLmat$hdladj.norm <- with(
  UKB_HDLmat,
  sd(hdladj) *
    scale(qnorm((rank(hdladj.resid, na.last = "keep") - 0.5) / length(hdladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the AOU table.
UKB_HDLmat$CohortName <- rep("UKB", times = nrow(UKB_HDLmat))
names(UKB_HDLmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "HDLraw", "HDLadj", "HDLresid", "HDLnorm", "cohort"
)

head(UKB_HDLmat)
dim(UKB_HDLmat)

In [None]:
# TC
# 1. Keep ids, covariates, the 10 PCs, statin flag, raw and adjusted total
#    cholesterol; drop every row with a missing value.
UKB_TCmat <- UKBdata_ForNorm %>%
  select(
    eid, eid_WES, Sex_numeric, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin0, chol, choladj
  ) %>%
  na.omit()

# 2. Residuals of adjusted TC after regressing on the covariates and PCs.
UKB_TCmat$choladj.resid <- residuals(
  lm(
    choladj ~ Sex_numeric + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = UKB_TCmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of adjusted TC.
UKB_TCmat$choladj.norm <- with(
  UKB_TCmat,
  sd(choladj) *
    scale(qnorm((rank(choladj.resid, na.last = "keep") - 0.5) / length(choladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the AOU table.
UKB_TCmat$CohortName <- rep("UKB", times = nrow(UKB_TCmat))
names(UKB_TCmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "TCraw", "TCadj", "TCresid", "TCnorm", "cohort"
)

head(UKB_TCmat)
dim(UKB_TCmat)

In [None]:
# TG
# 1. Keep ids, covariates, the 10 PCs, statin flag, raw TG and the
#    TG_adjusted_log column; drop every row with a missing value.
UKB_TGmat <- UKBdata_ForNorm %>%
  select(
    eid, eid_WES, Sex_numeric, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin0, trig, TG_adjusted_log
  ) %>%
  na.omit()

# 2. Residuals of TG_adjusted_log after regressing on the covariates and PCs.
UKB_TGmat$trigadj.resid <- residuals(
  lm(
    TG_adjusted_log ~ Sex_numeric + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = UKB_TGmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of TG_adjusted_log.
UKB_TGmat$trigadj.norm <- with(
  UKB_TGmat,
  sd(TG_adjusted_log) *
    scale(qnorm((rank(trigadj.resid, na.last = "keep") - 0.5) / length(trigadj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the AOU table.
UKB_TGmat$CohortName <- rep("UKB", times = nrow(UKB_TGmat))
names(UKB_TGmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "TGraw", "TGadj", "TGresid", "TGnorm", "cohort"
)

head(UKB_TGmat)
dim(UKB_TGmat)

### AOU

In [None]:
# Column names available for the AOU normalization cells.
colnames(AOUdata_ForNorm)

In [None]:
# Normalized phenotypes --- each lipid is processed separately so that rows
# with an NA are dropped per lipid rather than across all lipids.

# LDL
# 1. Keep both id columns, covariates, the 10 PCs, statin flag, raw and
#    adjusted LDL; drop every row with a missing value.
AOU_LDLmat <- AOUdata_ForNorm %>%
  select(
    person_id, person_id_1, sex, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin_use, LDL, LDLadjusted
  ) %>%
  na.omit()

# 2. Residuals of adjusted LDL after regressing on the covariates and PCs.
AOU_LDLmat$ldladj.resid <- residuals(
  lm(
    LDLadjusted ~ sex + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = AOU_LDLmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of adjusted LDL.
AOU_LDLmat$ldladj.norm <- with(
  AOU_LDLmat,
  sd(LDLadjusted) *
    scale(qnorm((rank(ldladj.resid, na.last = "keep") - 0.5) / length(ldladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the UKB table.
AOU_LDLmat$CohortName <- rep("AOU", times = nrow(AOU_LDLmat))
names(AOU_LDLmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "LDLraw", "LDLadj", "LDLresid", "LDLnorm", "cohort"
)

head(AOU_LDLmat)
dim(AOU_LDLmat)

In [None]:
# HDL
# 1. Keep both id columns, covariates, the 10 PCs, statin flag and HDL;
#    drop every row with a missing value.
AOU_HDLmat <- AOUdata_ForNorm %>%
  select(
    person_id, person_id_1, sex, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin_use, HDL
  ) %>%
  na.omit()
# HDL gets no adjustment; the "adjusted" column is a copy of the raw value so
# the table has the same layout as the other lipids.
AOU_HDLmat$HDLadjusted <- AOU_HDLmat$HDL

# 2. Residuals of HDL after regressing on the covariates and PCs.
AOU_HDLmat$hdladj.resid <- residuals(
  lm(
    HDLadjusted ~ sex + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = AOU_HDLmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of HDL.
AOU_HDLmat$hdladj.norm <- with(
  AOU_HDLmat,
  sd(HDLadjusted) *
    scale(qnorm((rank(hdladj.resid, na.last = "keep") - 0.5) / length(hdladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the UKB table.
AOU_HDLmat$CohortName <- rep("AOU", times = nrow(AOU_HDLmat))
names(AOU_HDLmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "HDLraw", "HDLadj", "HDLresid", "HDLnorm", "cohort"
)

head(AOU_HDLmat)
dim(AOU_HDLmat)

In [None]:
# TC
# 1. Keep both id columns, covariates, the 10 PCs, statin flag, raw and
#    adjusted total cholesterol; drop every row with a missing value.
AOU_TCmat <- AOUdata_ForNorm %>%
  select(
    person_id, person_id_1, sex, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin_use, TC, TCadjusted
  ) %>%
  na.omit()

# 2. Residuals of adjusted TC after regressing on the covariates and PCs.
AOU_TCmat$choladj.resid <- residuals(
  lm(
    TCadjusted ~ sex + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = AOU_TCmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of adjusted TC.
AOU_TCmat$choladj.norm <- with(
  AOU_TCmat,
  sd(TCadjusted) *
    scale(qnorm((rank(choladj.resid, na.last = "keep") - 0.5) / length(choladj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the UKB table.
AOU_TCmat$CohortName <- rep("AOU", times = nrow(AOU_TCmat))
names(AOU_TCmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "TCraw", "TCadj", "TCresid", "TCnorm", "cohort"
)

head(AOU_TCmat)
dim(AOU_TCmat)

In [None]:
# TG
# 1. Keep both id columns, covariates, the 10 PCs, statin flag, raw TG and
#    TGadjusted; drop every row with a missing value.
AOU_TGmat <- AOUdata_ForNorm %>%
  select(
    person_id, person_id_1, sex, age, age2,
    pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9, pc10,
    statin_use, TG, TGadjusted
  ) %>%
  na.omit()

# 2. Residuals of TGadjusted after regressing on the covariates and PCs.
AOU_TGmat$trigadj.resid <- residuals(
  lm(
    TGadjusted ~ sex + age + age2 +
      pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
    data = AOU_TGmat
  )
)

# 3. Rank-based inverse-normal transform of the residuals, standardized with
#    scale() and multiplied by the standard deviation of TGadjusted.
AOU_TGmat$trigadj.norm <- with(
  AOU_TGmat,
  sd(TGadjusted) *
    scale(qnorm((rank(trigadj.resid, na.last = "keep") - 0.5) / length(trigadj.resid)))
)

# 4. Tag the cohort and switch to the column names shared with the UKB table.
AOU_TGmat$CohortName <- rep("AOU", times = nrow(AOU_TGmat))
names(AOU_TGmat) <- c(
  "eid", "sampleid", "sex", "age", "age2",
  "pc1", "pc2", "pc3", "pc4", "pc5", "pc6", "pc7", "pc8", "pc9", "pc10",
  "statin", "TGraw", "TGadj", "TGresid", "TGnorm", "cohort"
)

head(AOU_TGmat)
dim(AOU_TGmat)

## Merge UKB and AOU data for each lipids

### LDL

In [None]:
# Stack the UKB rows on top of the AOU rows (both tables share column names).
MergedData_LDL_Iteration2_ForGWAS <- rbind(UKB_LDLmat, AOU_LDLmat)
dim(MergedData_LDL_Iteration2_ForGWAS)

# Transliterate non-ASCII characters in both id columns to ASCII.
MergedData_LDL_Iteration2_ForGWAS$eid <- stringi::stri_trans_general(
  MergedData_LDL_Iteration2_ForGWAS$eid,
  id = "latin-ascii"
)
MergedData_LDL_Iteration2_ForGWAS$sampleid <- stringi::stri_trans_general(
  MergedData_LDL_Iteration2_ForGWAS$sampleid,
  id = "latin-ascii"
)

# Harmonize the sex column: 0 becomes "Female", 1 becomes "Male", and the
# listed non-binary / skipped answer categories become NA.
MergedData_LDL_Iteration2_ForGWAS$sex <- local({
  sex <- MergedData_LDL_Iteration2_ForGWAS$sex
  sex[which(sex == 0)] <- "Female"
  sex[which(sex == 1)] <- "Male"
  unmapped <- c(
    "Gender Identity: Non Binary",
    "PMI: Skip",
    "Not man only, not woman only, prefer not to answer, or skipped"
  )
  sex[which(sex %in% unmapped)] <- NA
  sex
})

table(MergedData_LDL_Iteration2_ForGWAS$sex)


### HDL

In [None]:
# Stack the UKB rows on top of the AOU rows (both tables share column names).
MergedData_HDL_Iteration2_ForGWAS <- rbind(UKB_HDLmat, AOU_HDLmat)
dim(MergedData_HDL_Iteration2_ForGWAS)

# Transliterate non-ASCII characters in both id columns to ASCII.
MergedData_HDL_Iteration2_ForGWAS$eid <- stringi::stri_trans_general(
  MergedData_HDL_Iteration2_ForGWAS$eid,
  id = "latin-ascii"
)
MergedData_HDL_Iteration2_ForGWAS$sampleid <- stringi::stri_trans_general(
  MergedData_HDL_Iteration2_ForGWAS$sampleid,
  id = "latin-ascii"
)

# Harmonize the sex column: 0 becomes "Female", 1 becomes "Male", and the
# listed non-binary / skipped answer categories become NA.
MergedData_HDL_Iteration2_ForGWAS$sex <- local({
  sex <- MergedData_HDL_Iteration2_ForGWAS$sex
  sex[which(sex == 0)] <- "Female"
  sex[which(sex == 1)] <- "Male"
  unmapped <- c(
    "Gender Identity: Non Binary",
    "PMI: Skip",
    "Not man only, not woman only, prefer not to answer, or skipped"
  )
  sex[which(sex %in% unmapped)] <- NA
  sex
})

table(MergedData_HDL_Iteration2_ForGWAS$sex)


### TC

In [None]:
# Stack the UKB rows on top of the AOU rows (both tables share column names).
MergedData_TC_Iteration2_ForGWAS <- rbind(UKB_TCmat, AOU_TCmat)
dim(MergedData_TC_Iteration2_ForGWAS)

# Transliterate non-ASCII characters in both id columns to ASCII.
MergedData_TC_Iteration2_ForGWAS$eid <- stringi::stri_trans_general(
  MergedData_TC_Iteration2_ForGWAS$eid,
  id = "latin-ascii"
)
MergedData_TC_Iteration2_ForGWAS$sampleid <- stringi::stri_trans_general(
  MergedData_TC_Iteration2_ForGWAS$sampleid,
  id = "latin-ascii"
)

# Harmonize the sex column: 0 becomes "Female", 1 becomes "Male", and the
# listed non-binary / skipped answer categories become NA.
MergedData_TC_Iteration2_ForGWAS$sex <- local({
  sex <- MergedData_TC_Iteration2_ForGWAS$sex
  sex[which(sex == 0)] <- "Female"
  sex[which(sex == 1)] <- "Male"
  unmapped <- c(
    "Gender Identity: Non Binary",
    "PMI: Skip",
    "Not man only, not woman only, prefer not to answer, or skipped"
  )
  sex[which(sex %in% unmapped)] <- NA
  sex
})

table(MergedData_TC_Iteration2_ForGWAS$sex)


### TG

In [None]:
# Stack the UKB rows on top of the AOU rows (both tables share column names).
MergedData_TG_Iteration2_ForGWAS <- rbind(UKB_TGmat, AOU_TGmat)
dim(MergedData_TG_Iteration2_ForGWAS)

# Transliterate non-ASCII characters in both id columns to ASCII.
MergedData_TG_Iteration2_ForGWAS$eid <- stringi::stri_trans_general(
  MergedData_TG_Iteration2_ForGWAS$eid,
  id = "latin-ascii"
)
MergedData_TG_Iteration2_ForGWAS$sampleid <- stringi::stri_trans_general(
  MergedData_TG_Iteration2_ForGWAS$sampleid,
  id = "latin-ascii"
)

# Harmonize the sex column: 0 becomes "Female", 1 becomes "Male", and the
# listed non-binary / skipped answer categories become NA.
MergedData_TG_Iteration2_ForGWAS$sex <- local({
  sex <- MergedData_TG_Iteration2_ForGWAS$sex
  sex[which(sex == 0)] <- "Female"
  sex[which(sex == 1)] <- "Male"
  unmapped <- c(
    "Gender Identity: Non Binary",
    "PMI: Skip",
    "Not man only, not woman only, prefer not to answer, or skipped"
  )
  sex[which(sex %in% unmapped)] <- NA
  sex
})

table(MergedData_TG_Iteration2_ForGWAS$sex)


## Write Data for GWAS

In [None]:
# Save the four merged per-lipid tables locally and copy each one to
# <WORKSPACE_BUCKET>/data/ for the GWAS step.
writeFile(MergedData_LDL_Iteration2_ForGWAS, "MergedData_LDL_Iteration2_ForGWAS.csv")
writeFile(MergedData_HDL_Iteration2_ForGWAS, "MergedData_HDL_Iteration2_ForGWAS.csv")
writeFile(MergedData_TC_Iteration2_ForGWAS, "MergedData_TC_Iteration2_ForGWAS.csv")
writeFile(MergedData_TG_Iteration2_ForGWAS, "MergedData_TG_Iteration2_ForGWAS.csv")