# Plot METAL meta-analysis results

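In this notebook we plot and explore the METAL meta-analysis results for lipid phenotypes, and compare them against the *All of Us* siloed GWAS results, the pooled *All of Us* and UK Biobank GWAS results, and published lipid GWAS summary statistics.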

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically this is for the portion of the project that is the **siloed** analysis.

# Setup 

In [None]:
# Install any of the required packages that are not yet installed.
# installed.packages() returns a matrix whose row names are the package names;
# compare against those names only, so that a name appearing in some other
# column of the matrix (for example another package's Imports field) is not
# mistaken for an installed package.
invisible(lapply(c('hrbrthemes', 'skimr', 'qqman'), function(pkg) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
}))

In [None]:
library(grid)
library(gridExtra)
library(hrbrthemes)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Returns a one-row data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' @param df The values to summarise. `max()` and `length()` are applied to it
#'   directly, so this is presumably the numeric vector of y values that ggplot
#'   hands to a `fun.data` summary function (TODO confirm against callers).
#' @return A data frame with column `y` set to `max(df)` and column `label`
#'   set to the text 'N = ' followed by `length(df)`.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

# METAL meta-analysis output tables, one per lipid trait.
# Created via notebook aou_workbench_siloed_analyses/09_metal_meta_analysis.ipynb
HDL <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_HDL_1.tbl'
LDL <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_LDL_1.tbl'
TC <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_TC_1.tbl'
TG <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_TG_1.tbl'

# AOU-siloed aggregated regenie results, one per lipid trait.
# Created via notebook aou_workbench_siloed_analyses/08_aggregate_analysis.ipynb
AOU_HDL <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5_data_aou_regenie_20220318_aou_alpha3_lipids_regenie_step2_HDL_norm_aggregated.tsv'
AOU_LDL <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5_data_aou_regenie_20220318_aou_alpha3_lipids_regenie_step2_LDL_adjusted_norm_aggregated.tsv'
AOU_TC <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5_data_aou_regenie_20220318_aou_alpha3_lipids_regenie_step2_TC_adjusted_norm_aggregated.tsv'
AOU_TG <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5_data_aou_regenie_20220318_aou_alpha3_lipids_regenie_step2_TG_adjusted_norm_aggregated.tsv'

# Pooled AOU + UKB regenie results, one per lipid trait.
PooledHDL <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_HDL_norm.regenie'
PooledLDL <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_LDL_adjusted_norm.regenie'
PooledTC <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TC_adjusted_norm.regenie'
PooledTG <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TG_adjusted_norm.regenie'


# Text shown as the subtitle/caption on every plot in this notebook.
PLOT_SUBTITLE <- 'Source: AOU - UKB meta analysis'

# Load AOU-Siloed results

In [None]:
# Stream each AOU-siloed results file from Cloud Storage with `gsutil cat`
# and parse it as a whitespace-delimited table, one table per lipid trait.
HDLsiloed <- read_table(file = pipe(str_glue('gsutil cat {AOU_HDL}')))
LDLsiloed <- read_table(file = pipe(str_glue('gsutil cat {AOU_LDL}')))
TCsiloed <- read_table(file = pipe(str_glue('gsutil cat {AOU_TC}')))
TGsiloed <- read_table(file = pipe(str_glue('gsutil cat {AOU_TG}')))


In [None]:
# Preview the first rows of the AOU-siloed HDL table.
head(HDLsiloed)

# Load AOU-UKB Pooled results

In [None]:
# Stream each pooled AOU-UKB regenie results file from Cloud Storage with
# `gsutil cat` and parse it as a whitespace-delimited table.
HDLpooled <- read_table(file = pipe(str_glue('gsutil cat {PooledHDL}')))
LDLpooled <- read_table(file = pipe(str_glue('gsutil cat {PooledLDL}')))
TCpooled <- read_table(file = pipe(str_glue('gsutil cat {PooledTC}')))
TGpooled <- read_table(file = pipe(str_glue('gsutil cat {PooledTG}')))


In [None]:
# Rewrite the pooled variant IDs: every "_" becomes ":" and every "chr" is
# removed, so the IDs can be joined against the METAL MarkerName column later.
HDLpooled$ID <- gsub("chr", "", gsub("_", ":", HDLpooled$ID, fixed = TRUE), fixed = TRUE)
LDLpooled$ID <- gsub("chr", "", gsub("_", ":", LDLpooled$ID, fixed = TRUE), fixed = TRUE)
TCpooled$ID <- gsub("chr", "", gsub("_", ":", TCpooled$ID, fixed = TRUE), fixed = TRUE)
TGpooled$ID <- gsub("chr", "", gsub("_", ":", TGpooled$ID, fixed = TRUE), fixed = TRUE)

In [None]:
# Drop the EXTRA column from each pooled table. The column is removed by name
# per table, so the result does not depend on EXTRA sitting at the same
# position in all four tables, and a table without an EXTRA column is left
# unchanged instead of being indexed with NA.
HDLpooled <- HDLpooled[, setdiff(colnames(HDLpooled), "EXTRA"), drop = FALSE]
LDLpooled <- LDLpooled[, setdiff(colnames(LDLpooled), "EXTRA"), drop = FALSE]
TCpooled <- TCpooled[, setdiff(colnames(TCpooled), "EXTRA"), drop = FALSE]
TGpooled <- TGpooled[, setdiff(colnames(TGpooled), "EXTRA"), drop = FALSE]

In [None]:
# Preview the pooled LDL table after the ID rewrite and EXTRA column removal.
head(LDLpooled)

# Load the METAL results

In [None]:
# Stream each METAL meta-analysis table from Cloud Storage with `gsutil cat`
# and parse it as a whitespace-delimited table, one table per lipid trait.
HDLmetal <- read_table(file = pipe(str_glue('gsutil cat {HDL}')))
LDLmetal <- read_table(file = pipe(str_glue('gsutil cat {LDL}')))
TCmetal <- read_table(file = pipe(str_glue('gsutil cat {TC}')))
TGmetal <- read_table(file = pipe(str_glue('gsutil cat {TG}')))


In [None]:
# Show the dimensions and the first rows of the HDL METAL table.
dim(HDLmetal)
head(HDLmetal)

In [None]:
# Split each MarkerName on ":" and store the first field as numeric Chr and
# the second field as numeric Pos.
HDLmetal$Chr <- as.numeric(vapply(strsplit(HDLmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 1))
HDLmetal$Pos <- as.numeric(vapply(strsplit(HDLmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 2))
LDLmetal$Chr <- as.numeric(vapply(strsplit(LDLmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 1))
LDLmetal$Pos <- as.numeric(vapply(strsplit(LDLmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 2))
TCmetal$Chr <- as.numeric(vapply(strsplit(TCmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 1))
TCmetal$Pos <- as.numeric(vapply(strsplit(TCmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 2))
TGmetal$Chr <- as.numeric(vapply(strsplit(TGmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 1))
TGmetal$Pos <- as.numeric(vapply(strsplit(TGmetal$MarkerName, ":", fixed = TRUE), `[`, character(1), 2))

In [None]:
# Build Name as "Chr:Pos:ALLELE1:ALLELE2" with the METAL alleles upper-cased.
# Pos is a double, and pasting a double directly renders some values in
# scientific notation (for example 100000 becomes "1e+05"), which would make
# Name differ from MarkerName for those variants. Format Pos in fixed notation
# so the position is always written out in full.
HDLmetal$Name <- paste0(HDLmetal$Chr, ":", format(HDLmetal$Pos, scientific = FALSE, trim = TRUE), ":", toupper(HDLmetal$Allele1), ":", toupper(HDLmetal$Allele2))
LDLmetal$Name <- paste0(LDLmetal$Chr, ":", format(LDLmetal$Pos, scientific = FALSE, trim = TRUE), ":", toupper(LDLmetal$Allele1), ":", toupper(LDLmetal$Allele2))
TCmetal$Name <- paste0(TCmetal$Chr, ":", format(TCmetal$Pos, scientific = FALSE, trim = TRUE), ":", toupper(TCmetal$Allele1), ":", toupper(TCmetal$Allele2))
TGmetal$Name <- paste0(TGmetal$Chr, ":", format(TGmetal$Pos, scientific = FALSE, trim = TRUE), ":", toupper(TGmetal$Allele1), ":", toupper(TGmetal$Allele2))

In [None]:
# FlipBeta keeps Effect where the rebuilt Name equals MarkerName and negates
# Effect everywhere else.
# NOTE(review): presumably this aligns the METAL effect allele with the allele
# order encoded in MarkerName; confirm against the METAL input configuration.
HDLmetal$FlipBeta <- with(HDLmetal, ifelse(MarkerName == Name, Effect, -Effect))
LDLmetal$FlipBeta <- with(LDLmetal, ifelse(MarkerName == Name, Effect, -Effect))
TCmetal$FlipBeta <- with(TCmetal, ifelse(MarkerName == Name, Effect, -Effect))
TGmetal$FlipBeta <- with(TGmetal, ifelse(MarkerName == Name, Effect, -Effect))

In [None]:
# Show the first rows and the dimensions of the LDL METAL table, now with the
# Chr, Pos, Name and FlipBeta columns added.
head(LDLmetal)
dim(LDLmetal)

In [None]:
# Dimensions of the LDL METAL rows whose P-value is at or below 5E-08; the
# first number is the count of such variants.
dim(LDLmetal[which(LDLmetal$`P-value` <= 5E-08), ])

In [None]:
# Tabulate how many LDL variants fall under each METAL Direction string.
table(LDLmetal$Direction)

In [None]:
# NOTE(review): hard-coded sum, presumably of four counts copied by hand from
# the table(LDLmetal$Direction) output; it goes stale if the input data
# changes. Confirm, and consider computing the total from the data instead.
23126+130798+130053+22847

In [None]:
# Row indices of variants whose Direction contains a "?" for one of the two
# inputs, i.e. Direction is one of "-?", "?-", "?+" or "+?".
HDLremove <- which(HDLmetal$Direction %in% c("-?", "?-", "?+", "+?"))
LDLremove <- which(LDLmetal$Direction %in% c("-?", "?-", "?+", "+?"))
TCremove <- which(TCmetal$Direction %in% c("-?", "?-", "?+", "+?"))
TGremove <- which(TGmetal$Direction %in% c("-?", "?-", "?+", "+?"))

# How many rows will be dropped from each table.
length(HDLremove)
length(LDLremove)
length(TCremove)
length(TGremove)


In [None]:
# Keep every row that is not listed in the *remove index vectors.
# Negative indexing (`df[-idx, ]`) returns zero rows when `idx` is empty, so
# select the complement of the indices explicitly; a table with nothing to
# remove is then kept whole.
HDLmetal_trimmed <- HDLmetal[setdiff(seq_len(nrow(HDLmetal)), HDLremove), ]
LDLmetal_trimmed <- LDLmetal[setdiff(seq_len(nrow(LDLmetal)), LDLremove), ]
TCmetal_trimmed <- TCmetal[setdiff(seq_len(nrow(TCmetal)), TCremove), ]
TGmetal_trimmed <- TGmetal[setdiff(seq_len(nrow(TGmetal)), TGremove), ]

In [None]:
# Dimensions of each trimmed METAL table.
dim(HDLmetal_trimmed)
dim(LDLmetal_trimmed)
dim(TCmetal_trimmed)
dim(TGmetal_trimmed)

In [None]:
# Preview the first rows of the trimmed LDL METAL table.
head(LDLmetal_trimmed)

In [None]:
# Trimmed LDL variants with P-value at or below 5E-08 (rows with a missing
# P-value are excluded), followed by the dimensions of that subset.
LDLmetal_trimmed_sig <- subset(LDLmetal_trimmed, `P-value` <= 5E-08)
dim(LDLmetal_trimmed_sig)

In [None]:
# Number of distinct MarkerName values among the significant trimmed LDL variants.
length(unique(LDLmetal_trimmed_sig$MarkerName))

In [None]:
# Preview the first rows of the significant trimmed LDL variants.
head(LDLmetal_trimmed_sig)

In [None]:
# Count of significant trimmed LDL variants per chromosome, largest first.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sort(table(LDLmetal_trimmed_sig$Chr), decreasing = TRUE)

# Plot METAL results

## Using Full METAL results

In [None]:
#' Draw a Manhattan plot followed by a QQ plot for one METAL results table.
#'
#' Sets the notebook plot size options (`repr.plot.width` and
#' `repr.plot.height`) to 10 x 10 before plotting; the options stay set
#' after the call.
#'
#' @param metal_results Data frame with columns `Chr`, `Pos`, `MarkerName`
#'   and `P-value`, which are passed to `manhattan()` as the chromosome,
#'   base-pair position, SNP name and p-value columns.
#' @param manhattan_title Main title of the Manhattan plot.
#' @param qq_title Main title of the QQ plot.
#' @return The value of the final `qq()` call; the function is used for its
#'   plotting side effects.
plot_manhattan_and_qq <- function(metal_results, manhattan_title, qq_title) {
    options(repr.plot.height = 10, repr.plot.width = 10)

    # Manhattan plot with the y axis fixed to the range 0-200 and no
    # annotation of top hits.
    manhattan(
        metal_results,
        main = manhattan_title,
        sub = PLOT_SUBTITLE,
        chr = 'Chr',
        bp = 'Pos',
        snp = 'MarkerName',
        p = 'P-value',
        logp = TRUE,
        annotateTop = FALSE,
        ylim = c(0, 200),
        cex = 1.25,
        cex.axis = 1.25,
        cex.lab = 1.25
    )

    # QQ plot of the same p-values, using the same text scaling.
    qq(
        metal_results$`P-value`,
        main = qq_title,
        sub = PLOT_SUBTITLE,
        cex = 1.25,
        cex.axis = 1.25,
        cex.lab = 1.25
    )
}

In [None]:
# Manhattan and QQ plots for the full HDL METAL results.
plot_manhattan_and_qq(
    metal_results = HDLmetal,
    manhattan_title = "Meta-analysis HDL",
    qq_title = "Meta-analysis HDL"
)

In [None]:
# Manhattan and QQ plots for the full LDL METAL results.
plot_manhattan_and_qq(
    metal_results = LDLmetal,
    manhattan_title = "Meta-analysis LDL",
    qq_title = "Meta-analysis LDL"
)

In [None]:
# Manhattan and QQ plots for the full TC METAL results.
plot_manhattan_and_qq(
    metal_results = TCmetal,
    manhattan_title = "Meta-analysis TC",
    qq_title = "Meta-analysis TC"
)

In [None]:
# Manhattan and QQ plots for the full TG METAL results.
plot_manhattan_and_qq(
    metal_results = TGmetal,
    manhattan_title = "Meta-analysis TG",
    qq_title = "Meta-analysis TG"
)

## Using SNPs present in both cohorts from METAL results


In [None]:
# Manhattan and QQ plots for the trimmed HDL METAL results.
plot_manhattan_and_qq(
    metal_results = HDLmetal_trimmed,
    manhattan_title = "Meta-analysis HDL - SNPs in Both cohort",
    qq_title = "Meta-analysis HDL - SNPs in Both cohort"
)

In [None]:
# Manhattan and QQ plots for the trimmed LDL METAL results.
plot_manhattan_and_qq(
    metal_results = LDLmetal_trimmed,
    manhattan_title = "Meta-analysis LDL - SNPs in Both cohort",
    qq_title = "Meta-analysis LDL - SNPs in Both cohort"
)


In [None]:
# Manhattan and QQ plots for the trimmed TC METAL results.
plot_manhattan_and_qq(
    metal_results = TCmetal_trimmed,
    manhattan_title = "Meta-analysis TC - SNPs in Both cohort",
    qq_title = "Meta-analysis TC - SNPs in Both cohort"
)


In [None]:
# Manhattan and QQ plots for the trimmed TG METAL results.
plot_manhattan_and_qq(
    metal_results = TGmetal_trimmed,
    manhattan_title = "Meta-analysis TG - SNPs in Both cohort",
    qq_title = "Meta-analysis TG - SNPs in Both cohort"
)


# Correlation plots of AOU-Siloed and Meta-analysis

## Using Full METAL results

In [None]:
# Inner-join each full METAL table to the matching AOU-siloed table, pairing
# METAL MarkerName with siloed ID.
CommonHDL <- merge(x = HDLmetal, y = HDLsiloed, by.x = "MarkerName", by.y = "ID")
CommonLDL <- merge(x = LDLmetal, y = LDLsiloed, by.x = "MarkerName", by.y = "ID")
CommonTC <- merge(x = TCmetal, y = TCsiloed, by.x = "MarkerName", by.y = "ID")
CommonTG <- merge(x = TGmetal, y = TGsiloed, by.x = "MarkerName", by.y = "ID")

In [None]:
# Keep the joined rows whose METAL P-value is at or below 5E-08; rows with a
# missing P-value are excluded.
CommonHDL_sig <- subset(CommonHDL, `P-value` <= 5E-08)
CommonLDL_sig <- subset(CommonLDL, `P-value` <= 5E-08)
CommonTC_sig <- subset(CommonTC, `P-value` <= 5E-08)
CommonTG_sig <- subset(CommonTG, `P-value` <= 5E-08)

In [None]:
# Dimensions of each significant joined table.
dim(CommonHDL_sig)
dim(CommonLDL_sig)
dim(CommonTC_sig)
dim(CommonTG_sig)

In [None]:
# First rows of the significant HDL join, transposed so each column of the
# joined table is shown as a row.
t(head(CommonHDL_sig))

In [None]:
# First rows of the significant LDL join, transposed so each column of the
# joined table is shown as a row.
t(head(CommonLDL_sig))

In [None]:
# Correlation between the METAL FlipBeta and the siloed BETA over the
# significant joined variants, rounded to 3 digits.
HDLCor <- with(CommonHDL_sig, round(cor(FlipBeta, BETA), digits = 3))
LDLCor <- with(CommonLDL_sig, round(cor(FlipBeta, BETA), digits = 3))
TCCor <- with(CommonTC_sig, round(cor(FlipBeta, BETA), digits = 3))
TGCor <- with(CommonTG_sig, round(cor(FlipBeta, BETA), digits = 3))

In [None]:
# Display the four rounded correlations.
HDLCor
LDLCor
TCCor
TGCor

In [None]:
#' Scatter plot of meta-analysis effects against AOU-siloed effects.
#'
#' @param Data Data frame with columns `FlipBeta` (x axis) and `BETA` (y axis).
#' @param CorVal Value shown after 'correlation: ' in the plot annotation.
#' @param Title Text placed before ': meta-analysis vs AOU-siloed' in the title.
#' @return A ggplot object.
PlotCorr <- function(Data, CorVal, Title) {
  # Correlation text anchored at the largest x and smallest y in the data.
  corr_label <- annotate(
    geom = 'text',
    x = max(Data$FlipBeta),
    y = min(Data$BETA),
    hjust = 'right',
    vjust = 0,
    color = 'dark blue',
    size = 6,
    label = c(str_glue('correlation: ', CorVal))
  )
  axis_title_style <- theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
  plot_labels <- labs(
    title = str_glue(Title, ': meta-analysis vs AOU-siloed'),
    x = 'Meta-analysis Effects',
    y = 'AOU-siloed Effects',
    caption = PLOT_SUBTITLE
  )

  ggplot(Data, aes(x = FlipBeta, y = BETA)) +
    geom_point(alpha = .5) +
    corr_label +
    geom_abline() +
    axis_title_style +
    plot_labels
}

In [None]:
# HDL: meta-analysis vs AOU-siloed effects.
PlotCorr(Data = CommonHDL_sig, CorVal = HDLCor, Title = "HDL")

In [None]:
# LDL: meta-analysis vs AOU-siloed effects.
PlotCorr(Data = CommonLDL_sig, CorVal = LDLCor, Title = "LDL")

In [None]:
# TC: meta-analysis vs AOU-siloed effects.
PlotCorr(Data = CommonTC_sig, CorVal = TCCor, Title = "TC")

In [None]:
# TG: meta-analysis vs AOU-siloed effects.
PlotCorr(Data = CommonTG_sig, CorVal = TGCor, Title = "TG")

## Using SNPs present in both cohorts from METAL results

In [None]:
# Repeat the meta-analysis vs AOU-siloed comparison using the trimmed METAL
# tables. This overwrites the Common*, Common*_sig and *Cor objects.
CommonHDL <- merge(x = HDLmetal_trimmed, y = HDLsiloed, by.x = "MarkerName", by.y = "ID")
CommonLDL <- merge(x = LDLmetal_trimmed, y = LDLsiloed, by.x = "MarkerName", by.y = "ID")
CommonTC <- merge(x = TCmetal_trimmed, y = TCsiloed, by.x = "MarkerName", by.y = "ID")
CommonTG <- merge(x = TGmetal_trimmed, y = TGsiloed, by.x = "MarkerName", by.y = "ID")

# Joined rows whose METAL P-value is at or below 5E-08.
CommonHDL_sig <- subset(CommonHDL, `P-value` <= 5E-08)
CommonLDL_sig <- subset(CommonLDL, `P-value` <= 5E-08)
CommonTC_sig <- subset(CommonTC, `P-value` <= 5E-08)
CommonTG_sig <- subset(CommonTG, `P-value` <= 5E-08)

dim(CommonHDL_sig)
dim(CommonLDL_sig)
dim(CommonTC_sig)
dim(CommonTG_sig)


# Correlation of FlipBeta with the siloed BETA, rounded to 3 digits.
HDLCor <- with(CommonHDL_sig, round(cor(FlipBeta, BETA), digits = 3))
LDLCor <- with(CommonLDL_sig, round(cor(FlipBeta, BETA), digits = 3))
TCCor <- with(CommonTC_sig, round(cor(FlipBeta, BETA), digits = 3))
TGCor <- with(CommonTG_sig, round(cor(FlipBeta, BETA), digits = 3))

HDLCor
LDLCor
TCCor
TGCor

PlotCorr(Data = CommonHDL_sig, CorVal = HDLCor, Title = "HDL - SNPs present in both cohort")
PlotCorr(Data = CommonLDL_sig, CorVal = LDLCor, Title = "LDL - SNPs present in both cohort")
PlotCorr(Data = CommonTC_sig, CorVal = TCCor, Title = "TC - SNPs present in both cohort")
PlotCorr(Data = CommonTG_sig, CorVal = TGCor, Title = "TG - SNPs present in both cohort")

# Correlation plots of Pooled and Meta-analysis

In [None]:
# Inner-join each full METAL table to the matching pooled table, pairing
# METAL MarkerName with the rewritten pooled ID.
PooledCommonHDL <- merge(x = HDLmetal, y = HDLpooled, by.x = "MarkerName", by.y = "ID")
PooledCommonLDL <- merge(x = LDLmetal, y = LDLpooled, by.x = "MarkerName", by.y = "ID")
PooledCommonTC <- merge(x = TCmetal, y = TCpooled, by.x = "MarkerName", by.y = "ID")
PooledCommonTG <- merge(x = TGmetal, y = TGpooled, by.x = "MarkerName", by.y = "ID")

In [None]:
# Dimensions of the LDL METAL-to-pooled join.
dim(PooledCommonLDL)

In [None]:
# First rows of the LDL METAL-to-pooled join, transposed for readability.
t(head(PooledCommonLDL))

In [None]:
# Joined rows whose METAL P-value is at or below 5E-08, after which any row
# containing a missing value in any column is dropped.
PooledCommonHDL_sig <- na.omit(subset(PooledCommonHDL, `P-value` <= 5E-08))
PooledCommonLDL_sig <- na.omit(subset(PooledCommonLDL, `P-value` <= 5E-08))
PooledCommonTC_sig <- na.omit(subset(PooledCommonTC, `P-value` <= 5E-08))
PooledCommonTG_sig <- na.omit(subset(PooledCommonTG, `P-value` <= 5E-08))


In [None]:
# Dimensions of the significant, complete-case LDL METAL-to-pooled join.
dim(PooledCommonLDL_sig)

In [None]:
# First rows of the significant LDL METAL-to-pooled join, transposed.
t(head(PooledCommonLDL_sig))

In [None]:
# Correlation between the METAL FlipBeta and the pooled BETA over the
# significant joined variants, rounded to 3 digits.
PooledHDLCor <- with(PooledCommonHDL_sig, round(cor(FlipBeta, BETA), digits = 3))
PooledLDLCor <- with(PooledCommonLDL_sig, round(cor(FlipBeta, BETA), digits = 3))
PooledTCCor <- with(PooledCommonTC_sig, round(cor(FlipBeta, BETA), digits = 3))
PooledTGCor <- with(PooledCommonTG_sig, round(cor(FlipBeta, BETA), digits = 3))


In [None]:
# Display the four rounded meta-analysis vs pooled correlations.
PooledHDLCor
PooledLDLCor
PooledTCCor
PooledTGCor

In [None]:
#' Scatter plot of meta-analysis effects against pooled-analysis effects.
#'
#' Replaces any earlier definition of `PlotCorr` in the session.
#'
#' @param Data Data frame with columns `FlipBeta` (x axis) and `BETA` (y axis).
#' @param CorVal Value shown after 'correlation: ' in the plot annotation.
#' @param Title Text placed before ': meta-analysis vs pooled-analysis' in the title.
#' @return A ggplot object.
PlotCorr <- function(Data, CorVal, Title) {
  # Correlation text anchored at the largest x and smallest y in the data.
  corr_label <- annotate(
    geom = 'text',
    x = max(Data$FlipBeta),
    y = min(Data$BETA),
    hjust = 'right',
    vjust = 0,
    color = 'dark blue',
    size = 6,
    label = c(str_glue('correlation: ', CorVal))
  )
  axis_title_style <- theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
  plot_labels <- labs(
    title = str_glue(Title, ': meta-analysis vs pooled-analysis'),
    x = 'Meta-analysis Effects',
    y = 'Pooled-analysis Effects',
    caption = PLOT_SUBTITLE
  )

  ggplot(Data, aes(x = FlipBeta, y = BETA)) +
    geom_point(alpha = .5) +
    corr_label +
    geom_abline() +
    axis_title_style +
    plot_labels
}

In [None]:
# HDL: meta-analysis vs pooled-analysis effects.
PlotCorr(Data = PooledCommonHDL_sig, CorVal = PooledHDLCor, Title = "HDL")

In [None]:
# LDL: meta-analysis vs pooled-analysis effects.
PlotCorr(Data = PooledCommonLDL_sig, CorVal = PooledLDLCor, Title = "LDL")

In [None]:
# TC: meta-analysis vs pooled-analysis effects.
PlotCorr(Data = PooledCommonTC_sig, CorVal = PooledTCCor, Title = "TC")

In [None]:
# TG: meta-analysis vs pooled-analysis effects.
PlotCorr(Data = PooledCommonTG_sig, CorVal = PooledTGCor, Title = "TG")

# Variants present only in Pooled results and not in Meta-analysis results

In [None]:
# Pooled rows whose ID does not occur among the METAL MarkerName values.
# setdiff() returns each such ID once and match() picks its first occurrence,
# so an ID that appears several times in a pooled table yields a single row.
Only_HDLpooled <- HDLpooled[match(setdiff(HDLpooled[["ID"]], HDLmetal[["MarkerName"]]), HDLpooled[["ID"]]), ]
Only_LDLpooled <- LDLpooled[match(setdiff(LDLpooled[["ID"]], LDLmetal[["MarkerName"]]), LDLpooled[["ID"]]), ]
Only_TCpooled <- TCpooled[match(setdiff(TCpooled[["ID"]], TCmetal[["MarkerName"]]), TCpooled[["ID"]]), ]
Only_TGpooled <- TGpooled[match(setdiff(TGpooled[["ID"]], TGmetal[["MarkerName"]]), TGpooled[["ID"]]), ]


In [None]:
# Number of distinct, non-missing pooled HDL IDs that are absent from the
# METAL MarkerName values.
length(na.omit(setdiff(HDLpooled$ID, HDLmetal$MarkerName)))

In [None]:
# Preview the first rows of the TG variants found only in the pooled results.
head(Only_TGpooled)

In [None]:
# Dimensions of each pooled-only table.
dim(Only_HDLpooled)
dim(Only_LDLpooled)
dim(Only_TCpooled)
dim(Only_TGpooled)

In [None]:
#' Histogram of log10(A1FREQ) for one table of variants.
#'
#' Sets the notebook plot size options (`repr.plot.width` and
#' `repr.plot.height`) to 16 x 10; the options stay set after the call.
#'
#' @param data Data frame with a column `A1FREQ`.
#' @param name Plot title; it is passed through `str_glue()`.
#' @return A ggplot object with a 30-bin histogram.
plot_histograms <- function(data, name) {
    options(repr.plot.height = 10, repr.plot.width = 16)

    histogram_labels <- labs(title = str_glue(name), caption = PLOT_SUBTITLE)

    ggplot(data, aes(x = log10(A1FREQ))) +
        geom_histogram(bins = 30) +
        histogram_labels
}

In [None]:
# Side-by-side A1FREQ histograms: all pooled HDL variants vs the pooled HDL
# variants that are absent from the meta-analysis. (Title typo "Varinats" fixed.)
grid.arrange(
    plot_histograms(data = HDLpooled, name = 'Variants in HDLPooled'),
    plot_histograms(data = Only_HDLpooled, name = str_glue('Variants only in HDLpooled \nand not in Meta-analysis')),
    ncol = 2,
    top = 'HDL')

In [None]:
# Side-by-side A1FREQ histograms: all pooled LDL variants vs the pooled LDL
# variants that are absent from the meta-analysis. (Title typo "Varinats" fixed.)
grid.arrange(
    plot_histograms(data = LDLpooled, name = 'Variants in LDLPooled'),
    plot_histograms(data = Only_LDLpooled, name = str_glue('Variants only in LDLpooled \nand not in Meta-analysis')),
    ncol = 2,
    top = 'LDL')

In [None]:
# Side-by-side A1FREQ histograms: all pooled TC variants vs the pooled TC
# variants that are absent from the meta-analysis. (Title typo "Varinats" fixed.)
grid.arrange(
    plot_histograms(data = TCpooled, name = 'Variants in TCPooled'),
    plot_histograms(data = Only_TCpooled, name = str_glue('Variants only in TCpooled \nand not in Meta-analysis')),
    ncol = 2,
    top = 'TC')

In [None]:
# Side-by-side A1FREQ histograms: all pooled TG variants vs the pooled TG
# variants that are absent from the meta-analysis. (Title typo "Varinats" fixed.)
grid.arrange(
    plot_histograms(data = TGpooled, name = 'Variants in TGPooled'),
    plot_histograms(data = Only_TGpooled, name = str_glue('Variants only in TGpooled \nand not in Meta-analysis')),
    ncol = 2,
    top = 'TG')

# Comparisons against other lipids studies

## Comparison with UKB published GWAS summary

##### Rare coding variants in 35 genes associate with circulating lipid levels – a multi-ancestry analysis of 170,000 exomes. [Hindy et al 2021](https://www.biorxiv.org/content/10.1101/2020.12.22.423783v1.supplementary-material?versioned=true)

In [None]:
# Download the Hindy et al. supplementary workbook. An .xlsx file is binary,
# so request binary mode explicitly; text mode can corrupt the file on
# platforms that translate line endings.
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/09/01/2020.12.22.423783/DC2/embed/media-2.xlsx?download=true', destfile = 'hindy.xlsx', mode = 'wb')

In [None]:
# Read sheet 'Table_S11' of the Hindy workbook, skipping the first row and
# treating 'NA' as missing.
combined_hindy_results <- read_xlsx(path = 'hindy.xlsx', sheet = 'Table_S11', skip = 1, na = 'NA')

# Keep the 'Overall' ancestry rows and add lipid_type, which renames the
# traits LDL_ADJ and TOTAL_ADJ to LDL and TC and otherwise copies Trait.
combined_hindy_results <- combined_hindy_results %>%
    filter(Ancestry == 'Overall') %>%
    mutate(lipid_type = case_when(
        Trait == 'LDL_ADJ' ~ 'LDL',
        Trait == 'TOTAL_ADJ' ~ 'TC',
        TRUE ~ Trait
    ))

dim(combined_hindy_results)

In [None]:
# Preview the first rows of the filtered Hindy results.
head(combined_hindy_results)

In [None]:
#' Scatter plot of meta-analysis effects against Hindy GWAS effects.
#'
#' Replaces any earlier definition of `PlotCorr` in the session; note that
#' this version takes two additional arguments.
#'
#' @param Data Data frame with columns `FlipBeta` (x axis) and `BETA_FE` (y axis).
#' @param CorVal Value shown after 'R-square: ' in the annotation.
#' @param CorPval Value shown after 'P-value: ' in the annotation.
#' @param Nsamp Value shown after 'N: ' in the annotation.
#' @param Title Text placed before ': meta-analysis vs Hindy GWAS results'.
#' @return A ggplot object.
PlotCorr <- function(Data, CorVal, CorPval, Nsamp, Title) {
  # Summary text anchored at the largest x and smallest y in the data.
  stats_label <- annotate(
    geom = 'text',
    x = max(Data$FlipBeta),
    y = min(Data$BETA_FE),
    hjust = 'right',
    vjust = 0,
    color = 'dark blue',
    size = 6,
    label = c(str_glue('R-square: ',CorVal, '\nN: ', Nsamp, '\nP-value: ', CorPval))
  )
  axis_title_style <- theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
  plot_labels <- labs(
    title = str_glue(Title, ': meta-analysis vs Hindy GWAS results'),
    x = 'Meta-analysis Effects',
    y = 'Hindy GWAS Effects',
    caption = PLOT_SUBTITLE
  )

  ggplot(Data, aes(x = FlipBeta, y = BETA_FE)) +
    geom_point(alpha = .5) +
    stats_label +
    geom_abline() +
    axis_title_style +
    plot_labels
}

In [None]:
# Join the trimmed HDL METAL results to the Hindy HDL rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = HDLmetal_trimmed,
    y = filter(combined_hindy_results, lipid_type == "HDL"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA_FE)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA_FE)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "HDL")

In [None]:
# Show the whole joined HDL/Hindy table, transposed.
t(Set1)

In [None]:
# Join the trimmed LDL METAL results to the Hindy LDL rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = LDLmetal_trimmed,
    y = filter(combined_hindy_results, lipid_type == "LDL"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA_FE)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA_FE)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "LDL")

In [None]:
# Join the trimmed TC METAL results to the Hindy TC rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = TCmetal_trimmed,
    y = filter(combined_hindy_results, lipid_type == "TC"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA_FE)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA_FE)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "TC")

In [None]:
# Join the trimmed TG METAL results to the Hindy TG rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = TGmetal_trimmed,
    y = filter(combined_hindy_results, lipid_type == "TG"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA_FE)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA_FE)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "TG")

## Comparison with TOPMed (Freeze8) Lipid GWAS

##### Whole genome sequence analysis of blood lipid levels in >66,000 individuals. [Selvaraj et al 2021](https://www.biorxiv.org/content/10.1101/2021.10.11.463514v1.supplementary-material)

In [None]:
# Download the Selvaraj et al. supplementary workbook. An .xlsx file is
# binary, so request binary mode explicitly; text mode can corrupt the file on
# platforms that translate line endings.
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/10/12/2021.10.11.463514/DC1/embed/media-1.xlsx?download=true', destfile = 'selvaraj.xlsx', mode = 'wb')

In [None]:
# Cell ranges of the per-lipid tables inside sheet 'Supplementary Table 3'.
selvaraj_tables <- c(HDL = 'A4:L361', LDL = 'A363:L701', TC = 'A703:L1027', TG = 'A1029:L1318')
LIPIDS <- c("HDL", "LDL", "TC", "TG")

# Read every per-lipid range, tag it with its lipid type, and stack the rows.
combined_selvaraj_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        cell_range <- selvaraj_tables[[lipid]]

        # Print some metadata for an eyeball check that we are associating the data with the correct lipid type.
        print(str_glue('{lipid} {cell_range}'))
        first_row <- as.integer(str_extract(cell_range, '\\d+'))
        # Column A for the row just above the range and the range's first row.
        print(read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = str_glue('A{first_row - 1}:A{first_row}')))

        # Retrieve the data once and reuse it for both the row count and the result.
        lipid_results <- read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = cell_range)
        print(nrow(lipid_results))

        lipid_results %>%
        mutate(
            # Work around a bad entry in the data causing the p.value column to be of type character.
            p.value = as.numeric(p.value),
            # Variant key in the form CHR:POS:Allele1:Allele2, used for joining later.
            RSID = paste0(CHR, ':' , POS, ':', Allele1, ':', Allele2),
            lipid_type = lipid
        )
    }))

dim(combined_selvaraj_results)

In [None]:
# Preview the first rows of the combined Selvaraj results.
head(combined_selvaraj_results)

In [None]:
#' Scatter plot of meta-analysis effects against Selvaraj GWAS effects.
#'
#' Replaces any earlier definition of `PlotCorr` in the session.
#'
#' @param Data Data frame with columns `FlipBeta` (x axis) and `BETA` (y axis).
#' @param CorVal Value shown after 'R-square: ' in the annotation.
#' @param CorPval Value shown after 'P-value: ' in the annotation.
#' @param Nsamp Value shown after 'N: ' in the annotation.
#' @param Title Text placed before ': meta-analysis vs Selvaraj GWAS results'.
#' @return A ggplot object.
PlotCorr <- function(Data, CorVal, CorPval, Nsamp, Title) {
  # Summary text anchored at the largest x and smallest y in the data.
  stats_label <- annotate(
    geom = 'text',
    x = max(Data$FlipBeta),
    y = min(Data$BETA),
    hjust = 'right',
    vjust = 0,
    color = 'dark blue',
    size = 6,
    label = c(str_glue('R-square: ',CorVal, '\nN: ', Nsamp, '\nP-value: ', CorPval))
  )
  axis_title_style <- theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
  plot_labels <- labs(
    title = str_glue(Title, ': meta-analysis vs Selvaraj GWAS results'),
    x = 'Meta-analysis Effects',
    y = 'Selvaraj GWAS Effects',
    caption = PLOT_SUBTITLE
  )

  ggplot(Data, aes(x = FlipBeta, y = BETA)) +
    geom_point(alpha = .5) +
    stats_label +
    geom_abline() +
    axis_title_style +
    plot_labels
}

In [None]:
# Join the trimmed HDL METAL results to the Selvaraj HDL rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = HDLmetal_trimmed,
    y = filter(combined_selvaraj_results, lipid_type == "HDL"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "HDL")

In [None]:
# Join the trimmed LDL METAL results to the Selvaraj LDL rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = LDLmetal_trimmed,
    y = filter(combined_selvaraj_results, lipid_type == "LDL"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "LDL")

In [None]:
# Join the trimmed TC METAL results to the Selvaraj TC rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = TCmetal_trimmed,
    y = filter(combined_selvaraj_results, lipid_type == "TC"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "TC")

In [None]:
# Join the trimmed TG METAL results to the Selvaraj TG rows (MarkerName to
# RSID), then plot with the squared correlation, its test p-value and the
# number of joined variants.
Set1 <- merge(
    x = TGmetal_trimmed,
    y = filter(combined_selvaraj_results, lipid_type == "TG"),
    by.x = "MarkerName",
    by.y = "RSID"
)
CorTest <- cor.test(Set1$FlipBeta, Set1$BETA)
Cor <- round(cor(Set1$FlipBeta, Set1$BETA)^2, digits = 2)
Pval <- scientific(CorTest$p.value, digits = 2)
N <- nrow(Set1)
PlotCorr(Data = Set1, CorVal = Cor, CorPval = Pval, Nsamp = N, Title = "TG")