# Run MOFA analysis in Teaseq data
- CD4 naive cells ARI vs CON2

1. preprocess clinical lab, olink, scRNA, scATAC data
    - extract the sample pseudobulk level data
    - for ATAC-seq data, TF activities inferred from chromVAR are used
2. run MOFA
3. examine MOFA factor 1

# set up

In [None]:
# import libraries
# Attach a package with library() while suppressing its startup messages.
# All arguments are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(library(...))
}
quiet_library("tidyverse")
quiet_library("Matrix")
quiet_library("viridis")
quiet_library("scran")
quiet_library("scater")
quiet_library("MOFA2")
quiet_library("data.table")
quiet_library("jsonlite")
quiet_library("parallel")
quiet_library("Seurat")
quiet_library("ggpubr")


In [None]:
# define the input/output locations used throughout the notebook
fig_path <- "/home/jupyter/figures/preRA_teaseq/MOFA"
data_path <- "/home/jupyter/data/preRA_teaseq/EXP-00243"
meta_path <- "/home/jupyter/data/preRA_teaseq/meta_data"
output_path <- "/home/jupyter/data/preRA_teaseq/output_results/MOFA"
# make sure the figure and result folders exist before anything is written
for (out_dir in c(fig_path, output_path)) {
    if (!dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }
}
# project name, used as the prefix of every saved figure
proj_name <- "PreRA_teaseq_MOFA_cd4na"

In [None]:
# define the color palette to be used
npg_color <- c(
    "#E64B35FF", "#4DBBD5FF", "#00A087FF", "#3C5488FF", "#F39B7FFF",
    "#8491B4FF", "#91D1C2FF", "#DC0000FF", "#7E6148FF", "#B09C85FF"
)
nejm_color <- c("#BC3C29FF", "#0072B5FF", "#E18727FF", "#20854EFF", "#7876B1FF", "#6F99ADFF", "#FFDC91FF", "#EE4C97FF")
jama_color <- c("#374E55FF", "#DF8F44FF", "#00A1D5FF", "#B24745FF", "#79AF97FF", "#6A6599FF", "#80796BFF")
jco_color <- c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#7AA6DCFF", "#003C67FF", "#8F7700FF")
cluster_colors <- c(
    "#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72", "#B17BA6", "#FF7F00", "#FDB462", "#E7298A",
    "#E78AC3", "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D", "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
    "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000", "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00"
)
con_ari_colors <- c("#5AAA46", "#F59F00")
cluster_colors_ext <- colorRampPalette(cluster_colors)(36)
options(repr.plot.width = 20, repr.plot.height = 15)


In [None]:
# source the helper functions
source("/home/jupyter/github/Teaseq-analysis/scRNA_teaseq_ananlysis_helper_functions.r")


# Load and prepare Data

In [None]:
# load the sample metadata
# Keep only the identifiers and clinical covariates used below, rename the
# subject GUID to `sample` (the key used later when joining to the MOFA
# factors) and derive a readable cohort label from the cohort GUID.
meta_cols <- c(
    "subject.subjectGuid", "sample.sampleKitGuid",
    "subject.biologicalSex", "cohort.cohortGuid", "age",
    "rheumatoid_factor_iga", "rheumatoid_factor_igm",
    "anti_ccp3"
)
# build the path from `meta_path` (defined in the set-up cell) instead of
# repeating the absolute directory
meta_data <- read_csv(file.path(meta_path, "PreRA_teaseq_metadata.csv"))
meta_data %>% colnames()
meta_data <- meta_data %>%
    dplyr::select(all_of(meta_cols)) %>%
    dplyr::rename("sample" = "subject.subjectGuid") %>%
    mutate(cohort = if_else(cohort.cohortGuid == "CU1", "at_risk", "healthy"))
meta_data %>% head()

## Pseudobulk approach

### load pseudobulk rna, adt, and peak data

In [None]:
# load rna psudobulk data
# gene expression matrix is summarized at the sample level and vst transformed
cd4na_rna_psudo <- readRDS('/home/jupyter/data/preRA_teaseq/EXP-00243/PreRA_teaseq_cd4na_t_rna_subject_PseudoBulk.rds')

In [None]:
cd4na_rna_psudo

In [None]:
# check the distribution of the gene expression data
assay(cd4na_rna_psudo, 'normalized_counts') %>% hist()

In [None]:
# extract the rna data into long-table format and scale each gene across
# subjects, per MOFA requirement
norm_rna_counts <- assay(cd4na_rna_psudo, "normalized_counts")
colnames(norm_rna_counts) <- cd4na_rna_psudo$subject_id
norm_rna_counts_long <- norm_rna_counts %>%
    as_tibble(rownames = "gene") %>%
    pivot_longer(cols = -gene, names_to = "subject_id", values_to = "value") %>%
    mutate(view = "rna", feature = paste("rna", gene, sep = "_")) %>%
    group_by(feature) %>%
    # scale() returns a one-column matrix; as.numeric() keeps `value` a plain
    # numeric vector instead of a matrix-typed column
    mutate(value = as.numeric(scale(value))) %>%
    ungroup()
norm_rna_counts_long %>% head()

In [None]:
# check if the data is scaled
norm_rna_counts_long %>% ggplot(aes(x=value, fill=subject_id)) + geom_histogram(alpha=0.5)

In [None]:
# load adt psudobulk
# adt data is summarized at the sample level and similar to rna data, it is scaled per MOFA requirement
cd4na_adt_psudo <- readRDS('/home/jupyter/data/preRA_teaseq/EXP-00243/PreRA_teaseq_cd4na_t_adt_subject_PseudoBulk.rds')
cd4na_adt_psudo

In [None]:
# extract the adt data into long-table format and scale each protein across
# subjects
norm_adt_counts <- assay(cd4na_adt_psudo, "norm_counts")
colnames(norm_adt_counts) <- cd4na_adt_psudo$subject_id
norm_adt_counts_long <- norm_adt_counts %>%
    as_tibble(rownames = "gene") %>%
    pivot_longer(cols = -gene, names_to = "subject_id", values_to = "value") %>%
    mutate(view = "adt", feature = paste("adt", gene, sep = "_")) %>%
    group_by(feature) %>%
    # scale() returns a one-column matrix; as.numeric() keeps `value` a plain
    # numeric vector instead of a matrix-typed column
    mutate(value = as.numeric(scale(value))) %>%
    ungroup()
norm_adt_counts_long %>% head()

In [None]:
norm_adt_counts_long %>%    
    ggplot(aes(x=value, fill=subject_id)) + 
    facet_wrap(vars(feature)) +
    geom_histogram(alpha=0.5)

In [None]:
# load the saved chromVAR results and take the Z-score object stored under
# the "cd4_naive" entry
l2_chromvar <- readRDS("/home/jupyter/data/preRA_teaseq/output_results/atac/preRA_teaseq_MOCHA_l2_celltype_chromavar.rds")
cd4na_dev <- l2_chromvar[["cd4_naive"]]$Z_Score
cd4na_dev

In [None]:
# remove one motif with empty values
cd4na_dev <- cd4na_dev[rownames(cd4na_dev) != "ENSG00000250542", ]
colnames(cd4na_dev) <- colData(cd4na_dev)$subject_id
cd4na_dev


In [None]:
# extract the z score
chromvar_tf <- assay(cd4na_dev, 'cd4_naive') %>% 
    as_tibble(rownames = 'tf') %>% 
    pivot_longer(cols = -tf, names_to = 'subject_id', values_to = 'value')%>% 
        mutate(view='tf',  feature=paste('tf', tf, sep = '_')) 
chromvar_tf %>% head()

In [None]:
chromvar_tf %>% ggplot(aes(x=value, fill=subject_id)) + 
    geom_histogram(alpha=0.5) 

### get OLINK data

In [None]:

# load the data that processed by Mark contains all olink data
olink_processed <- fread('/home/jupyter/data/olink/cache/9de67d6a-9cf8-4031-8ed2-ae0fcf3e40d5/2022-11-18_Olink_allCohorts_v2.csv')


In [None]:
# filter the olink data to all 13 samples in teaseq
olink_fl <-  olink_processed %>% filter(sample.sampleKitGuid %in% meta_data$sample.sampleKitGuid) %>% 
    dplyr::select(SampleID:subject.subjectGuid) %>%
    distinct(Assay, sample.sampleKitGuid, .keep_all = TRUE)

In [None]:
olink_fl$NPX_Final %>% hist()

In [None]:
# check feature number in OLINK assay
olink_fl %>% distinct(Assay) %>% nrow()

In [None]:
# build the olink feature table in the long format used for MOFA:
# one row per assay/sample, with the assay name prefixed by the view name
olink_mofa <- olink_fl %>%
    dplyr::transmute(
        feature = paste("olink", Assay, sep = "_"),
        sample.sampleKitGuid,
        value = NPX_Final,
        subject_id = subject.subjectGuid,
        view = "olink"
    )


# create MOFA object

In [None]:
# creat a lone data table of all modalities combine
mofa_dt <- rbindlist(list(
    norm_rna_counts_long %>% dplyr::select(c(subject_id, value, view, feature)),
    norm_adt_counts_long %>% dplyr::select(c(subject_id, value, view, feature)),
    olink_mofa %>% dplyr::select(c(subject_id, value, view, feature)),
    chromvar_tf %>% dplyr::select(c(subject_id, value, view, feature))
), use.names = TRUE) %>%
    dplyr::rename("sample" = "subject_id") %>%
    as_tibble()
mofa_dt %>% head()
# check how many features are in each view
mofa_dt %>%
    group_by(view) %>%
    distinct(feature) %>%
    tally()

In [None]:
# remove subject BR2024 from the analysis
# this con2 subject had a positive CCP result and was removed from the analysis
mofa_dt <- mofa_dt %>% filter(sample!='BR2024')

In [None]:
# save MOFA data table
mofa_dt %>% write_tsv(file.path(output_path, 'mofa_preRA_cd4_rna_adt_olink_tf_pseudobulk_data_092023.tsv'))

In [None]:
# load the data table
# The file name must match the one written by write_tsv() in the previous
# cell; the earlier "_071423" name pointed at a different (older) export.
mofa_dt <- read_tsv(file.path(output_path, "mofa_preRA_cd4_rna_adt_olink_tf_pseudobulk_data_092023.tsv"))

In [None]:
# create a MOFA object
MOFAobject <- create_mofa(mofa_dt)

In [None]:
print(MOFAobject)
plot_data_overview(MOFAobject)
ggsave(file.path(fig_path, paste0(proj_name, '_data_overview.png')), 
       width=6, height=6)

In [None]:
data_opts <- get_default_data_options(MOFAobject)
data_opts$scale_views=TRUE

In [None]:
model_opts <- get_default_model_options(MOFAobject)
# set the number of factors to 6
model_opts$num_factors <- 6
head(model_opts)

In [None]:
train_opts <- get_default_training_options(MOFAobject)
head(train_opts)

In [None]:
MOFAobject <- prepare_mofa(
  object = MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

# train mofa model

In [None]:
# train mofa model
outfile <- file.path(output_path, "mofa_preRA_cd4_rna_adt_olink_tf_pseudobulk_model_09202023.hdf5")
MOFAobject.trained <- run_mofa(MOFAobject, outfile, use_basilisk = TRUE)


# downstream analysis

In [None]:
# load the trained model
model <- load_model(file.path(output_path, "mofa_preRA_cd4_rna_adt_olink_tf_pseudobulk_model_09202023.hdf5"))

In [None]:
plot_data_overview(model)
ggsave(file.path(fig_path, paste0(proj_name, '_data_overview.png')), 
       width=4, height=4)

In [None]:
# add metadata to the model
meta_data <- meta_data %>% filter(sample!='BR2024')%>%
    mutate(status=factor(recode(cohort,'at_risk'='ARI', 'healthy'='CON2'),
                        levels=c('CON2', 'ARI')))
meta_data

In [None]:
# add metadata to the model
samples_metadata(model) <- meta_data

In [None]:
# extract the facot 
factors_values <- get_factors(model, 
  factors = "all", 
  as.data.frame = TRUE
) %>% as_tibble() %>% left_join(samples_metadata(model), by='sample') %>%
    mutate(cohort=factor(cohort, levels = c('healthy', 'at_risk')))
head(factors_values)


In [None]:
# check if the factors are correlated with each other
plot_factor_cor(model)

In [None]:
# Variance explained for every factor in per view and group
variance_exp <- model@cache$variance_explained$r2_per_factor[[1]] %>% as_tibble(rownames = "factor")
variance_exp %>% head()


In [None]:
# plot the total variance explained
total_variance <- calculate_variance_explained(model)$r2_total$single_group %>%
    as_tibble(rownames = "modality") %>%
    dplyr::rename("variance" = "value")
total_variance %>% ggbarplot(x = "modality", y = "variance", fill = "steelblue")


In [None]:
calculate_variance_explained(model)

In [None]:
# plot the variance explained by factor 1
f1_variance <- calculate_variance_explained(model)$r2_per_factor$single_group %>%
    as_tibble(rownames = "factor") %>%
    filter(factor == "Factor1") %>%
    pivot_longer(cols = -factor, names_to = "modality", values_to = "variance") %>%
    mutate(modality = factor(modality, levels = c("olink", "rna", "tf", "adt")))
f1_variance %>% ggbarplot(
    x = "modality", y = "variance", ylab = "", xlab = "", title = "% variance \nexplained by F1",
    fill = "modality", legend = "none", rotate = TRUE,
    palette = npg_color
)
ggsave(file.path(fig_path, paste0(proj_name, "_variance_decomposition_f1.pdf")), width = 3, height = 3)


In [None]:
# plot the variance in each modality explained by the mofa factors
p1 <- plot_variance_explained(model, x = "view", y = "factor") +
    scale_x_discrete(labels = c(
        "adt" = "ADT", "olink" = "OLINK",
        "rna" = "RNA", "tf" = "ATAC (tf)"
    )) +
    scale_y_discrete(labels = c(
        "Factor1" = "F1", "Factor2" = "F2", "Factor3" = "F3",
        "Factor4" = "F4", "Factor5" = "F5", "Factor6" = "F6"
    )) +
    theme(
        axis.text.x = element_text(
            size = 16, angle = 45, hjust = 1
        ),
        axis.text.y = element_text(
            size = 16
        )
    )
p1
ggsave(file.path(fig_path, paste0(proj_name, "_variance_decomposition_factor_heatmap.pdf")), width = 3, height = 3)


In [None]:
# plot the variance explained
p1 <- plot_variance_explained(model, x = "view", y = "factor")
# Labels are keyed by view name so the mapping does not depend on the order
# in which the views are stored in the model.
p1 + ylab("Factors") + xlab("Modalities") +
    scale_x_discrete(labels = c(
        "adt" = "ADT", "olink" = "Plasma\nProtein",
        "rna" = "RNA", "tf" = "TF"
    )) +
    theme(
        axis.title = element_text(size = 24),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()
    )
ggsave(file.path(fig_path, paste0(proj_name, "_variance_decomposition_facoter_modality.pdf")), width = 4, height = 4)


In [None]:
plot_variance_explained(model, x = "group", y = "factor", plot_total = T)[[2]]
ggsave(file.path(fig_path, paste0(proj_name, "_total_variance_explained.pdf")), width = 4, height = 4)


In [None]:
# run glm ARI vs healthy
# Fit `formula` (a string) on `data` with stats::glm() using its default
# family and return the coefficient table from broom::tidy().
glm_test <- function(data, formula) {
    broom::tidy(stats::glm(as.formula(formula), data = data))
}
# one model per MOFA factor: factor score ~ status + age
stats_glm <- factors_values %>%
    mutate(status = factor(status, levels = c("CON2", "ARI"))) %>%
    group_by(factor) %>%
    group_modify(~ glm_test(.x, formula = "value ~ status + age"))
# Multiple-testing correction: keep only the status term and drop the
# per-factor grouping BEFORE adjusting, so the BH family is exactly the set of
# per-factor status tests (age terms and grouping no longer influence it).
stats_glm %>%
    ungroup() %>%
    filter(term == "statusARI") %>%
    mutate(p.value.adj = stats::p.adjust(p.value, method = "BH")) %>%
    arrange(p.value.adj)
# factor 1 significant different between ARI and healthy
# NOTE(review): conclusion recorded from the original run — re-check the
# adjusted p-values after re-running with the corrected adjustment family.

In [None]:
# plot factor1
factors_values %>%
    filter(factor == "Factor1") %>%
    ggpubr::ggboxplot(
        x = "status", y = "value", add = "jitter",
        color = "status", palette = con_ari_colors
    ) +
    # ggpubr::stat_compare_means() +
    NoLegend() + ggtitle("F1") +
    theme(
        plot.title = element_text(hjust = 0.5, size = 16),
        axis.text.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 16)
    ) + xlab("") + ylab("Factor score")

ggsave(file.path(fig_path, paste0(proj_name, "_factor1_status.pdf")), width = 3, height = 3)


In [None]:
# plot other factors for comparison
# violin + dot plot of every factor except Factor1, split by status and
# facetted by factor
factors_values %>%
  filter(factor != "Factor1") %>%
  ggpubr::ggviolin(
    x = "status", y = "value", add = "dotplot",
    color = "status", palette = con_ari_colors
  ) +
  # ggpubr::stat_compare_means() +
  NoLegend() + # ggtitle('Factor 1')+
  theme(plot.title = element_text(hjust = 0.5)) + xlab("") + ylab("Factor score") + facet_wrap(vars(factor))
ggsave(file.path(fig_path, paste0(proj_name, "_notsig_factor_status.pdf")), width = 4, height = 4)


## analyze Factor1

In [None]:
# plot the top features of factor 1 in every view
factor <- 1
# Weight plot of one view for the factor selected above; the same settings
# are used for all four views.
plot_f1_weights <- function(view) {
  plot_weights(model,
    view = view,
    factor = factor,
    nfeatures = 10, # Number of features to highlight
    scale = TRUE, # Scale weights from -1 to 1
    abs = FALSE # Take the absolute value?
  )
}
p1 <- plot_f1_weights("rna")
p2 <- plot_f1_weights("adt")
p3 <- plot_f1_weights("olink")
p4 <- plot_f1_weights("tf")
cowplot::plot_grid(p1, p2, p3, p4, nrow = 2)
ggsave(file.path(fig_path, paste0(proj_name, "_cd4na_factor1_weights.png")),
  width = 12, height = 8
)


In [None]:
# extract the weight of the models
weight <- get_weights(model, views='all',  as.data.frame = TRUE) %>% 
    mutate(direction=if_else(value>0, 'up', 'down'))
weight %>% head()

In [None]:
# save feature weights
weight %>% write_tsv(file.path(output_path, 'Mofa_preRA_cd4na_09202023_factor_weights.tsv'))

In [None]:
# plot dot plot of top features
# top 10 Factor1 weights (largest values) within each view
f1_top_weights <- weight %>%
  filter(factor == "Factor1") %>%
  group_by(view) %>%
  slice_max(order_by = value, n = 10) %>%
  ungroup() %>%
  arrange(desc(value)) %>%
  mutate(
    view = recode(view,
      "rna" = "RNA", "tf" = "Transcription Factor",
      "adt" = "Surface protein", "olink" = "plasma protein"
    ),
    # strip only the leading "<view>_" prefix; splitting on every "_" and
    # keeping the second piece would truncate names that contain "_"
    feature = str_remove(as.character(feature), "^[^_]+_")
  )
ggdotchart(f1_top_weights,
  x = "feature", y = "value",
  color = "view", # Color by groups
  palette = npg_color, # Custom color palette
  sorting = "descending", # Sort value in descending order
  rotate = TRUE, # Rotate vertically
  dot.size = 3, # Large dot size
  y.text.col = TRUE, # Color y text by groups
  ggtheme = theme_pubr() # ggplot2 theme
) +
  theme_cleveland()
ggsave(file.path(fig_path, paste0(proj_name, "_cd4na_factor1_weights_dotplot.pdf")),
  width = 6, height = 8
)


### check TF activity associated with factor 1

In [None]:
# annotate all Factor1 TF weights
# Flags the NFAT family, labels the direction of each weight and ranks the
# weights within each direction.
nfat_features <- c("tf_NFATC3", "tf_NFATC2", "tf_NFATC1", "tf_NFATC4")
f1_tf_weights <- weight %>%
    filter(factor == "Factor1" & view == "tf") %>%
    group_by(direction) %>%
    arrange(desc(value)) %>%
    mutate(`Transcription factors` = if_else(feature %in% nfat_features, "NFATs", "Other TFs")) %>%
    mutate(
        view = recode(view,
            "rna" = "RNA", "tf" = "Transcription Factor",
            "adt" = "Surface protein", "olink" = "plasma protein"
        ),
        # strip only the leading "tf_" prefix so names containing "_" survive
        feature = str_remove(as.character(feature), "^tf_")
    ) %>%
    mutate(
        `Transcription Factor Activity` = factor(if_else(direction == "up", "Enriched in ARI", "Enriched in Controls"),
            levels = c("Enriched in Controls", "Enriched in ARI")
        ),
        # rank is computed within each direction (the table is still grouped)
        rank = rank(value)
    )
f1_tf_weights %>% head()
# f1_tf_weights%>%ggplot(aes(x=value, y=))


#### plot fig 

In [None]:
# plot dot plot of top features
# 15 strongest Factor1 TF weights in each direction
options(repr.plot.width = 5, repr.plot.height = 5)
nfat_features <- c("tf_NFATC3", "tf_NFATC2", "tf_NFATC1", "tf_NFATC4")
f1_top_tf_weights <- weight %>%
  filter(factor == "Factor1" & view == "tf") %>%
  group_by(direction) %>%
  slice_max(order_by = abs(value), n = 15) %>%
  ungroup() %>%
  arrange(desc(value)) %>%
  mutate(`Transcription factors` = if_else(feature %in% nfat_features, "NFATs", "Other TFs")) %>%
  mutate(
    view = recode(view,
      "rna" = "RNA", "tf" = "Transcription Factor",
      "adt" = "Surface protein", "olink" = "plasma protein"
    ),
    # strip only the leading "tf_" prefix so names containing "_" survive
    feature = str_remove(as.character(feature), "^tf_")
  ) %>%
  mutate(`TF Activity` = factor(if_else(direction == "up", "Enriched in ARI", "Enriched in Controls"),
    levels = c("Enriched in Controls", "Enriched in ARI")
  ))

ggdotchart(f1_top_tf_weights,
  x = "feature", y = "value",
  color = "TF Activity", # Color by groups
  palette = con_ari_colors, # Custom color palette
  sorting = "descending", # Sort value in descending order
  rotate = TRUE, # Rotate vertically
  dot.size = 3, # Large dot size
  y.text.col = TRUE, # Color y text by groups
  # ggtheme = theme_bw(),                    # ggplot2 theme
  ylab = "Weight associated with Factor 1"
) +
  theme_cleveland() + geom_hline(yintercept = 0, linetype = 2) +
  theme(
    legend.position = "top", legend.box = "vertical", legend.margin = margin(),
    axis.text.y = element_text(color = "black")
  ) + guides(color = guide_legend(nrow = 2))
ggsave(file.path(fig_path, paste0(proj_name, "_cd4na_factor1_tf_weights_dotplot.pdf")),
  width = 5, height = 5
)


In [None]:
### plot fig S7D

In [None]:
# chromVAR Z-scores of selected TFs, compared between CON2 and ARI subjects
p1 <- chromvar_tf %>%
    # BR2024 was removed from the MOFA input and from the metadata (positive
    # CCP result); drop it here as well so the comparison uses the same subjects
    filter(subject_id != "BR2024") %>%
    # NOTE(review): status is inferred from the subject id containing "CU" —
    # confirm this matches the cohort column of the metadata
    mutate(status = factor(if_else(str_detect(subject_id, "CU"), "ARI", "CON2"),
        levels = c("CON2", "ARI")
    )) %>%
    filter(tf %in% c("NFATC3", "NFATC1", "NFATC2", "FOXP3")) %>%
    ggpubr::ggboxplot(
        x = "status", y = "value", palette = con_ari_colors,
        ylab = "Chromvar Z scores", xlab = "",
        color = "status", add = "jitter"
    ) +
    ggpubr::stat_compare_means(
        method = "wilcox.test", label.y = 2.5,
        aes(label = paste0("p = ", after_stat(p.format)))
    ) +
    facet_wrap(vars(tf), ncol = 2)
p1
ggsave(file.path(fig_path, paste0(proj_name, "_NFATs_Chromvar_zscores.pdf")), width = 4, height = 4)


In [None]:
# plot f1 tf values
# Rank all Factor1 TF weights (1 = largest) and label a few TFs of interest.
tf_to_label <- c("NFATC3", "NFATC2", "NFATC1", "NFATC4", "FOXP3", "BATF3", "EGR2")
f1_tf_values <- weight %>%
  filter(view == "tf" & factor == "Factor1") %>%
  arrange(desc(value)) %>%
  mutate(
    # seq_along() is safe for an empty table, unlike 1:length()
    rank = seq_along(value),
    # replace() keeps the type of `feature` and avoids mixing it with an
    # untyped NA, which strict if_else() versions reject
    label_feature = replace(feature, !(str_remove(feature, "tf_") %in% tf_to_label), NA)
  )
f1_tf_values %>% ggplot(aes(x = value, y = rank, label = label_feature, color = label_feature)) +
  ggrepel::geom_text_repel(size = 6, nudge_x = 0.1) +
  geom_vline(xintercept = 0.4) +
  scale_y_reverse() +
  geom_point(size = 0.8) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
ggsave(
  file.path(fig_path, paste0(
    proj_name,
    "_cd4na_factor1_TF_weights_NFAT.pdf"
  )),
  width = 4, height = 4
)


In [None]:
# check the natural cutoff for the tf weights of factor 1
p1 <- plot_weights(model,
  view = "tf",
  factor = 1,
  manual = c("tf_NFATC3", "tf_NFATC2", "tf_NFATC1", "tf_NFATC4"),
  text_size = 4,
  nfeatures = 20, # Number of features to highlight
  scale = TRUE, # Scale weights from -1 to 1
  abs = FALSE # Take the absolute value?
)

# vertical line marks the candidate cutoff at a scaled weight of 0.4
p1 + geom_vline(xintercept = 0.4) + ggrepel::geom_label_repel(aes(label = feature)) #+ scale_color_manual(values = cluster_colors)
ggsave(
  file.path(fig_path, paste0(
    proj_name,
    "_cd4na_factor1_TF_weights.pdf"
  )),
  width = 4, height = 4
)


In [None]:
# Factor1 TF weights ordered by absolute size, without ENSG/DUX entries
factor1_tf <- weight %>%
    filter(factor == "Factor1" & view == "tf") %>%
    # The figure code in this notebook labels positive weights ("up") as
    # "Enriched in ARI"; use the same orientation here (it was inverted).
    # NOTE(review): confirm the sign of Factor1 against the factor boxplot.
    mutate(direction = if_else(value > 0, "at-risk", "healthy")) %>%
    arrange(desc(abs(value))) %>%
    filter(!str_detect(feature, "ENSG|DUX"))
# factor1_tf %>% group_by(direction) %>% slice_max(order_by = abs(value),n = 60)


In [None]:
plot_data_scatter(model,
     factor = 1,
     features = c(
          "tf_NFATC3", "tf_STAT5A", "tf_STAT3", "tf_JUND", "tf_BCL6", "tf_FOS", "tf_JUNB",
          "tf_BATF"
     ),
     view = "tf", color_by = "cohort"
)
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_tf_correlation_cohort_nfat_partners.png")),
     width = 8, height = 4
)


In [None]:
# get factor 1 and tf data
f1_tf_data <- get_data(model,
    as.data.frame = TRUE,
    view = "tf"
) %>%
    mutate(feature = str_remove(feature, "tf_")) %>%
    dplyr::rename("TF_activity" = "value") %>%
    left_join(filter(factors_values, factor == "Factor1") %>%
        dplyr::rename("Factor_score" = "value"), by = "sample")
f1_tf_data %>% head()


In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
f1_tf_data_nfat <- f1_tf_data %>% filter(feature %in% c("FOXP3", "NFATC1", "NFATC2", "NFATC3"))
ggpubr::ggscatter(f1_tf_data_nfat,
       x = "Factor_score", y = "TF_activity",
       xlab = "Factor1 score", ylab = "ChromVAR activity",
       color = "status", facet.by = "feature", palette = con_ari_colors,
       add = "reg.line", # Add regressin line
       add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
       conf.int = TRUE, # Add confidence interval
       cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
       cor.coeff.args = list(method = "spearman", label.x = -0.8, label.y = 1.5, label.sep = "\n"),
       ggtheme = theme_classic2()
) + scale_fill_manual(values = con_ari_colors) + theme(legend.position = "top")
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_tf_correlation_cohort_nfat_foxp3.pdf")),
       width = 5, height = 5
)


### check RNA

In [None]:
# Factor1 RNA weights ordered by absolute size, plus the 50 strongest genes
# in each direction
factor1_rna <- weight %>%
    filter(factor == "Factor1" & view == "rna") %>%
    # The figure code in this notebook labels positive weights ("up") as
    # "Enriched in ARI"; use the same orientation here (it was inverted).
    # NOTE(review): confirm the sign of Factor1 against the factor boxplot.
    mutate(direction = if_else(value > 0, "at-risk", "healthy")) %>%
    arrange(desc(abs(value)))
factor1_rna_sel <- factor1_rna %>%
    group_by(direction) %>%
    slice_max(order_by = abs(value), n = 50)


In [None]:
p1 <- plot_data_scatter(model,
      factor = 1,
      features = c("rna_STIM1", "rna_STIM2"),
      view = "rna", color_by = "cohort"
)
p1
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_Ca_channel.png")),
      width = 6, height = 3
)


In [None]:
# check il
il_rna <- factor1_rna %>%
       filter(str_detect(feature, "IL\\d|IFN.R|TGF|CD28|CD69") &
              abs(value) > 0.6 & feature != "rna_PPIL4")
il_rna
p1 <- plot_data_scatter(model,
       factor = 1,
       features = il_rna$feature %>% as.character(),
       view = "rna", color_by = "cohort"
)
p1
# il_rna <- factor1_rna %>%
#     filter(str_detect(feature, 'IL\\d|IFN.R|TGF|CD28|CD69')&
#            abs(value)>0.5)
# p2 <- plot_data_scatter(model, factor=1,
#                   features = il_rna$feature %>% as.character(),
#                         view='rna', color_by='cohort')
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_cohort_il_receptor.png")),
       width = 8, height = 6
)


In [None]:
# check exhaustion
# This cell previously re-used the interleukin pattern, plotted `il_rna` and
# overwrote the IL-receptor figure; it now selects, plots and saves
# exhaustion genes under their own file name.
# NOTE(review): the marker list is an assumption (RNA counterparts of the ADT
# markers TIGIT/CD278/CD279 plotted in the ADT section, plus common checkpoint
# genes) — adjust to the intended gene set.
exhaustion_genes <- paste0("rna_", c(
       "TIGIT", "ICOS", "PDCD1", "CTLA4", "LAG3", "HAVCR2", "TOX"
))
exh_rna <- factor1_rna %>%
       filter(feature %in% exhaustion_genes)
exh_rna
# plot only when at least one marker is present in the model
if (nrow(exh_rna) > 0) {
       p1 <- plot_data_scatter(model,
              factor = 1,
              features = exh_rna$feature %>% as.character(),
              view = "rna", color_by = "cohort"
       )
       print(p1)
       ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_cohort_exhaustion.png")),
              plot = p1, width = 8, height = 6
       )
}


In [None]:
# plot ca2+ related gene expression
tcr_gene <- paste0("rna_", c(
       "VAV1", "GRB2", "GRAP2", "NFATC2", "NFAT5", "CBL",
       "CARD11", "LCK", "IKBKB", "CD4", "PIK3R1", "MALT1",
       "RAF1", "SOS1", "NCK2", "CD3G", "LCP2", "LAT", "NCK1", "CD3D", "CD3E", "RASGRP1", "MAP2K1",
       "PTPRC", "NFATC3", "MAP3K7", "PPP3CC", "PPP3CA", "GSK3B"
))
factor1_rna %>%
       filter(str_detect(feature, "PPP3CC|PLCG2|CALM1") | feature %in% tcr_gene) %>%
       filter(abs(value) > 0.6)

p1 <- plot_data_scatter(model,
       factor = 1,
       features = paste0("rna_", c("PPP3CC", "PLCG2", "PPP3CA", "LCK", "CD4", "CD3G", "CD3D")),
       view = "rna", color_by = "cohort"
)
p1
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_cohort_ca_signaling_pos.png")),
       width = 8, height = 6
)
p2 <- plot_data_scatter(model,
       factor = 1,
       features = paste0("rna_", c("CABIN1", "CSNK1A1")),
       view = "rna", color_by = "cohort"
)
p2
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_cohort_ca_signaling_neg.png")),
       width = 6, height = 3
)


In [None]:
# check NFAT gene expression
# factor1_rna %>%  filter(str_detect(feature,'NFAT|FOXP3'))

# c('rna_STAT3',  'rna_NFATC3', 'rna_BATF') %in% weight$feature
plot_data_scatter(model,
      factor = 1,
      features = factor1_rna %>% filter(str_detect(feature, "NFAT|FOXP3")) %>% pull(feature) %>% as.character(),
      view = "rna", color_by = "cohort"
)
ggsave(file.path(fig_path, paste0(proj_name, "_facoter1_rna_correlation_cohort_NFATs.png")),
      width = 8, height = 4
)


In [None]:
# change the row names for plotting
# NOTE(review): this edits the loaded model in place — the "rna_" prefix is
# removed from the row names of the RNA data matrix only, so later lookups of
# "rna_"-prefixed features against this matrix may no longer match; confirm
# nothing downstream depends on the prefixed names.
rownames(model@data$rna$single_group)<-rownames(model@data$rna$single_group)%>%str_remove('rna_')

In [None]:
# plot heatmap results for deg pseudobulk
#
# Build a samples (rows) x genes (columns) heatmap of pseudobulk expression
# with a CON2/ARI status annotation on the left.
#
# Arguments:
#   pseudo            object supporting assay() and colData(); its colData must
#                     contain a `status` column
#   genes             row names of the assay to plot
#   gene_meta         not used; kept so existing calls keep working
#   genes_hightlight  optional subset of `genes` to mark with labels on top of
#                     the heatmap
#   assay             name of the assay to read
#   celltype_col, celltype_colors, batch_colors
#                     not used; kept so existing calls keep working
#   scale             if TRUE, z-score every gene across samples before plotting
#
# Returns the ComplexHeatmap Heatmap object (not drawn).
# Stops if a requested gene is absent from the assay, if a highlighted gene is
# not in `genes`, or if `status` is missing from colData.
PlotDegHeatmap <- function(pseudo, genes, gene_meta = NULL, genes_hightlight = NULL,
                           assay = "normalized_counts", celltype_col,
                           celltype_colors = cluster_colors, batch_colors = cluster_colors_ext, scale = TRUE) {
    # library() instead of require(): a missing package now fails here with a
    # clear error. The package is still attached because the cells that call
    # this function use draw() unqualified.
    library("ComplexHeatmap")

    # set up the data matrix; drop = FALSE keeps a genes x samples matrix even
    # when a single gene is requested
    gex_all <- assay(pseudo, assay)
    missing_genes <- setdiff(genes, rownames(gex_all))
    if (length(missing_genes) > 0) {
        stop("gene(s) not found in assay '", assay, "': ",
            paste(missing_genes, collapse = ", "),
            call. = FALSE
        )
    }
    if (!is.null(genes_hightlight) && !all(genes_hightlight %in% genes)) {
        stop("gene(s) to hightlight are not in the deg list.")
    }
    gex_matrix <- as.matrix(gex_all[genes, , drop = FALSE])

    # sample annotation (samples are the heatmap rows after the transpose below)
    metadata <- as.data.frame(colData(pseudo))
    if (!"status" %in% colnames(metadata)) {
        stop("colData(pseudo) must contain a `status` column.", call. = FALSE)
    }
    col_anno <- rowAnnotation(
        df = metadata[, "status", drop = FALSE],
        col = list(status = c("CON2" = con_ari_colors[1], "ARI" = con_ari_colors[2])),
        annotation_legend_param = list(
            status = list(direction = "horizontal")
        )
    )

    # gene marks: genes are the heatmap COLUMNS, so the marks must be a column
    # annotation; the font size goes to `labels_gp` (a positional gpar() would
    # be taken as `which`)
    genename_anno <- NULL
    if (!is.null(genes_hightlight)) {
        highlight_index <- which(rownames(gex_matrix) %in% genes_hightlight)
        genename_anno <- columnAnnotation(highlight = anno_mark(
            at = highlight_index,
            labels = rownames(gex_matrix)[highlight_index],
            which = "column",
            labels_gp = grid::gpar(fontsize = 12)
        ))
    }

    # kept from the original implementation (sets the global RNG seed)
    set.seed(1221)
    if (scale) {
        # z-score each gene across samples
        gex_matrix <- t(scale(t(gex_matrix)))
    }

    ComplexHeatmap::Heatmap(t(gex_matrix),
        left_annotation = col_anno,
        top_annotation = genename_anno,
        cluster_rows = TRUE,
        col = colorRampPalette(c(nejm_color[2], "white", nejm_color[1]))(100),
        row_names_max_width = grid::unit(10, "cm"),
        show_column_names = TRUE, show_row_names = TRUE,
        column_names_gp = grid::gpar(fontsize = 12),
        row_names_gp = grid::gpar(fontsize = 12),
        heatmap_legend_param = list(
            title = "Scaled\nexpression", title_gp = grid::gpar(fontsize = 12),
            legend_height = grid::unit(6, "cm"), direction = "horizontal"
        )
    )
}


In [None]:
# add a CON2/ARI status factor to the pseudobulk sample annotation
colData(cd4na_rna_psudo)$cohort %>% unique()
colData(cd4na_rna_psudo)$status <- factor(
    if_else(colData(cd4na_rna_psudo)$cohort == "at_risk", "ARI", "CON2"),
    levels = c("CON2", "ARI")
)
colData(cd4na_rna_psudo)$status %>% unique()

In [None]:
# TCR- and calcium-signalling genes shown in the heatmap
# (de-duplicated, order of first appearance kept)
tcr_gene <- c(
    "VAV1", "GRB2", "GRAP2", "NFATC2", "NFAT5", "CBL",
    "CARD11", "LCK", "IKBKB", "CD4", "PIK3R1", "MALT1",
    "RAF1", "SOS1", "NCK2", "LCP2", "LAT", "NCK1", "RASGRP1", "MAP2K1",
    "PTPRC", "NFATC3", "MAP3K7", "PPP3CC", "PPP3CA", "GSK3B",
    "PLCG2", "CABIN1", "STIM1", "STIM2",
    "CSNK1A1", "NFATC1",
    "NFATC2IP"
)


In [None]:
# plot the gene heatmap in horizontal
# Build the heatmap before opening the pdf device and always close the device
# afterwards, so an error cannot leave a graphics device open.
p1 <- PlotDegHeatmap(cd4na_rna_psudo, genes = tcr_gene)
pdf(file.path(fig_path, paste0(proj_name, "_facoter1_rna_tcr_ca_heatmap.pdf")),
    width = 6, height = 4
)
invisible(tryCatch(
    draw(p1, heatmap_legend_side = "top"),
    finally = dev.off()
))


In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)
draw(p1, heatmap_legend_side = "top")

In [None]:
# ox_genes <- rna_weight_gsea %>% arrange(NES) %>% mutate(factor='Factor1') %>% 
#     dplyr::filter(str_detect(pathway, 'OXIDATIVE')) %>% pull(leadingEdge) %>% 
#     str_split(', ')
# ox_genes <- paste0('rna_', ox_genes[[1]])
# myc_genes <- rna_weight_gsea %>% arrange(NES) %>% mutate(factor='Factor1') %>% 
#     dplyr::filter(str_detect(pathway, 'HALLMARK_MYC_TARGETS_V1')) %>% pull(leadingEdge) %>% 
#     str_split(', ')
# myc_genes <- paste0('rna_', myc_genes[[1]])

In [None]:
# # plot rna data related to TCR and NFAT signaling pathway
# # ox_genes <-factor1_rna %>%  filter(feature %in% ox_genes& value< (-0.5)) %>%
# #     pull(feature) %>% as.character()
# # ox_genes <- paste0('rna_', ox_genes)
# ox_genes %>% length()
# p3 <- plot_data_heatmap(model, #max.value = 3,
#   view = "rna", 
# annotation_samples='cohort',                      #  fontsize_row=6,
#   factor = 1,  
#   features = ox_genes, 
#    denoise = FALSE,
#   cluster_rows = TRUE, cluster_cols = FALSE,
#   show_rownames = TRUE, show_colnames = FALSE,
#                         #main = "Genes in Ca2+/NFAT pathway",
#   scale = "row", color=colorRampPalette(c(nejm_color[2], "white", nejm_color[1]))(100)
# )
# png(file.path(fig_path, paste0(proj_name, '_facoter1_rna_oxidative_ca_heatmap.png')),
#     units = 'in',res = 300, 
#     width=8, height=8)
# print(p3)
# dev.off()

In [None]:
# # plot rna data related to TCR and NFAT signaling pathway
# # ox_genes <-factor1_rna %>%  filter(feature %in% ox_genes& value< (-0.5)) %>%
# #     pull(feature) %>% as.character()

# myc_genes %>% length()
# p3 <- plot_data_heatmap(model, # max.value = 3,
#   view = "rna",
#   annotation_samples = "cohort", #  fontsize_row=6,
#   factor = 1,
#   features = myc_genes,
#   denoise = FALSE,
#   cluster_rows = TRUE, cluster_cols = TRUE,
#   show_rownames = TRUE, show_colnames = FALSE,
#   # main = "Genes in Ca2+/NFAT pathway",
#   scale = "row", color = colorRampPalette(c(nejm_color[2], "white", nejm_color[1]))(100)
# )
# png(file.path(fig_path, paste0(proj_name, "_facoter1_rna_myc_target_heatmap.png")),
#   units = "in", res = 300,
#   width = 8, height = 8
# )
# print(p3)
# dev.off()


### check ADT

In [None]:
# check for a natural cutoff in the Factor 1 ADT weights
p1 <- plot_weights(model,
  view = "adt",
  factor = 1,
  nfeatures = 30,     # Number of features to highlight
  scale = TRUE,       # Scale weights from -1 to 1
  abs = FALSE         # Take the absolute value?
)
# vertical reference line at a scaled weight of 0.5
p1 + geom_vline(xintercept = 0.5)
# no plot is passed, so ggsave() writes the most recent ggplot
ggsave(file.path(fig_path, paste0(proj_name, '_facoter1_adts.png')),
       width=5, height=5)

In [None]:
# Factor 1 ADT weights, labelled by sign and ordered by absolute weight
factor1_adt <- weight %>%
    filter(factor == "Factor1", view == "adt") %>%
    mutate(direction = if_else(value > 0, "healthy", "at-risk")) %>%
    arrange(desc(abs(value)))
# up to 25 strongest features per direction (result stays grouped by direction)
factor1_adt_sel <- factor1_adt %>%
    group_by(direction) %>%
    slice_max(order_by = abs(value), n = 25)
# preview the 20 most positive selected features
factor1_adt_sel %>%
    arrange(desc(value)) %>%
    head(20)

In [None]:
# check exhaustion markers: Factor 1 values against selected ADT features
exhaustion_adts <- c('adt_TIGIT', 'adt_CD278', 'adt_CD279')
p1 <- plot_data_scatter(
    model,
    factor = 1,
    features = exhaustion_adts,
    view = 'adt',
    color_by = 'cohort'
)
# p2 <- plot_data_scatter(model, factor=1,
#                   features = c('rna_TIGIT', 'rna_HAVCR2', 'adt_PDCD1'), 
#                         view='rna', color_by='cohort') 
# cowplot::plot_grid(p1, p2, nrow = 2)
p1
# no plot is passed, so ggsave() writes the most recent ggplot
ggsave(
    file.path(fig_path, paste0(proj_name, '_facoter1_adt_correlation_cohort_cd278_cd279_TIGIT.png')),
    width = 9, height = 3
)

In [None]:
# Factor 1 values against CX3CR1, CD64 and Ig light chain kappa ADT features
fc_adts <- c('adt_CX3CR1', 'adt_CD64',  'adt_Ig-light-chain-k')
plot_data_scatter(
    model,
    factor = 1,
    features = fc_adts,
    view = 'adt',
    color_by = 'cohort'
)
# no plot is passed, so ggsave() writes the most recent ggplot
ggsave(
    file.path(fig_path, paste0(proj_name, '_facoter1_adt_correlation_cohort_CX3CR1_fc.png')),
    width = 8, height = 4
)

In [None]:
# heatmap of the 20 olink features selected for Factor 1, scaled per row
p1 <- plot_data_heatmap(
    model, #max.value = 3,
    view = "olink",
    factor = 1,
    features = 20,
    denoise = FALSE,
    main = 'olink',
    fontsize_row = 6,
    cluster_rows = TRUE, cluster_cols = TRUE,
    show_rownames = TRUE, show_colnames = TRUE,
    scale = "row"
)
# write the same heatmap to a 300 dpi PNG
olink_heatmap_png <- file.path(fig_path, paste0(proj_name, '_facoter1_olink_top20_heatmap.png'))
png(olink_heatmap_png, units = 'in', res = 300, width = 5, height = 5)
print(p1)
dev.off()

In [None]:
# Factor 1 values against olink IL17D, colored by cohort
plot_data_scatter(
    model,
    factor = 1,
    features = c('olink_IL17D'),
    view = 'olink',
    color_by = 'cohort'
)
# no plot is passed, so ggsave() writes the most recent ggplot
ggsave(
    file.path(fig_path, paste0(proj_name, '_facoter1_olink_correlation_cohort_IL17D.pdf')),
    width = 5, height = 4
)

## run GSEA in MOFA factor 1 rna features

In [None]:
# Load and initialize pathway database
gmxFile <- "/home/jupyter/data/Reference/AT_hallmark_reactome_kegg.symbols.gmt"
# GMT rows have a variable number of fields, so size the table to the widest row
colNames <- max(count.fields(file = gmxFile, sep = "\t"))
colNames <- as.character(seq_len(colNames))
gmx <- read.table(
  file = gmxFile,
  sep = "\t",
  quote = "\"",
  fill = TRUE,
  col.names = colNames,
  row.names = 1
)
# drop the first remaining column (field 2 of the file); the row names hold
# field 1. drop = FALSE keeps a table even if a single gene column remains
gmx_mx <- as.matrix(gmx[, -1, drop = FALSE])
# one character vector per pathway, without the "" padding added by fill = TRUE.
# lapply() always returns a list, whereas apply() would simplify to a matrix
# if every pathway happened to have the same number of genes
gmx <- lapply(
  setNames(seq_len(nrow(gmx_mx)), toupper(rownames(gmx_mx))),
  function(i) setdiff(unname(gmx_mx[i, ]), "")
)


In [None]:
# convert the list of pathways to a binary pathway-by-gene matrix

In [None]:
# convert the pathway list into a 0/1 table: one row per pathway, one column
# per gene (prefixed with "rna_"), 1 where the gene belongs to the pathway
pathway_table <- seq_along(gmx) %>% # seq_along() is safe when gmx is empty
    lapply(function(i) {
        # long format: one row per (pathway, gene) pair
        pathway_tb <- tibble("pathway" = names(gmx[i]), "genes" = gmx[[i]], value = 1) %>%
            mutate(genes = paste0("rna_", genes))
        return(pathway_tb)
    }) %>%
    data.table::rbindlist() %>%
    pivot_wider(id_cols = pathway, names_from = genes, values_from = "value", values_fill = 0)


In [None]:
# convert the table into a matrix with pathways as row names
pathway_mx <- as.matrix(dplyr::select(pathway_table, -pathway))
rownames(pathway_mx) <- pathway_table[["pathway"]]

In [None]:
# preview the top-left 5 x 5 corner of the pathway membership matrix
pathway_mx[seq_len(5), seq_len(5)]

In [None]:
# extract the feature weights of all views as a long data frame
weight <- get_weights(model, views = 'all', as.data.frame = TRUE)
# preview the rna features with the lowest weights
weight %>%
    filter(view == 'rna') %>%
    arrange(value) %>%
    head()

In [None]:
#' @title \code{RunGSEA}
#'
#' @description \code{RunGSEA} runs fgsea pathway enrichment separately for each
#'   group (e.g. cell type) found in \code{deg_list} and returns the pathways
#'   with adjusted p-value < 0.05, together with helper columns for plotting.
#'
#' @param deg_list data frame with a \code{gene} column, the ranking statistic
#'   named by \code{rank.by} and the grouping column named by \code{ct.col}
#' @param rank.by name of the column in \code{deg_list} used to rank the genes
#' @param gmx named list of gene sets (character vectors); if \code{NULL}, the
#'   curated list AT_hallmark_reactome_kegg.symbols.gmt is loaded
#' @param ct.col name of the column in \code{deg_list} that specifies the cell type
#' @param collapsePathways whether to keep only the main pathways returned by
#'   \code{fgsea::collapsePathways}
#' @param ncores number of workers passed to \code{BiocParallel::MulticoreParam};
#'   if \code{NULL}, the detected number of cores minus 3 is used (at least 1)
#'
#' @return a tibble with one row per significant pathway and group, with columns
#'   pathway, pval, padj, NES, leadingEdge (genes joined by ", "), celltype,
#'   group ("up" when NES > 0, otherwise "down"), pID (row index), lesize (number
#'   of leading-edge genes), gsize (size of the gene set in \code{gmx}) and
#'   propGenes (lesize / gsize * 100)

RunGSEA <- function(deg_list, rank.by = "logFC", gmx = NULL, ct.col = "cell_type",
                    collapsePathways = FALSE,
                    ncores = NULL) {
    # fail early with a clear message instead of an obscure error downstream
    missing_cols <- setdiff(unique(c("gene", rank.by, ct.col)), colnames(deg_list))
    if (length(missing_cols) > 0) {
        stop(
            "deg_list is missing required column(s): ",
            paste(missing_cols, collapse = ", "),
            call. = FALSE
        )
    }
    if (nrow(deg_list) == 0) {
        stop("deg_list has no rows", call. = FALSE)
    }

    # library() stops when fgsea is not installed (require() only returns FALSE);
    # fgsea is still attached, as before
    library(fgsea)

    # if not provided, load and initialize the pathway database
    if (is.null(gmx)) {
        gmx <- .read_gmt_pathways(
            "/home/jupyter/data/Reference/AT_hallmark_reactome_kegg.symbols.gmt"
        )
    }

    # setup parallelization parameters; detectCores() may return NA or a value
    # <= 3, so never request fewer than one worker
    if (is.null(ncores)) {
        ncores <- max(1L, parallel::detectCores() - 3L, na.rm = TRUE)
    }
    param <- BiocParallel::MulticoreParam(workers = ncores, progressbar = TRUE)

    # RUN GSEA per celltype
    celltypes <- unique(deg_list %>% dplyr::pull(.data[[ct.col]]))

    pLS <- lapply(celltypes, function(ct) {
        message(paste("run GSEA in", ct))

        # create the ranked list, ordered from lowest to highest statistic
        rnkDF <- deg_list %>%
            dplyr::filter(.data[[ct.col]] == ct) %>%
            dplyr::arrange(.data[[rank.by]])
        rnk <- rnkDF %>%
            dplyr::pull(.data[[rank.by]]) %>%
            as.numeric()
        names(rnk) <- rnkDF$gene
        message(paste("run GSEA in", length(rnk), "genes"))
        # run GSEA by parallelization
        fgseaRes <- fgsea::fgsea(
            pathways = gmx,
            stats = rnk,
            minSize = 10,
            maxSize = 500,
            BPPARAM = param
        )

        # filter on pathways <0.05 adjusted p-value
        fgseaRes_tb <- fgseaRes %>%
            as.data.frame() %>%
            dplyr::filter(padj < 0.05) %>%
            dplyr::select(pathway, pval, padj, NES, leadingEdge) %>%
            dplyr::arrange(desc(NES)) %>%
            dplyr::mutate(celltype = ct)
        # if only keep the main pathway
        if (collapsePathways) {
            collapsedPathways <- fgsea::collapsePathways(
                fgseaRes[order(pval)][padj < 0.05],
                gmx, rnk
            )
            mainPathways <- fgseaRes[pathway %in% collapsedPathways$mainPathways][
                order(-NES), pathway
            ]
            fgseaRes_tb <- fgseaRes_tb %>% dplyr::filter(pathway %in% mainPathways)
        }

        fgseaRes_tb
    })

    pathwayDF <- data.table::rbindlist(pLS)
    # count the leading-edge genes before the gene vectors are collapsed to text
    le_size <- lengths(pathwayDF$leadingEdge)
    pathwayDF$leadingEdge <- vapply(
        pathwayDF$leadingEdge,
        paste,
        character(1L),
        collapse = ", "
    )

    # number of genes in every gene set, named by pathway
    gs_size <- lengths(gmx)

    # make plotting data frame; seq_along() (unlike 1:length()) also works when
    # no pathway passed the significance filter
    plotDF <- pathwayDF %>%
        tibble::as_tibble() %>%
        dplyr::mutate(
            group = ifelse(NES > 0, "up", "down"),
            pID = seq_along(pathway),
            lesize = le_size,
            gsize = unname(gs_size[match(pathway, names(gs_size))]),
            # proportion of genes enriched (#leading edge genes/size of pathway)
            propGenes = (lesize / gsize) * 100
        )

    return(plotDF)
}

# Read a GMT file into a named list of gene sets (names upper-cased, "" padding
# and repeated genes removed); field 2 of every row is dropped
.read_gmt_pathways <- function(gmx_file) {
    # GMT rows have a variable number of fields: size the table to the widest row
    n_fields <- max(count.fields(file = gmx_file, sep = "\t"))
    gmt <- read.table(
        file = gmx_file,
        sep = "\t",
        quote = "\"",
        fill = TRUE,
        col.names = as.character(seq_len(n_fields)),
        row.names = 1
    )
    gene_mx <- as.matrix(gmt[, -1, drop = FALSE])
    # lapply() always returns a list; apply() would simplify to a matrix if all
    # gene sets had the same length
    lapply(
        setNames(seq_len(nrow(gene_mx)), toupper(rownames(gene_mx))),
        function(i) setdiff(unname(gene_mx[i, ]), "")
    )
}


In [None]:
# Factor 1 features matching the listed name patterns with |weight| > 0.3,
# strongest first
weight %>%
    filter(
        str_detect(feature, "CAM|LFA|CD31|CLEC|CTLA4"),
        factor == "Factor1",
        abs(value) > 0.3
    ) %>%
    arrange(desc(abs(value)))


In [None]:
# use the Factor 1 rna feature loadings as the ranking statistic for GSEA
rna_weight <- weight %>%
    filter(view == "rna" & factor == "Factor1") %>%
    # strip only the leading "rna_" view prefix to recover the gene symbol
    mutate(gene = str_remove(feature, "^rna_"))
# fixed seed set before the GSEA call
set.seed(1221)
rna_weight_gsea <- RunGSEA(
    deg_list = rna_weight,
    collapsePathways = FALSE, rank.by = "value",
    gmx = gmx, ct.col = "view"
)


In [None]:
# "rna_"-prefixed leading-edge genes of the first pathway (lowest NES first)
# whose name matches pathway_pattern
leading_edge_features <- function(pathway_pattern) {
    leading_edges <- rna_weight_gsea %>%
        arrange(NES) %>%
        dplyr::filter(str_detect(pathway, pathway_pattern)) %>%
        pull(leadingEdge) %>%
        str_split(", ")
    paste0("rna_", leading_edges[[1]])
}
ox_genes <- leading_edge_features("OXIDATIVE")
myc_genes <- leading_edge_features("HALLMARK_MYC_TARGETS_V1")


In [None]:
# select metabolism related pathways for cd4 t cells
# (defined before its first use so the notebook runs top to bottom)
metabo_pathways <- c(
  "HALLMARK_MYC_TARGETS_V1", "HALLMARK_OXIDATIVE_PHOSPHORYLATION",
  "HALLMARK_TNFA_SIGNALING_VIA_NFKB",
  "HALLMARK_HYPOXIA", "HALLMARK_ADIPOGENESIS", "REACTOME_CHROMATIN_MODIFYING_ENZYMES",
  "REACTOME_PHOSPHOLIPID_METABOLISM",
  "REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT",
  "REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT",
  "KEGG_JAK_STAT_SIGNALING_PATHWAY"
)

# table of the selected pathways that are significant on Factor 1
rna_weight_gsea %>%
  arrange(NES) %>%
  mutate(factor = "Factor1") %>%
  filter(padj < 0.05 & pathway %in% metabo_pathways)

In [None]:
# dot plot of the selected pathways: color = NES, size = % leading-edge genes
rna_weight_gsea %>%
  arrange(NES) %>%
  mutate(factor = "Factor1") %>%
  filter(padj < 0.05 & pathway %in% metabo_pathways) %>%
  #  dplyr::filter(str_detect(pathway, 'MYC|METABOLISM|OXIDATIVE_PHOSPHORYLATION')) %>%
  mutate(enriched = if_else(NES < 0, "at-risk", "healthy")) %>%
  ggplot(aes(x = factor, y = pathway, color = NES, size = propGenes)) +
  geom_point() +
  theme_few() +
  scale_colour_gradient2(
    low = scales::muted("blue"),
    mid = "white",
    high = scales::muted("red"),
    midpoint = 0
  ) +
  theme(axis.text.y = element_text(size = 8)) +
  ylab("") +
  xlab("") +
  labs(
    title = NULL,
    color = "NES",
    size = "% Genes\nenriched"
  )
# no plot is passed, so ggsave() writes the most recent ggplot
ggsave(file.path(fig_path, paste0(proj_name, "_factor1_rna_GSEA_Pathways_RA.png")),
  width = 8, height = 6
)
