In [72]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(stringr))
library(parallel)
library(reshape2)
library(ggrepel)
library(ggh4x)
library(broom)
source('/home//workspace/private/bri_figure_all_files_test/jupyter//IHA-Figures_Final_V2/helper_function/helper_function_IHA.r')

In [73]:
# Load the DESeq2 result table (Y1D0) that the DEG selection below is based on.
deseq2_result_Y1D0 <- read.csv(
  '/home//workspace/private/bri_figure_all_files_test/jupyter/IHA_Figure_Revision_V1/Figure1/02_DEG/Deseq2_Result_Y1D0.csv'
)

In [74]:
# Lookup from the long cell type labels found in the expression tables
# (names) to the short cell type labels used in this notebook (values).
name_map <- setNames(
  c(
    "Core naive CD4 T cell",
    "Core naive CD8 T cell",
    "CM CD4 T cell",
    "CM CD8 T cell"
  ),
  c(
    "naive thymus-derived CD4-positive, alpha-beta T cell",
    "naive thymus-derived CD8-positive, alpha-beta T cell",
    "central memory CD4-positive, alpha-beta T cell",
    "central memory CD8-positive, alpha-beta T cell"
  )
)

In [75]:
#' Convert an age label to a single numeric age.
#'
#' Every number found in `age_str` is extracted. A lone number is returned
#' as is; several numbers (for example a range such as "55-60") are reduced
#' to their median.
#'
#' @param age_str An age label (character, or coercible to character).
#' @return A numeric scalar; `NA_real_` when the label contains no number.
convert_age <- function(age_str) {
  age_str <- as.character(age_str)
  # Allow an optional decimal part so "45.5" is read as one number rather
  # than as the two numbers 45 and 5.
  matches <- regmatches(age_str, gregexpr("[0-9]+(\\.[0-9]+)?", age_str))
  nums <- as.numeric(unlist(matches))
  if (length(nums) == 0) {
    return(NA_real_)
  }
  # The median of a single value is that value, so no special case is needed.
  median(nums)
}
#' Read per-sample pseudobulk expression tables in parallel.
#'
#' Each CSV is read with its first column used as (de-duplicated) row names.
#' Column names that have an entry in the global `name_map` are replaced by
#' the mapped label, and every column name is then prefixed with
#' "<file name without .csv>:".
#'
#' @param file_list Character vector of CSV paths.
#' @param mc_cores Number of cores passed to `mclapply()`.
#' @return A list with one element per entry of `file_list`, in the same
#'   order. Successfully read files are data frames; elements whose read
#'   failed are left as returned by `mclapply()` and reported in a warning.
read_pseudobulk_expression <- function(file_list, mc_cores = 60) {
  read_one <- function(path) {
    df <- read.csv(path, check.names = FALSE)
    rownames(df) <- make.unique(as.character(df[[1]]))
    df <- df[, -1, drop = FALSE]

    # Rename only the columns that have an entry in `name_map`.
    cols <- colnames(df)
    mapped <- cols %in% names(name_map)
    cols[mapped] <- unname(name_map[cols[mapped]])

    # basename() also handles paths without a directory part, which the
    # previous "^.*/" pattern left unchanged (".csv" stayed in the prefix).
    sample_name <- sub("\\.csv$", "", basename(path))
    colnames(df) <- paste0(sample_name, ":", cols)
    df
  }

  total_time <- system.time({
    df_list <- mclapply(file_list, read_one, mc.cores = mc_cores)
  })
  print(paste("Total reading time:", total_time["elapsed"], "seconds"))

  # mclapply() returns one element per input even when a worker fails, so a
  # length comparison alone cannot detect failed reads: check the elements.
  failed <- !vapply(df_list, is.data.frame, logical(1))
  if (length(df_list) != length(file_list)) {
    warning("The length of the list does not match the length of the input path.")
  } else if (any(failed)) {
    warning(
      "Failed to read ", sum(failed), " file(s): ",
      paste(file_list[failed], collapse = ", "),
      call. = FALSE
    )
  } else {
    print("The length of the list matches the length of the input path.")
  }
  df_list
}
#' Choose a text colour ('white' or 'black') for a given background colour.
#'
#' The colour is converted with `col2rgb()` and a weighted sum of its red,
#' green and blue channels is compared against 128.
#'
#' @param hex Colour specification(s) accepted by `col2rgb()`.
#' @return Character vector with one element per colour: 'white' when the
#'   weighted channel sum is below 128, otherwise 'black'.
is_dark_color <- function(hex) {
  # Named `channels` rather than `rgb` to avoid shadowing grDevices::rgb().
  channels <- col2rgb(hex)

  luminance <- 0.299 * channels[1, ] +
    0.587 * channels[2, ] +
    0.114 * channels[3, ]

  # Vectorised: an `if` on the comparison fails for more than one colour.
  text_color <- rep('black', length(luminance))
  text_color[luminance < 128] <- 'white'
  text_color
}

In [76]:
# Cell type colour table; labels have " cell" removed before they are used
# as the names of the colour lookup vector.
color_annotation <- read.csv(
  '/home//workspace/private/bri_figure_all_files_test/jupyter//IHA-Figures_Final_V2/Color_hex_codes/Cell_Type_Color_Annotation_and_Order.csv'
)
color_annotation$label <- gsub(" cell", '', color_annotation$label)
color_vector <- with(color_annotation, setNames(color, label))

In [77]:
# Sample-level metadata table, read from the working directory.
meta_data <- read.csv("meta_data.csv")

In [78]:
# Keep only rows generated with the 10x 5' v2 assay.
meta_data <- filter(meta_data, assay == "10x 5' v2")

In [79]:
# Tabulate the number of metadata rows per tissue.
table(meta_data$tissue)


                            blood                       bone marrow 
                               17                                18 
               colonic epithelium           epithelial lining fluid 
                                1                                 9 
              inguinal lymph node                jejunal epithelium 
                                9                                17 
           jejunum lamina propria lamina propria of mucosa of colon 
                               19                                 1 
                            liver                              lung 
                                3                                17 
            mesenteric lymph node                      skin of body 
                               21                                 8 
                           spleen               thoracic lymph node 
                               20                                21 

# Thoracic Lymph Node

In [116]:
# Metadata rows for the thoracic lymph node (10x 5' v2 assay only).
meta_data_subset <- filter(
  meta_data,
  tissue == "thoracic lymph node",
  assay == "10x 5' v2"
)

In [117]:
# Show the number of rows and columns of the tissue subset.
dim(meta_data_subset)

In [118]:
# Convert each age label to a single number. vapply() guarantees a plain
# numeric vector (sapply() returns list() for an empty subset and carries the
# age labels along as names when the input is character).
meta_data_subset$numeric_ages <- vapply(
  meta_data_subset$age,
  convert_age,
  numeric(1),
  USE.NAMES = FALSE
)


In [119]:
# Copy sample_id into pbmc_sample_id, the column name the scoring step
# groups and joins on.
meta_data_subset$pbmc_sample_id <- meta_data_subset$sample_id

In [120]:
# One expression CSV per sample: sample_normalized_count_average/<id>.csv
file_list <- file.path(
  "sample_normalized_count_average",
  paste0(meta_data_subset$pbmc_sample_id, ".csv")
)

In [121]:
# Read all per-sample expression tables using 10 cores.
df_list <- read_pseudobulk_expression(file_list, mc_cores = 10)

[1] "Total reading time: 1.50999999999999 seconds"
[1] "The length of the list matches the length of the input path."


In [122]:
# Significant genes for the "cohort.cohortGuid" contrast:
# adjusted p-value below 0.05 and absolute log2 fold change above 0.1.
deseq2_result_Y1D0_AgeGroup_sig <- filter(
  deseq2_result_Y1D0,
  contrast == "cohort.cohortGuid",
  padj < 0.05,
  abs(log2FoldChange) > 0.1
)

In [123]:
# Number of significant genes per cell type (Var1) and direction (Var2).
df_degs_counts <- table(
  deseq2_result_Y1D0_AgeGroup_sig$celltype,
  deseq2_result_Y1D0_AgeGroup_sig$Direction
) %>%
  as.data.frame()

In [124]:
# Cell types with more than 20 "HigherInBR2" genes, ordered by cell type.
df_degs_counts_filtered <- df_degs_counts %>%
  filter(Var2 == "HigherInBR2", Freq > 20) %>%
  arrange(Var1)

In [125]:
# Cell types for which composite scores are computed.
celltype_list <- c(
  "Core naive CD4 T cell",
  "Core naive CD8 T cell",
  "CM CD4 T cell",
  "CM CD8 T cell"
)

In [126]:
# For every cell type, print how many top "HigherInBR2" genes are selected
# and how many of them appear in the row names of the first expression table.
for (celltype_single in celltype_list) {
    top_n <- 100
    Direction_of_DEG <- "HigherInBR2"

    selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
        filter(celltype == celltype_single & Direction == Direction_of_DEG) %>%
        arrange(padj, desc(abs(log2FoldChange))) %>%
        slice_head(n = top_n) %>%
        pull(gene)
    print(length(selected_genes))

    selected_genes <- intersect(selected_genes, rownames(df_list[[1]]))
    print(length(selected_genes))
}

[1] 100
[1] 99
[1] 100
[1] 99
[1] 100
[1] 98
[1] 82
[1] 77


In [127]:
# For each cell type, build per-sample composite scores from the top
# "HigherInBR2" genes: the sum of mean expression and the sum of per-gene
# z-scored mean expression. One data frame is returned per cell type.
score_df_list <- mclapply(celltype_list, function(celltype_single) {
    top_n <- 100
    Direction_of_DEG <- "HigherInBR2"

    # Rank by adjusted p-value, ties broken by larger absolute fold change.
    selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
        filter(celltype == celltype_single, Direction == Direction_of_DEG) %>%
        arrange(padj, desc(abs(log2FoldChange))) %>%
        slice_head(n = top_n) %>%
        pull(gene)
    # Only genes present in the first expression table are kept.
    # NOTE(review): assumes all tables in df_list share the same genes - confirm.
    selected_genes <- intersect(selected_genes, rownames(df_list[[1]]))
    print(length(selected_genes))

    # NOTE(review): this join uses all columns shared with meta_data_subset;
    # the output columns of filter_genes_and_celltype() are not visible here.
    long_format <- filter_genes_and_celltype(df_list, selected_genes, celltype_single, longformat = TRUE) %>%
        left_join(meta_data_subset) %>%
        filter(!is.na(Mean_Expression)) %>%
        group_by(Gene) %>%
        # scale() returns a one-column matrix; store a plain numeric vector.
        mutate(Z_score_Mean_Expression = as.numeric(scale(Mean_Expression))) %>%
        ungroup()

    composite_score_df <- long_format %>%
        group_by(pbmc_sample_id) %>%
        summarise(
            composite_score_sum_mean = sum(Mean_Expression, na.rm = TRUE),
            composite_score_sum_scaled_mean = sum(Z_score_Mean_Expression, na.rm = TRUE)
        ) %>%
        # pbmc_sample_id is the only column shared with the summary.
        left_join(meta_data_subset, by = "pbmc_sample_id") %>%
        mutate(celltype = celltype_single) %>%
        as.data.frame()

    composite_score_df
}, mc.cores = 10)

# mclapply() hands back failed workers as non-data-frame elements instead of
# raising an error; stop here rather than letting them reach rbind().
failed_celltypes <- !vapply(score_df_list, is.data.frame, logical(1))
if (any(failed_celltypes)) {
    stop(
        "Composite score computation failed for: ",
        paste(celltype_list[failed_celltypes], collapse = ", "),
        call. = FALSE
    )
}


In [128]:
# Stack the per-cell-type score tables into one data frame.
composite_score_df_all <- do.call("rbind", score_df_list)

In [129]:
# List the columns of the combined score table.
colnames(composite_score_df_all)

In [135]:
# Expose the numeric age under the column name "Ages".
composite_score_df_all$Ages <- composite_score_df_all$numeric_ages

In [136]:
# Label every row with the tissue these scores were computed for.
composite_score_df_all$Dataset <- "thoracic lymph node"

In [137]:
# Save the thoracic lymph node scores (row names are written as well).
write.csv(composite_score_df_all, file = "thoracic_lymph_node.csv")

# Mesenteric Lymph Node

In [139]:
# Metadata rows for the mesenteric lymph node (10x 5' v2 assay only).
meta_data_subset <- filter(
  meta_data,
  tissue == "mesenteric lymph node",
  assay == "10x 5' v2"
)

In [140]:
# Show the number of rows and columns of the tissue subset.
dim(meta_data_subset)

In [141]:
# Convert each age label to a single number. vapply() guarantees a plain
# numeric vector (sapply() returns list() for an empty subset and carries the
# age labels along as names when the input is character).
meta_data_subset$numeric_ages <- vapply(
  meta_data_subset$age,
  convert_age,
  numeric(1),
  USE.NAMES = FALSE
)


In [142]:
# Copy sample_id into pbmc_sample_id, the column name the scoring step
# groups and joins on.
meta_data_subset$pbmc_sample_id <- meta_data_subset$sample_id

In [143]:
# One expression CSV per sample: sample_normalized_count_average/<id>.csv
file_list <- file.path(
  "sample_normalized_count_average",
  paste0(meta_data_subset$pbmc_sample_id, ".csv")
)

In [144]:
# Read all per-sample expression tables using 10 cores.
df_list <- read_pseudobulk_expression(file_list, mc_cores = 10)

[1] "Total reading time: 1.35500000000002 seconds"
[1] "The length of the list matches the length of the input path."


In [145]:
# Significant genes for the "cohort.cohortGuid" contrast:
# adjusted p-value below 0.05 and absolute log2 fold change above 0.1.
deseq2_result_Y1D0_AgeGroup_sig <- filter(
  deseq2_result_Y1D0,
  contrast == "cohort.cohortGuid",
  padj < 0.05,
  abs(log2FoldChange) > 0.1
)

In [146]:
# Number of significant genes per cell type (Var1) and direction (Var2).
df_degs_counts <- table(
  deseq2_result_Y1D0_AgeGroup_sig$celltype,
  deseq2_result_Y1D0_AgeGroup_sig$Direction
) %>%
  as.data.frame()

In [147]:
# Cell types with more than 20 "HigherInBR2" genes, ordered by cell type.
df_degs_counts_filtered <- df_degs_counts %>%
  filter(Var2 == "HigherInBR2", Freq > 20) %>%
  arrange(Var1)

In [148]:
# Cell types for which composite scores are computed.
celltype_list <- c(
  "Core naive CD4 T cell",
  "Core naive CD8 T cell",
  "CM CD4 T cell",
  "CM CD8 T cell"
)

In [149]:
# For every cell type, print how many top "HigherInBR2" genes are selected
# and how many of them appear in the row names of the first expression table.
for (celltype_single in celltype_list) {
    top_n <- 100
    Direction_of_DEG <- "HigherInBR2"

    selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
        filter(celltype == celltype_single & Direction == Direction_of_DEG) %>%
        arrange(padj, desc(abs(log2FoldChange))) %>%
        slice_head(n = top_n) %>%
        pull(gene)
    print(length(selected_genes))

    selected_genes <- intersect(selected_genes, rownames(df_list[[1]]))
    print(length(selected_genes))
}

[1] 100
[1] 99
[1] 100
[1] 99
[1] 100
[1] 98
[1] 82
[1] 77


In [150]:
# For each cell type, build per-sample composite scores from the top
# "HigherInBR2" genes: the sum of mean expression and the sum of per-gene
# z-scored mean expression. One data frame is returned per cell type.
score_df_list <- mclapply(celltype_list, function(celltype_single) {
    top_n <- 100
    Direction_of_DEG <- "HigherInBR2"

    # Rank by adjusted p-value, ties broken by larger absolute fold change.
    selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
        filter(celltype == celltype_single, Direction == Direction_of_DEG) %>%
        arrange(padj, desc(abs(log2FoldChange))) %>%
        slice_head(n = top_n) %>%
        pull(gene)
    # Only genes present in the first expression table are kept.
    # NOTE(review): assumes all tables in df_list share the same genes - confirm.
    selected_genes <- intersect(selected_genes, rownames(df_list[[1]]))
    print(length(selected_genes))

    # NOTE(review): this join uses all columns shared with meta_data_subset;
    # the output columns of filter_genes_and_celltype() are not visible here.
    long_format <- filter_genes_and_celltype(df_list, selected_genes, celltype_single, longformat = TRUE) %>%
        left_join(meta_data_subset) %>%
        filter(!is.na(Mean_Expression)) %>%
        group_by(Gene) %>%
        # scale() returns a one-column matrix; store a plain numeric vector.
        mutate(Z_score_Mean_Expression = as.numeric(scale(Mean_Expression))) %>%
        ungroup()

    composite_score_df <- long_format %>%
        group_by(pbmc_sample_id) %>%
        summarise(
            composite_score_sum_mean = sum(Mean_Expression, na.rm = TRUE),
            composite_score_sum_scaled_mean = sum(Z_score_Mean_Expression, na.rm = TRUE)
        ) %>%
        # pbmc_sample_id is the only column shared with the summary.
        left_join(meta_data_subset, by = "pbmc_sample_id") %>%
        mutate(celltype = celltype_single) %>%
        as.data.frame()

    composite_score_df
}, mc.cores = 10)

# mclapply() hands back failed workers as non-data-frame elements instead of
# raising an error; stop here rather than letting them reach rbind().
failed_celltypes <- !vapply(score_df_list, is.data.frame, logical(1))
if (any(failed_celltypes)) {
    stop(
        "Composite score computation failed for: ",
        paste(celltype_list[failed_celltypes], collapse = ", "),
        call. = FALSE
    )
}


In [151]:
# Stack the per-cell-type score tables into one data frame.
composite_score_df_all <- do.call("rbind", score_df_list)

In [153]:
# Expose the numeric age under the column name "Ages".
composite_score_df_all$Ages <- composite_score_df_all$numeric_ages

In [154]:
# Label every row with the tissue these scores were computed for.
composite_score_df_all$Dataset <- "mesenteric lymph node"

In [155]:
# Save the mesenteric lymph node scores (row names are written as well).
write.csv(composite_score_df_all, file = "mesenteric_lymph_node.csv")