In [52]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(stringr))
library(ggrepel)
library(reshape2)
library(ggsci)
library(rstatix)
library(ggpubr)
library(data.table)
library(ggtern)
library(viridis)
library(scales)
library(RColorBrewer)
library(parallel)
library(purrr)
source('/home//jupyter/BRI_Figures_Final_V1/helper_function/helper_function_IHA.r')

# Read Meta Data of Selected Samples

In [53]:
# Metadata table for the selected samples, read straight from CSV.
meta_data <- read.csv(
  file = "/home/jupyter/BRI_Figures_Final_V2/Figure5/01_Frequency_Comparison/selected_samples_with_acutal_flu_year.csv"
)

# Flow Tfh: Day 7 - Day 0 change in CLR-transformed frequency

In [54]:
# Flow cytometry frequency table used for the Tfh comparison below.
freq <- read.csv(
  file = '/home/jupyter/BRI_Figures_Final_V1/Extended-Figure14/01_Flow_Tfh/freq_T.csv'
)

In [55]:
# Build a visit label of the form "Y<Flu_Year>-D<x>", where <x> is the last
# character of sample.visitName. str_sub() is vectorized, so no per-element
# closure is needed.
last_chars <- str_sub(freq$sample.visitName, start = -1L)
freq$visit <- paste0("Y", freq$Flu_Year, "-D", last_chars)

# Transform the frequencies within each sample kit.
# clr_transform() is not defined in this file — presumably the centered
# log-ratio helper sourced from helper_function_IHA.r; verify there.
freq <- freq %>%
  group_by(sample.sampleKitGuid) %>%
  dplyr::mutate(frequency_clr = clr_transform(frequency)) %>%
  dplyr::ungroup()


In [56]:
# Per-subject difference in transformed frequency of the "CXCR5+ PD1+ Tfh"
# population: visit Y2020-D7 minus visit Y2020-D0.
result_pd1pos <- freq %>%
  filter(
    Flu_Year == '2020',
    labels %in% c("CXCR5+ PD1+ Tfh")
  ) %>%
  group_by(subject.subjectGuid) %>%
  summarize(
    mean_diff = frequency_clr[visit == 'Y2020-D7'] -
      frequency_clr[visit == 'Y2020-D0']
  ) %>%
  ungroup()

In [57]:
# Per-subject difference in transformed frequency of the "CXCR5+ PD1- Tfh"
# population: visit Y2020-D7 minus visit Y2020-D0.
result_pd1neg <- freq %>%
  filter(
    Flu_Year == '2020',
    labels %in% c("CXCR5+ PD1- Tfh")
  ) %>%
  group_by(subject.subjectGuid) %>%
  summarize(
    mean_diff = frequency_clr[visit == 'Y2020-D7'] -
      frequency_clr[visit == 'Y2020-D0']
  ) %>%
  ungroup()

In [58]:
# Give each difference column a population-specific name so the two tables
# can be joined on subject without a name clash.
names(result_pd1pos) <- c("subject.subjectGuid", "flow_Tfh_PD1pos_mean_diff")
names(result_pd1neg) <- c("subject.subjectGuid", "flow_Tfh_PD1neg_mean_diff")

In [59]:
# One row per PD1+ subject, with the PD1- difference attached alongside.
flow_variable <- result_pd1pos %>%
  left_join(result_pd1neg)

[1m[22mJoining with `by = join_by(subject.subjectGuid)`


# Composite Score

In [60]:
# Metadata rows for the 2020-2021 season at the "Flu Year 1 Day 0" visit.
meta_data_selected_RNA_subset_D0 <- meta_data %>%
  filter(
    Flu_Year == "2020-2021",
    sample.visitName %in% c("Flu Year 1 Day 0")
  )

# One expression CSV per selected sample, named by pbmc_sample_id.
# NOTE(review): this path is rooted at BRI_Figures_Final_V2, while the
# "Mean Expression" section reads the same relative path under
# BRI_Figures_Final_V1 — confirm which root is intended.
file_list <- paste0(
  "/home/jupyter/BRI_Figures_Final_V2/Dataset/scRNA/BRI/Average_LogNormalized_Expression/Average_LogNormalized_Expression_of_Celltypes_by_Sample_AIFI_L3/",
  meta_data_selected_RNA_subset_D0$pbmc_sample_id,
  ".csv"
)

In [61]:
# Read every per-sample expression file into a list.
# read_pseudobulk_expression() is not defined in this file — presumably it
# comes from the sourced helper script.
df_list <- read_pseudobulk_expression(
  file_list,
  mc_cores = 60
)

[1] "Total reading time: 5.54300000000001 seconds"
[1] "The length of the list matches the length of the input path."


### DESeq2 result

In [62]:
# Precomputed differential-expression results table.
deseq2_result_Y1D0 <- read.csv(
  file = '/home/jupyter/BRI_Figures_Final_V1/Figure2/02_DEG/Deseq2_Result_Y1D0.csv'
)

In [63]:
# Keep the significant genes of the "cohort.cohortGuid" contrast:
# adjusted p-value below 0.05 and absolute log2 fold change above 0.1.
deseq2_result_Y1D0_AgeGroup_sig <- deseq2_result_Y1D0 %>%
  filter(
    contrast == "cohort.cohortGuid",
    padj < 0.05,
    abs(log2FoldChange) > 0.1
  )

### Get Composite Score

In [64]:
# Cross-tabulate significant genes by cell type (Var1) and direction (Var2),
# then keep the combinations with more than 20 genes, ordered by cell type.
df_degs_counts <- as.data.frame(
  table(
    deseq2_result_Y1D0_AgeGroup_sig$celltype,
    deseq2_result_Y1D0_AgeGroup_sig$Direction
  )
)
df_degs_counts_filtered <- df_degs_counts %>%
  arrange(Var1) %>%
  filter(Freq > 20)

In [65]:
# Composite score per sample from the top "HigherInBR2" genes, computed in
# parallel for every cell type kept in df_degs_counts_filtered.
# Each list element is a data.frame with one row per pbmc_sample_id holding
# the raw and z-scored expression sums, the sample metadata, and the cell type.
score_df_list <- mclapply(unique(df_degs_counts_filtered$Var1), function(celltype_single) {
  # Named n_top_genes rather than top_n to avoid shadowing dplyr::top_n().
  n_top_genes <- 100
  Direction_of_DEG <- "HigherInBR2"

  # Up to n_top_genes genes for this cell type and direction, ranked by
  # adjusted p-value, ties broken by larger absolute log2 fold change.
  selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
    filter(celltype == celltype_single, Direction == Direction_of_DEG) %>%
    arrange(padj, desc(abs(log2FoldChange))) %>%
    slice_head(n = n_top_genes) %>%
    pull(gene)

  long_format <- filter_genes_and_celltype(df_list,
                                           selected_genes,
                                           celltype_single,
                                           longformat = TRUE)
  # Explicit key: pbmc_sample_id is the column these tables are joined on.
  long_format <- left_join(long_format,
                           meta_data_selected_RNA_subset_D0,
                           by = "pbmc_sample_id")

  # Z-score each gene across samples. scale() returns a one-column matrix, so
  # it is flattened to a plain numeric vector instead of a matrix column.
  # NOTE: a gene with zero variance across samples gives NaN z-scores, which
  # propagate into the scaled sum below.
  long_format <- long_format %>%
    group_by(Gene) %>%
    mutate(Z_score_Mean_Expression = as.numeric(scale(Mean_Expression))) %>%
    ungroup()

  composite_score_df <- long_format %>%
    group_by(pbmc_sample_id) %>%
    dplyr::summarise(composite_score_sum_mean = sum(Mean_Expression),
                     composite_score_sum_scaled_mean = sum(Z_score_Mean_Expression)) %>%
    as.data.frame()
  composite_score_df <- left_join(composite_score_df,
                                  meta_data_selected_RNA_subset_D0,
                                  by = "pbmc_sample_id")
  composite_score_df$celltype <- celltype_single
  composite_score_df
}, mc.cores = 10)

# mclapply() does not stop when a worker errors; it returns a "try-error"
# object in that slot. Fail here instead of letting it reach rbind().
if (any(vapply(score_df_list, inherits, logical(1), what = "try-error"))) {
  stop("Composite score computation (HigherInBR2) failed for at least one cell type.",
       call. = FALSE)
}

In [66]:
# Stack the per-cell-type score tables into one data frame.
composite_score_df_all <- do.call(what = rbind, args = score_df_list)

In [67]:
# Only the "CM CD4 T cell" scores are carried forward.
composite_score_df_all <- filter(
  composite_score_df_all,
  celltype == "CM CD4 T cell"
)

In [68]:
# Subject id plus the z-scored composite score, renamed with an "Up_" prefix.
composite_score_variable_Up <- setNames(
  composite_score_df_all[c("subject.subjectGuid", "composite_score_sum_scaled_mean")],
  c("subject.subjectGuid", "Up_composite_score_sum_scaled_mean")
)

In [69]:
# Composite score per sample from the top "HigherInBR1" genes, computed in
# parallel for every cell type kept in df_degs_counts_filtered.
# Each list element is a data.frame with one row per pbmc_sample_id holding
# the raw and z-scored expression sums, the sample metadata, and the cell type.
score_df_list <- mclapply(unique(df_degs_counts_filtered$Var1), function(celltype_single) {
  # Named n_top_genes rather than top_n to avoid shadowing dplyr::top_n().
  n_top_genes <- 100
  Direction_of_DEG <- "HigherInBR1"

  # Up to n_top_genes genes for this cell type and direction, ranked by
  # adjusted p-value, ties broken by larger absolute log2 fold change.
  selected_genes <- deseq2_result_Y1D0_AgeGroup_sig %>%
    filter(celltype == celltype_single, Direction == Direction_of_DEG) %>%
    arrange(padj, desc(abs(log2FoldChange))) %>%
    slice_head(n = n_top_genes) %>%
    pull(gene)

  long_format <- filter_genes_and_celltype(df_list,
                                           selected_genes,
                                           celltype_single,
                                           longformat = TRUE)
  # Explicit key: pbmc_sample_id is the column these tables are joined on.
  long_format <- left_join(long_format,
                           meta_data_selected_RNA_subset_D0,
                           by = "pbmc_sample_id")

  # Z-score each gene across samples. scale() returns a one-column matrix, so
  # it is flattened to a plain numeric vector instead of a matrix column.
  # NOTE: a gene with zero variance across samples gives NaN z-scores, which
  # propagate into the scaled sum below.
  long_format <- long_format %>%
    group_by(Gene) %>%
    mutate(Z_score_Mean_Expression = as.numeric(scale(Mean_Expression))) %>%
    ungroup()

  composite_score_df <- long_format %>%
    group_by(pbmc_sample_id) %>%
    dplyr::summarise(composite_score_sum_mean = sum(Mean_Expression),
                     composite_score_sum_scaled_mean = sum(Z_score_Mean_Expression)) %>%
    as.data.frame()
  composite_score_df <- left_join(composite_score_df,
                                  meta_data_selected_RNA_subset_D0,
                                  by = "pbmc_sample_id")
  composite_score_df$celltype <- celltype_single
  composite_score_df
}, mc.cores = 10)

# mclapply() does not stop when a worker errors; it returns a "try-error"
# object in that slot. Fail here instead of letting it reach rbind().
if (any(vapply(score_df_list, inherits, logical(1), what = "try-error"))) {
  stop("Composite score computation (HigherInBR1) failed for at least one cell type.",
       call. = FALSE)
}

In [70]:
# Stack the per-cell-type score tables into one data frame.
composite_score_df_all <- do.call(what = rbind, args = score_df_list)

In [71]:
# Only the "CM CD4 T cell" scores are carried forward.
composite_score_df_all <- filter(
  composite_score_df_all,
  celltype == "CM CD4 T cell"
)

In [72]:
# Subject id plus the z-scored composite score, renamed with a "Down_" prefix.
composite_score_variable_Down <- setNames(
  composite_score_df_all[c("subject.subjectGuid", "composite_score_sum_scaled_mean")],
  c("subject.subjectGuid", "Down_composite_score_sum_scaled_mean")
)

# NMF scores

In [73]:
# Metadata rows for the 2020-2021 season at the "Flu Year 1 Day 0" visit.
meta_data_selected_RNA_subset_D0 <- meta_data %>%
  filter(
    Flu_Year == "2020-2021",
    sample.visitName %in% c("Flu Year 1 Day 0")
  )

# Read one NMF score table per selected sample and tag each with its sample id.
df_list <- mclapply(
  meta_data_selected_RNA_subset_D0$pbmc_sample_id,
  function(sample_id) {
    nmf_scores <- read.csv(paste0("/home/jupyter/BRI_Figures_Final_V1/Figure6/02_NMF_CD4_T_cell_Projection/NMF_Score_BRI/", sample_id, ".csv"))
    nmf_scores$pbmc_sample_id <- sample_id
    nmf_scores
  },
  mc.cores = 60
)


In [74]:
# Stack the per-sample NMF tables into one data frame.
df_combined <- do.call(what = rbind, args = df_list)

In [75]:
# Long format: one row per original row and NMF* column, with the column name
# in NMF_Scores and its value in value.
df_combined_longformat <- df_combined %>%
  pivot_longer(
    cols = starts_with("NMF"),
    names_to = "NMF_Scores",
    values_to = "value"
  )

In [76]:
# Mean NMF score per sample, NMF factor and AIFI_L3 label.
# The result stays grouped by pbmc_sample_id and NMF_Scores.
df_combined_longformat <- df_combined_longformat %>%
  group_by(pbmc_sample_id, NMF_Scores, AIFI_L3) %>%
  summarise(AverageScore = mean(value))

[1m[22m`summarise()` has grouped output by 'pbmc_sample_id', 'NMF_Scores'. You can
override using the `.groups` argument.


In [77]:
# Attach the sample metadata, then drop the grouping left by summarise().
df_combined_longformat <- df_combined_longformat %>%
  left_join(meta_data_selected_RNA_subset_D0) %>%
  ungroup()

[1m[22mJoining with `by = join_by(pbmc_sample_id)`


In [78]:
# Average NMF5_Th2 score in "CM CD4 T cell", one value per metadata row.
df_combined_longformat_filtered_TH2 <- df_combined_longformat %>%
  filter(
    NMF_Scores %in% c("NMF5_Th2"),
    AIFI_L3 %in% c("CM CD4 T cell")
  ) %>%
  select(subject.subjectGuid, AverageScore) %>%
  as.data.frame()

# Average NMF6_Tfh score in "CM CD4 T cell", one value per metadata row.
df_combined_longformat_filtered_Tfh <- df_combined_longformat %>%
  filter(
    NMF_Scores %in% c("NMF6_Tfh"),
    AIFI_L3 %in% c("CM CD4 T cell")
  ) %>%
  select(subject.subjectGuid, AverageScore) %>%
  as.data.frame()

In [79]:
# Name each score column after its NMF factor so the two tables can be joined.
names(df_combined_longformat_filtered_TH2) <- c("subject.subjectGuid", "NMF_Th2")
names(df_combined_longformat_filtered_Tfh) <- c("subject.subjectGuid", "NMF_Tfh")

In [80]:
# Th2 scores with the matching Tfh score attached per subject.
nmf_variable <- df_combined_longformat_filtered_TH2 %>%
  left_join(df_combined_longformat_filtered_Tfh)

[1m[22mJoining with `by = join_by(subject.subjectGuid)`


# Cell-to-cell interaction scores

In [81]:
# Metadata rows for the 2020-2021 season at the "Flu Year 1 Day 0" visit.
meta_data_selected_RNA_subset_D0 <- meta_data %>%
  filter(
    Flu_Year == "2020-2021",
    sample.visitName %in% c("Flu Year 1 Day 0")
  )


In [82]:
# Combined significant-means table; check.names = FALSE keeps the original
# column headers (the pivot below selects columns whose name contains "|").
combined_significant_means <- read.csv(
  file = "/home/jupyter/BRI_Figures_Final_V1/Figure6/01_CellPhoneDB/combined_statistical_analysis_significant_means.csv",
  check.names = FALSE
)

In [83]:
# Long format: every column whose name contains "|" becomes a
# (variable, mean_interaction) pair.
combined_significant_means_long <- combined_significant_means %>%
  pivot_longer(
    cols = matches("\\|"),
    names_to = "variable",
    values_to = "mean_interaction"
  )

In [84]:
# Discard rows with no interaction value.
combined_significant_means_long <- combined_significant_means_long %>%
  filter(!is.na(mean_interaction))

In [85]:
# Keep only rows belonging to the selected Flu Year 1 Day 0 samples.
# (The same predicate was previously applied twice in a row; once suffices.)
combined_significant_means_long <- combined_significant_means_long %>%
  filter(pbmc_sample_id %in% meta_data_selected_RNA_subset_D0$pbmc_sample_id)

In [86]:
# Attach the sample metadata to every interaction row.
combined_significant_means_long <- combined_significant_means_long %>%
  left_join(meta_data_selected_RNA_subset_D0)

[1m[22mJoining with `by = join_by(pbmc_sample_id)`


In [87]:
# CD40LG_CD40 interaction for the "CM CD4 T cell|Core memory B cell" pair.
CD40LG_CD40_MEAN <- combined_significant_means_long %>%
  filter(
    interacting_pair == "CD40LG_CD40",
    variable == "CM CD4 T cell|Core memory B cell"
  ) %>%
  select(subject.subjectGuid, mean_interaction) %>%
  as.data.frame()

# PTPRC_CD22 interaction for the "CM CD4 T cell|Core memory B cell" pair.
PTPRC_CD22_MEAN <- combined_significant_means_long %>%
  filter(
    interacting_pair == "PTPRC_CD22",
    variable == "CM CD4 T cell|Core memory B cell"
  ) %>%
  select(subject.subjectGuid, mean_interaction) %>%
  as.data.frame()

In [88]:
# Name each score column after its ligand-receptor pair.
names(CD40LG_CD40_MEAN) <- c("subject.subjectGuid", "InteractionScore_CD40LG_CD40")
names(PTPRC_CD22_MEAN) <- c("subject.subjectGuid", "InteractionScore_PTPRC_CD22")

In [89]:
# CD40LG_CD40 scores with the matching PTPRC_CD22 score attached per subject.
c2c_interaction_variable <- CD40LG_CD40_MEAN %>%
  left_join(PTPRC_CD22_MEAN)

[1m[22mJoining with `by = join_by(subject.subjectGuid)`


# Mean Expression

In [90]:
# Metadata rows for the 2020-2021 season at the "Flu Year 1 Day 0" visit.
meta_data_selected_RNA_subset_D0 <- meta_data %>%
  filter(
    Flu_Year == "2020-2021",
    sample.visitName %in% c("Flu Year 1 Day 0")
  )


In [91]:
# One expression CSV per selected sample, named by pbmc_sample_id.
# NOTE(review): this path is rooted at BRI_Figures_Final_V1, while the
# "Composite Score" section reads the same relative path under
# BRI_Figures_Final_V2 — confirm which root is intended.
aggregated_count_file_list <- paste0(
  "/home/jupyter/BRI_Figures_Final_V1/Dataset/scRNA/BRI/Average_LogNormalized_Expression/Average_LogNormalized_Expression_of_Celltypes_by_Sample_AIFI_L3/",
  meta_data_selected_RNA_subset_D0$pbmc_sample_id,
  ".csv"
)

# Read all files with the project helper, using its default core count.
df_list <- read_pseudobulk_expression(aggregated_count_file_list)

[1] "Total reading time: 7.23699999999997 seconds"
[1] "The length of the list matches the length of the input path."


In [92]:
# Genes whose mean expression is extracted below.
selected_genes <- c(
  "CD40LG",
  "CXCR5"
)


In [93]:
# Long-format expression of the selected genes in "CM CD4 T cell".
df <- filter_genes_and_celltype(
  df_list,
  selected_genes,
  "CM CD4 T cell",
  longformat = TRUE
)




In [94]:
# Attach the sample metadata to the expression rows.
df <- df %>%
  left_join(meta_data_selected_RNA_subset_D0)

[1m[22mJoining with `by = join_by(pbmc_sample_id)`


In [95]:
# Mean expression of CD40LG per subject.
expression_CD40LG <- df %>%
  filter(Gene == "CD40LG") %>%
  select(subject.subjectGuid, Mean_Expression) %>%
  as.data.frame()

# Mean expression of CXCR5 per subject.
expression_CXCR5 <- df %>%
  filter(Gene == "CXCR5") %>%
  select(subject.subjectGuid, Mean_Expression) %>%
  as.data.frame()

In [96]:
# Name each expression column after its gene.
names(expression_CD40LG) <- c("subject.subjectGuid", "Mean_Expression_CD40LG")
names(expression_CXCR5) <- c("subject.subjectGuid", "Mean_Expression_CXCR5")

In [97]:
# Full join of the two gene tables on subject, keeping subjects from either.
mean_expression_variable <- full_join(
  expression_CD40LG,
  expression_CXCR5,
  by = "subject.subjectGuid"
)

# Combine all together

In [98]:
# Assemble the final table: start from the NMF scores and left-join every
# other feature table on subject, in the same order as before.
combined_df <- nmf_variable %>%
  left_join(flow_variable, by = "subject.subjectGuid") %>%
  left_join(composite_score_variable_Down, by = "subject.subjectGuid") %>%
  left_join(composite_score_variable_Up, by = "subject.subjectGuid") %>%
  left_join(c2c_interaction_variable, by = "subject.subjectGuid") %>%
  left_join(mean_expression_variable, by = "subject.subjectGuid")

In [99]:
# Save the combined table to the working directory (row names included,
# as per the write.csv default).
write.csv(
  x = combined_df,
  file = "BRI_Tcell_data_forcorrelation.csv"
)