# Linear mixed models analysis
 - mixed model results for CLR and Frequency

In [1]:

# load libraries
# Attach a package while hiding its startup messages.
# Every argument is forwarded unchanged to library(), so the package can be
# given as a bare name, e.g. quiet_library(dplyr).
quiet_library <- function(...) {
    # Only the call to library() is wrapped; nothing else is evaluated here.
    suppressPackageStartupMessages(library(...))
}
quiet_library(tidyverse)
quiet_library(Seurat)
quiet_library(ggplot2)
quiet_library(Matrix)
quiet_library(dplyr)
quiet_library(viridis)
quiet_library(harmony)
quiet_library(future)
quiet_library(future.apply)
quiet_library(SingleCellExperiment)
quiet_library(SeuratDisk)
quiet_library(ggpubr)
quiet_library(knitr)
quiet_library(plotly)
quiet_library(lme4)
quiet_library(ggpubr)
quiet_library(lmerTest)
quiet_library(broom)

In [2]:
# Check number of cores
future::availableCores()
# Set up parallel processing to run when using 'future' functions.
# Five cores are left free for other work, but the worker count is clamped to
# at least one: on a machine with five or fewer cores the plain subtraction
# would request zero or a negative number of workers, which plan() rejects.
future::plan(
  strategy = "multicore",
  workers = max(1L, future::availableCores() - 5L)
)
# Upper limit (in bytes) on the size of globals exported to workers
options(future.globals.maxSize = 1000 * 1024^3)
# to turn off parallel processing run line below
# future::plan(strategy = "sequential")

In [3]:
# Colour palettes used throughout the notebook (hex strings, some with alpha)
npg_color <- c(
  "#E64B35FF", "#4DBBD5FF", "#00A087FF", "#3C5488FF", "#F39B7FFF",
  "#8491B4FF", "#91D1C2FF", "#DC0000FF", "#7E6148FF", "#B09C85FF"
)
nejm_color <- c(
  "#BC3C29FF", "#0072B5FF", "#E18727FF", "#20854EFF",
  "#7876B1FF", "#6F99ADFF", "#FFDC91FF", "#EE4C97FF"
)
jama_color <- c(
  "#374E55FF", "#DF8F44FF", "#00A1D5FF", "#B24745FF",
  "#79AF97FF", "#6A6599FF", "#80796BFF"
)
jco_color <- c(
  "#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF",
  "#7AA6DCFF", "#003C67FF", "#8F7700FF"
)
cluster_colors <- c(
  "#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72", "#B17BA6",
  "#FF7F00", "#FDB462", "#E7298A", "#E78AC3", "#33A02C", "#B2DF8A",
  "#55A1B1", "#8DD3C7", "#A6761D", "#E6AB02", "#7570B3", "#BEAED4",
  "#666666", "#999999", "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3",
  "#808000", "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00"
)

# Interpolate the 30 cluster colours into a 36-colour ramp
make_cluster_ramp <- colorRampPalette(cluster_colors)
cluster_colors_ext <- make_cluster_ramp(36)
rm(make_cluster_ramp)

# Default plot size for the notebook's graphics output
options(repr.plot.width = 10, repr.plot.height = 10)

# Helper Function

In [4]:
#' Fit a linear mixed-effects model per cluster and collect one fixed effect
#'
#' For every non-missing value found in `cluster_column`, the rows of `df`
#' belonging to that cluster are fitted with `lmerTest::lmer(formula, ...)`.
#' From each model's fixed-effect coefficient table the row named `term` is
#' kept. The per-cluster rows are stacked, the `Pr(>|t|)` column is renamed to
#' `Pval_lm_DTC`, rows are sorted by that p-value, and a Benjamini-Hochberg
#' adjusted p-value column (`p_adj_BH`) is appended.
#'
#' @param df A data frame containing the data to be analyzed.
#' @param cluster_column Name (single string) of the column containing cluster
#'   identifiers. Rows whose cluster identifier is `NA` are ignored.
#' @param formula A formula specifying the linear mixed-effects model to be
#'   fitted; it is passed unchanged to `lmerTest::lmer()`.
#' @param subject_column Name of the column containing subject identifiers.
#'   Currently unused: the random effect is taken from `formula`. The argument
#'   is kept so existing calls continue to work.
#' @param term Name of the fixed-effect coefficient to extract from every
#'   model. Defaults to `"days_to_conversion"`.
#'
#' @return A data frame with one row per cluster (row names are the cluster
#'   identifiers) holding the coefficient-table columns for `term`, with the
#'   p-value column named `Pval_lm_DTC` and the adjusted p-values in
#'   `p_adj_BH`, ordered by ascending `Pval_lm_DTC`. A cluster whose model has
#'   no coefficient named `term` contributes no row and raises a warning.
#'
#' @examples
#' # Assuming A3_clr_v2 is your data frame
#' result <- LM_on_clusters(A3_clr_v2, "clusters", clr ~ age_conv + bmi_conv + days_to_conversion + (1 | subject.subjectGuid), "subject.subjectGuid")
#' print(result)
#'
#' @export
LM_on_clusters <- function(df, cluster_column, formula, subject_column,
                           term = "days_to_conversion") {
  stopifnot(
    is.data.frame(df),
    is.character(cluster_column),
    length(cluster_column) == 1L,
    cluster_column %in% names(df),
    is.character(term),
    length(term) == 1L
  )

  cluster_values <- df[[cluster_column]]
  if (is.factor(cluster_values)) {
    # Compare and label by level name rather than by factor element
    cluster_values <- as.character(cluster_values)
  }

  # NA identifiers would select zero rows and make the model fit fail
  unique_clusters <- unique(cluster_values[!is.na(cluster_values)])
  if (length(unique_clusters) == 0L) {
    stop(
      "No non-missing clusters found in column '", cluster_column, "'.",
      call. = FALSE
    )
  }

  # One pass per cluster: subset, fit, and pull out the requested coefficient
  term_rows <- lapply(unique_clusters, function(cluster) {
    in_cluster <- !is.na(cluster_values) & cluster_values == cluster
    cluster_data <- df[in_cluster, , drop = FALSE]

    model <- lmerTest::lmer(formula, data = cluster_data)
    coef_table <- as.data.frame(coef(summary(model)))

    term_row <- coef_table[rownames(coef_table) == term, , drop = FALSE]
    if (nrow(term_row) == 0L) {
      warning(
        "Cluster '", cluster, "': no coefficient named '", term,
        "' in the fitted model; cluster skipped.",
        call. = FALSE
      )
    } else {
      # Label the row with its cluster so the stacked table is self-describing
      rownames(term_row) <- as.character(cluster)
    }
    term_row
  })

  combined <- do.call(rbind, term_rows)

  p_column <- "Pr(>|t|)"
  if (!p_column %in% names(combined)) {
    stop(
      "Coefficient table has no '", p_column, "' column; ",
      "p-values are not available for this model.",
      call. = FALSE
    )
  }
  names(combined)[names(combined) == p_column] <- "Pval_lm_DTC"

  # order() keeps the cluster row names attached while sorting (NAs go last)
  combined <- combined[order(combined$Pval_lm_DTC), , drop = FALSE]

  # Adjust p-values using the Benjamini-Hochberg method
  combined$p_adj_BH <- stats::p.adjust(combined$Pval_lm_DTC, method = "BH")

  combined
}


In [5]:
# Read the per-sample, per-cluster frequency / CLR table
fq_clr <- read.csv(
  "/home/jupyter/ALTRA_ANALYSIS/Deep_clean/certpro/output_files/CD4T_mem_certpro_Frq_clr_res_0_4(c0-c18)_R.csv"
)
head(fq_clr, 3)

Unnamed: 0_level_0,X,sample.sampleKitGuid,BMI,days_to_conversion,ageAtDraw,subject.biologicalSex,subject.subjectGuid,bmi_conv,age_conv,Status_Long,Status_Xsec,Age2023,anti_ccp3,anti_ccp3_finalCombined,anti_ccp31,clusters,frequency_live,clr
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>
1,1,KT00052,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster0,0.10423826,0.9848633
2,2,KT00052,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster1,0.0707331,0.5970978
3,3,KT00052,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster2,0.09621993,0.9048206


In [6]:
meta <- read.csv(
  "/home/jupyter/ALTRA_ANALYSIS/Deep_clean/AIM3/input_csv_files/2023-11-22_ALTRA_Metadata_labs.csv"
)

In [7]:
## CU1015 is dropped because it is not used in the longitudinal analysis
meta_filtered <- filter(meta, subject.subjectGuid != "CU1015")

In [8]:
# Keep rows whose Status_Long is "pre" or "conversion", and only the two
# columns needed to join onto the frequency / CLR table
meta_a3 <- meta_filtered %>%
  filter(Status_Long %in% c("pre", "conversion")) %>%
  select(c(Status_Long, sample.sampleKitGuid))
dim(meta_a3)
head(meta_a3, 2)

Unnamed: 0_level_0,Status_Long,sample.sampleKitGuid
Unnamed: 0_level_1,<chr>,<chr>
1,pre,KT00052
2,pre,KT00056


In [9]:
# Attach the frequency / CLR rows to every retained sample; all.x = TRUE keeps
# samples from meta_a3 even when they have no match in fq_clr
A3_clr <- merge(
  x = meta_a3,
  y = fq_clr,
  by = "sample.sampleKitGuid",
  all.x = TRUE
)
dim(A3_clr)
head(A3_clr)

Unnamed: 0_level_0,sample.sampleKitGuid,Status_Long.x,X,BMI,days_to_conversion,ageAtDraw,subject.biologicalSex,subject.subjectGuid,bmi_conv,age_conv,Status_Long.y,Status_Xsec,Age2023,anti_ccp3,anti_ccp3_finalCombined,anti_ccp31,clusters,frequency_live,clr
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>
1,KT00052,pre,3,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster2,0.09621993,0.9048206
2,KT00052,pre,1,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster0,0.10423826,0.9848633
3,KT00052,pre,4,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster3,0.06872852,0.5683484
4,KT00052,pre,13,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster12,0.04696449,0.1875759
5,KT00052,pre,2,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster1,0.0707331,0.5970978
6,KT00052,pre,17,24.65716,-714,56,Female,CU1009,25.64892,58,pre,at_risk,60,88,88,74,cluster16,0.02290951,-0.5302639


In [10]:
# Read the Figure 1 master metadata table.
# The path is assembled with file.path() so it stays one unbroken string: a
# line break inside a quoted path becomes part of the file name and the file
# is then not found.
altra <- read.csv(
  file.path(
    "/home/jupyter/ALTRA_ANALYSIS/Deep_clean/Figure_notebooks/files",
    "input_files/certpro_Zi_files/Figure_input_files",
    "ALTRA_Fig1_Master_meta.csv"
  )
)
head(altra, 3)
# Keep only the sample key and the two label columns joined on later
a1 <- select(altra, c("sample.sampleKitGuid", "s1_ids", "Status_new"))
head(a1, 3)


Unnamed: 0_level_0,X,Status_Xsec,Status_Long,sample.sampleKitGuid,sample.drawDate,sample.daysSinceFirstVisit,subject.subjectGuid,subject.biologicalSex,subject.birthYear,days_to_conversion,number_of_days_to_from_ra_conversion_cu,anti_ccp3_finalCombined,status,time,s1,s1_ids,Status_new
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<chr>,<chr>
1,1,ALTRA_healthy,,KT02284,2022-10-01,0,SD1021,Female,1971,,,0,ALTRA_healthy,-2000,CON1,CON101,CON1
2,2,ALTRA_healthy,,KT02286,2022-08-01,0,SD1022,Female,1985,,,2,ALTRA_healthy,-2000,CON1,CON102,CON1
3,3,ALTRA_healthy,,KT02287,2022-05-01,0,SD1015,Female,1963,,,0,ALTRA_healthy,-2000,CON1,CON103,CON1


Unnamed: 0_level_0,sample.sampleKitGuid,s1_ids,Status_new
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,KT02284,CON101,CON1
2,KT02286,CON102,CON1
3,KT02287,CON103,CON1


In [11]:
# Add s1_ids and Status_new to each row, matched on the sample kit id
A3_clr1 <- A3_clr %>%
  left_join(a1, by = "sample.sampleKitGuid")
dim(A3_clr1)
head(A3_clr1, 3)

Unnamed: 0_level_0,sample.sampleKitGuid,Status_Long.x,X,BMI,days_to_conversion,ageAtDraw,subject.biologicalSex,subject.subjectGuid,bmi_conv,age_conv,⋯,Status_Xsec,Age2023,anti_ccp3,anti_ccp3_finalCombined,anti_ccp31,clusters,frequency_live,clr,s1_ids,Status_new
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,⋯,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,KT00052,pre,3,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster2,0.09621993,0.9048206,ARI36,LONG
2,KT00052,pre,1,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster0,0.10423826,0.9848633,ARI36,LONG
3,KT00052,pre,4,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster3,0.06872852,0.5683484,ARI36,LONG


In [12]:
## keep only samples with days_to_conversion of -750 or later
A3_clr_v1 <- filter(A3_clr1, days_to_conversion >= -750)
dim(A3_clr_v1)

In [13]:
## drop the male samples before running the lm model
A3_clr_v2 <- filter(A3_clr_v1, subject.biologicalSex != "Male")
dim(A3_clr_v2)

In [14]:
head(A3_clr_v2)

Unnamed: 0_level_0,sample.sampleKitGuid,Status_Long.x,X,BMI,days_to_conversion,ageAtDraw,subject.biologicalSex,subject.subjectGuid,bmi_conv,age_conv,⋯,Status_Xsec,Age2023,anti_ccp3,anti_ccp3_finalCombined,anti_ccp31,clusters,frequency_live,clr,s1_ids,Status_new
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,⋯,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,KT00052,pre,3,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster2,0.09621993,0.9048206,ARI36,LONG
2,KT00052,pre,1,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster0,0.10423826,0.9848633,ARI36,LONG
3,KT00052,pre,4,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster3,0.06872852,0.5683484,ARI36,LONG
4,KT00052,pre,13,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster12,0.04696449,0.1875759,ARI36,LONG
5,KT00052,pre,2,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster1,0.0707331,0.5970978,ARI36,LONG
6,KT00052,pre,17,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster16,0.02290951,-0.5302639,ARI36,LONG


In [15]:
# CLR results: per-cluster mixed model of clr with a random intercept per subject
CLR_result <- LM_on_clusters(
  A3_clr_v2,
  "clusters",
  clr ~ age_conv + bmi_conv + days_to_conversion + (1 | subject.subjectGuid),
  "subject.subjectGuid"
)
CLR_result

boundary (singular) fit: see help('isSingular')

boundary (singular) fit: see help('isSingular')



Unnamed: 0_level_0,Estimate,Std. Error,df,t value,Pval_lm_DTC,p_adj_BH
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cluster3,0.0007396233,0.0002207685,46.0,3.35022125,0.001620179,0.03078339
cluster14,0.0006451755,0.0002390432,39.5441,2.69899078,0.010179236,0.09670275
cluster11,0.000729653,0.0002898915,42.90755,2.51698667,0.015653879,0.09914123
cluster7,0.0005710264,0.0002409182,46.0,2.37020882,0.022026453,0.10462565
cluster10,-0.0005955815,0.0002851934,43.50566,-2.08834241,0.042653896,0.15687735
cluster6,-0.0005620883,0.0002826097,44.69436,-1.98892071,0.052852712,0.15687735
cluster8,-0.0005438683,0.0002792639,44.52471,-1.94750658,0.057796919,0.15687735
cluster9,-0.0003046205,0.0001732263,45.72862,-1.75851174,0.085351511,0.20270984
cluster5,-0.0003683013,0.0002425921,45.10196,-1.51819162,0.135945249,0.27260336
cluster16,-0.0005347036,0.0003592527,45.99971,-1.48837744,0.143475452,0.27260336


In [16]:
# Save the CLR model results.
# The path is assembled with file.path() so it stays one unbroken string: a
# line break inside a quoted path would put a newline and padding spaces into
# the output file name.
write.csv(
  CLR_result,
  file = file.path(
    "/home/jupyter/ALTRA_ANALYSIS/Deep_clean/certpro/output_files",
    "CD4T_mem_certpro_LM_results_longitudinal_female_res_0_4_CLR.csv"
  )
)

In [17]:
head(A3_clr_v2, 3)

Unnamed: 0_level_0,sample.sampleKitGuid,Status_Long.x,X,BMI,days_to_conversion,ageAtDraw,subject.biologicalSex,subject.subjectGuid,bmi_conv,age_conv,⋯,Status_Xsec,Age2023,anti_ccp3,anti_ccp3_finalCombined,anti_ccp31,clusters,frequency_live,clr,s1_ids,Status_new
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,⋯,<chr>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,KT00052,pre,3,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster2,0.09621993,0.9048206,ARI36,LONG
2,KT00052,pre,1,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster0,0.10423826,0.9848633,ARI36,LONG
3,KT00052,pre,4,24.65716,-714,56,Female,CU1009,25.64892,58,⋯,at_risk,60,88,88,74,cluster3,0.06872852,0.5683484,ARI36,LONG


In [18]:
# Frequency results: same per-cluster mixed model, with frequency_live as the response
frequncy_result <- LM_on_clusters(
  A3_clr_v2,
  "clusters",
  frequency_live ~ age_conv + bmi_conv + days_to_conversion + (1 | subject.subjectGuid),
  "subject.subjectGuid"
)
frequncy_result

boundary (singular) fit: see help('isSingular')



Unnamed: 0_level_0,Estimate,Std. Error,df,t value,Pval_lm_DTC,p_adj_BH
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cluster3,4.972718e-05,1.53073e-05,45.99791,3.248592,0.002169847,0.04122709
cluster9,-2.046281e-05,8.253105e-06,44.04027,-2.479407,0.017058706,0.11186936
cluster7,3.806799e-05,1.578683e-05,46.0,2.4113758,0.019942453,0.11186936
cluster14,2.743627e-05,1.166386e-05,40.93061,2.3522471,0.023551444,0.11186936
cluster10,-2.451176e-05,1.181381e-05,40.22011,-2.0748396,0.044441705,0.13576276
cluster11,3.234471e-05,1.566561e-05,43.65912,2.064696,0.044923127,0.13576276
cluster17,-5.016243e-06,2.489275e-06,44.04736,-2.0151423,0.050017859,0.13576276
cluster18,-9.183091e-07,4.869945e-07,45.4768,-1.8856662,0.065737148,0.14737398
cluster8,-2.421384e-05,1.302339e-05,43.20965,-1.8592576,0.069808728,0.14737398
cluster6,-2.165236e-05,1.289313e-05,41.87024,-1.6793718,0.100528001,0.17991496


In [19]:
# Save the frequency model results.
# The path is assembled with file.path() so it stays one unbroken string: a
# line break inside a quoted path would put a newline and padding spaces into
# the output file name.
write.csv(
  frequncy_result,
  file = file.path(
    "/home/jupyter/ALTRA_ANALYSIS/Deep_clean/certpro/output_files",
    "CD4T_mem_certpro_LM_results_longitudinal_female_res_0_4_Frequency.csv"
  )
)