# Co-occurrence network using SparCC

In [1]:
setwd("/mnt/c/git_repos/blood_microbial_signatures/")
require(tidyverse)
require(ggplot2)
require(data.table)
require(igraph)
require(Matrix)
require(SpiecEasi)
require(foreach)

Loading required package: tidyverse

“package ‘tidyverse’ was built under R version 4.1.3”
“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.1.0
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 1.0.0
“package ‘ggplot2’ was built under R version 4.1.3”
“package ‘tibble’ was built under R version 4.1.3”
“package ‘tidyr’ was built under R version 4.1.3”
“package ‘readr’ was built under R version 4.1.3”
“package ‘purrr’ was built under R version 4.1.3”
“package ‘dplyr’ was built under R version 4.1.3”
“package ‘stringr’ was built under R version 4.1.3”
“package ‘forcats’ was built under R 

In [2]:
# Print the R version, platform and attached package versions for this session
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/cscstan/miniconda3/envs/r_env/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] foreach_1.5.2     SpiecEasi_1.1.2   Matrix_1.5-3      igraph_1.3.5     
 [5] data.table_1.14.6 forcats_1.0.0     stringr_1.5.0     dplyr_1.1.0      
 [9] purrr_1.0.1       readr_2.1.3       tidyr_1.3.0       tibble_3.1.8     
[13] ggplot2_3.4.0     tidyverse_1.3.2  

loaded via a namespace (and not attached):
 [1] httr_1.4.4          VGAM_1.1-7          j

### Load  data

In [3]:
# Load the raw and decontaminated read matrices, the prevalence table and the
# sample metadata, drop sparse samples, then split the raw matrix by cohort.
raw_df <- fread("results/decontamination/read_matrix.raw.zeroed.csv")
raw_prev <- fread("results/decontamination/prevalence_RA0.005_read10.csv") %>%
    column_to_rownames("npm_research_id")

species_df <- fread("results/decontamination/read_matrix_n117.global_decontaminated.zeroed.csv")
nc <- colnames(species_df)[colnames(species_df) != "npm_research_id"]
# Same prevalence table, restricted to the taxa columns of the decontaminated
# matrix (the file is read once and reused instead of being parsed twice)
species_prev <- raw_prev %>%
    select(all_of(nc))
meta <- fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv")
meta_filt <- meta %>% filter(npm_research_id %in% raw_df$npm_research_id)

cohorts <- unique(meta_filt$site_supplying_sample)
cohorts <- cohorts[cohorts != "GUSTO"]
# cohorts <- cohorts[cohorts != "SSMP"]

# Remove samples with less than 2 taxa.
# Samples are matched on npm_research_id rather than by row position, so the
# filter stays correct even if the prevalence table and the read matrices are
# not in the same row order.
raw_retain <- rowSums(raw_prev) >= 2
raw_keep_ids <- rownames(raw_prev)[raw_retain]
raw_df <- raw_df %>% filter(npm_research_id %in% raw_keep_ids)

decon_retain <- rowSums(species_prev) >= 2
decon_keep_ids <- rownames(species_prev)[decon_retain]
species_df <- species_df %>% filter(npm_research_id %in% decon_keep_ids)

# One raw read matrix per cohort, collected into a list named by cohort
morsels <- foreach (cohort = cohorts, .combine = "c") %do% {
    id_filt <- (meta_filt %>% filter(site_supplying_sample == cohort))$npm_research_id
    X_filt <- raw_df %>% filter(npm_research_id %in% id_filt)
    setNames(list(X_filt), cohort)
}

cohort_data <- c(morsels)
dim(species_df)
dim(raw_df)
length(cohorts)

In [4]:
#' Build a thresholded SparCC co-occurrence graph.
#'
#' @param X Data frame with an `npm_research_id` column; every other column is
#'   passed to `sparcc()` (presumably one column of read counts per taxon).
#' @param corr_t Absolute-correlation cutoff. Taxon pairs whose |correlation|
#'   is below this value get no edge.
#' @return An igraph object built from the thresholded correlation matrix,
#'   with vertices named after the columns of `X` and all edgeless vertices
#'   removed.
get_graph <- function(X, corr_t) {
    stopifnot(is.numeric(corr_t), length(corr_t) == 1)

    # Fix the RNG seed before running SparCC
    set.seed(666)

    X_filt <- X %>%
        select(-npm_research_id)

    # Run SparCC
    sparcc_fit <- sparcc(X_filt)

    ## Apply the (arbitrary) correlation threshold and drop self-correlations
    corr_mat <- sparcc_fit$Cor
    corr_mat[abs(corr_mat) < corr_t] <- 0
    diag(corr_mat) <- 0
    corr_mat <- Matrix(corr_mat, sparse = TRUE)

    ## Create igraph object.
    ## vertex.attr must be a named *list* of attributes; a named vector would
    ## be read as one separate attribute per element.
    ig_sparcc <- adj2igraph(corr_mat,
                            vertex.attr = list(name = colnames(X_filt)))

    # Remove edgeless vertices
    isolated <- V(ig_sparcc)[degree(ig_sparcc) == 0]
    delete.vertices(ig_sparcc, isolated)
}


### Intersection of cohort-level graphs made from raw taxonomic profiles

In [5]:
# Report how many samples each cohort contributes to the raw profiles
for (cohort in cohorts) {
    n_samples <- nrow(cohort_data[[cohort]])
    print(str_glue("{cohort} has {n_samples} samples"))
}

HELIOS has 2282 samples
MEC has 2746 samples
PRISM has 1257 samples
TTSH has 629 samples
SEED has 590 samples


In [6]:
#' Build one SparCC graph per cohort and intersect them.
#'
#' Reads the globals `cohorts` (cohort names) and `cohort_data` (list of
#' per-cohort data frames, named by cohort).
#'
#' @param corr_t Absolute-correlation cutoff forwarded to `get_graph()`.
#' @return An unnamed list of two elements: `[[1]]` the intersection of all
#'   cohort graphs, `[[2]]` the list of per-cohort graphs named by cohort.
get_ix_graph <- function(corr_t) {
    stopifnot(length(cohorts) > 0)

    raw_graphs <- lapply(cohorts, function(cohort) {
        get_graph(cohort_data[[cohort]], corr_t)
    })
    names(raw_graphs) <- cohorts

    # Intersect however many cohort graphs there are, instead of assuming
    # exactly five. Graphs are passed positionally (unnamed), as before.
    ix_graph <- do.call(
        igraph::intersection,
        c(unname(raw_graphs), list(keep.all.vertices = FALSE))
    )

    list(ix_graph, raw_graphs)
}

In [7]:
# Cohort-intersection graphs at three correlation cutoffs; the first element
# of each result is the intersection graph itself.
gx1_list <- get_ix_graph(corr_t = 0.05)
gx1 <- gx1_list[[1L]]

gx2_list <- get_ix_graph(corr_t = 0.1)
gx2 <- gx2_list[[1L]]

gx3_list <- get_ix_graph(corr_t = 0.2)
gx3 <- gx3_list[[1L]]

# Per-cohort graphs at the 0.2 cutoff
gx3_raw_graphs <- gx3_list[[2L]]

In [8]:
# Show which cohorts have a per-cohort graph at the 0.2 cutoff
print(names(gx3_raw_graphs))

[1] "HELIOS" "MEC"    "PRISM"  "TTSH"   "SEED"  


### Graphs made from decontaminated taxonomic profiles

In [9]:
# Graphs of the decontaminated profiles at increasing correlation cutoffs
g1 <- get_graph(X = species_df, corr_t = 0.05)
g2 <- get_graph(X = species_df, corr_t = 0.1)
g3 <- get_graph(X = species_df, corr_t = 0.2)
g4 <- get_graph(X = species_df, corr_t = 0.3)

In [10]:
# NOTE(review): this cell repeats the g3/g4 calls from the previous cell with
# the same inputs; it looks redundant unless species_df changed in between.
g3 <- get_graph(species_df, 0.2)
g4 <- get_graph(species_df, 0.3)

In [11]:
# List the edges retained at the 0.2 and 0.3 cutoffs
E(g3)
E(g4)

+ 19/19 edges from 75e2ec2 (vertex names):
 [1] Moraxella osloensis         --Acinetobacter schindleri          
 [2] Staphylococcus epidermidis  --Corynebacterium tuberculostearicum
 [3] Gardnerella vaginalis       --Fannyhessea vaginae               
 [4] Bifidobacterium adolescentis--Bifidobacterium longum            
 [5] Bifidobacterium adolescentis--Bifidobacterium bifidum           
 [6] Bifidobacterium adolescentis--Faecalibacterium prausnitzii      
 [7] Bifidobacterium adolescentis--Phocaeicola vulgatus              
 [8] Bifidobacterium adolescentis--Collinsella aerofaciens           
 [9] Bifidobacterium longum      --Bifidobacterium bifidum           
[10] Bifidobacterium longum      --Bifidobacterium breve             
+ ... omitted several edges

+ 4/4 edges from 1153160 (vertex names):
[1] Bifidobacterium longum  --Bifidobacterium bifidum 
[2] Bifidobacterium longum  --Bifidobacterium breve   
[3] Bifidobacterium bifidum --Bifidobacterium breve   
[4] Human betaherpesvirus 6B--Human betaherpesvirus 6A

### Save plots

In [54]:
#' Draw a graph in a circular layout and save it as a PDF.
#'
#' @param g An igraph graph whose edges carry a numeric `weight` attribute.
#' @param save_path Path of the PDF file to write.
#' @return The value returned by `dev.off()` after the plot is written.
plot_and_save <- function(g, save_path) {
    # Color edges by the sign of their weight
    E(g)$color[E(g)$weight < 0] <- "steelblue4"
    E(g)$color[E(g)$weight > 0] <- "darkolivegreen3"

    set.seed(666)
    pdf(file = save_path,
        width = 6.67,
        height = 6.67)

    # If plot() fails, still close the PDF device so it is not left open.
    # On success the device is closed explicitly below and this is a no-op.
    device_open <- TRUE
    on.exit(if (device_open) dev.off(), add = TRUE)

    plot(g,
         margin = 0,
#          layout = layout.fruchterman.reingold(g),
         layout = layout_in_circle(g, order = V(g)),
         vertex.color = "grey68",
         vertex.size = 8,
         vertex.label = NA,
         vertex.label.cex = 1,
         vertex.label.color = "black",
         vertex.frame.color = NA,
         vertex.label.dist = 1)

    device_open <- FALSE
    dev.off()
}

# Plot each decontaminated-profile graph and export its edge list as figure
# source data: one PDF and one "ncol" text file per correlation cutoff.
decon_graphs <- list("0.05" = g1, "0.1" = g2, "0.2" = g3, "0.3" = g4)

for (corr_label in names(decon_graphs)) {
    plot_and_save(
        decon_graphs[[corr_label]],
        paste0("results/coocurrence/SparCC_", corr_label, ".pdf")
    )

    # Replace spaces in taxon names before export; the exported file is later
    # parsed as whitespace-delimited columns.
    g_test <- decon_graphs[[corr_label]]
    V(g_test)$name <- gsub(" ", "_", names(V(g_test)))
    write_graph(
        g_test,
        paste0("results/figure_source_data_nat_micro/SparCC_", corr_label, ".txt"),
        "ncol"
    )
}

# plot_and_save(gx1, "results/coocurrence/SparCC_intersection_0.05.pdf")
# write_graph(gx1, "results/figure_source_data_nat_micro/SparCC_intersection_0.05.txt", "edgelist")

# plot_and_save(gx2, "results/coocurrence/SparCC_intersection_0.1.pdf")
# write_graph(gx2, "results/figure_source_data_nat_micro/SparCC_intersection_0.1.txt", "edgelist")

# plot_and_save(gx3, "results/coocurrence/SparCC_intersection_0.2.pdf")
# write_graph(gx3, "results/figure_source_data_nat_micro/SparCC_intersection_0.2.txt", "edgelist")

#### Plot graphs for each cohort

In [66]:
# Plot and export the per-cohort graphs built at the 0.2 cutoff.
cohort_names <- names(gx3_raw_graphs)

# Iterate over the names directly: seq(length(x)) would yield c(1, 0) for an
# empty list and run the body with invalid indices.
for (cohort_name in cohort_names) {
    g_temp <- gx3_raw_graphs[[cohort_name]]
    plot_and_save(g_temp, str_glue("results/coocurrence/SparCC_{cohort_name}_0.2.pdf"))

    # Replace spaces in taxon names before export; the exported file is later
    # parsed as whitespace-delimited columns.
    g_test <- g_temp
    V(g_test)$name <- gsub(" ", "_", names(V(g_test)))
    write_graph(g_test, str_glue("results/figure_source_data_nat_micro/SparCC_{cohort_name}_0.2.txt"), "ncol")
}


In [67]:
# Parse txt to csv
# Only pick up the exported ".txt" edge lists. Without the anchored pattern,
# any other file containing "SparCC" would be read and then overwritten in
# place, because the extension substitution below would not change its path.
all_graphs <- list.files(
    "results/figure_source_data_nat_micro",
    pattern = "SparCC.*\\.txt$",
    full.names = TRUE
)
for (g_path in all_graphs) {
    g_save <- sub("\\.txt$", ".csv", g_path)
    # NOTE(review): V3 holds the edge weight written by write_graph(); the
    # column is labelled "SpiecEasi_correlation" although the values come from
    # sparcc() - confirm the intended label before renaming.
    read.table(g_path) %>%
        rename(SpiecEasi_correlation = V3) %>%
        fwrite(g_save)
}

### GUSTO only

In [None]:
# gusto_meta <- meta_filt %>% 
#     filter(site_supplying_sample == "GUSTO")
# gusto <- get_graph(species_df %>% filter(npm_research_id %in% gusto_meta$npm_research_id) , 0.3)
# plot_and_save(gusto, str_glue("results/coocurrence/SparCC_GUSTO_decontaminated_0.3.pdf"))