# Co-occurrence network using SparCC

In [1]:
# NOTE(review): hard-coded, machine-specific working directory. The relative
# "results/..." and "data/..." paths used in this notebook resolve against it,
# so it must be adjusted when running on another machine.
setwd("/mnt/c/Users/Cedric/Desktop/git_repos/blood_microbiome")

# Attach dependencies with library() rather than require(): library() stops
# with an error when a package is missing, whereas require() only warns and
# returns FALSE, letting the notebook fail later in a less obvious place.
library(tidyverse)
library(ggplot2)
library(data.table)
library(igraph)
library(Matrix)
library(SpiecEasi)
library(foreach)

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: data.table


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘p

In [2]:
# Print the R version, platform and attached package versions for this session.
sessionInfo()

R version 4.1.0 (2021-05-18)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /home/csctan/miniconda3/envs/R-environment/lib/libopenblasp-r0.3.17.so

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] foreach_1.5.2     SpiecEasi_1.1.2   Matrix_1.3-4      igraph_1.2.9     
 [5] data.table_1.14.2 forcats_0.5.1     stringr_1.4.0     dplyr_1.0.7      
 [9] purrr_0.3.4       readr_2.1.0       tidyr_1.1.4       tibble_3.1.6     
[13] ggplot2_3.3.5     tidyverse_1.3.1  

loaded via a namespace (and not attached):
 [1] httr_1.4.2       VGAM_1.1-6       

### Load data

In [28]:
# NOTE(review): ad-hoc ratio whose meaning is not stated and whose result is
# not assigned — confirm whether this cell is still needed.
1526/8828

In [65]:
# Raw read matrix (one row per sample, keyed by `npm_research_id`) and the
# prevalence table built with the RA >= 0.005 / >= 10 reads thresholds encoded
# in its file name. The sample IDs become row names of the prevalence table.
raw_df <- fread("results/decontamination/read_matrix.raw.zeroed.csv")
raw_prev <- fread("results/decontamination/prevalence_RA0.005_read10.csv") %>%
    column_to_rownames("npm_research_id")

# Decontaminated read matrix; `nc` holds its taxon columns (everything except
# the sample ID) and is used to subset the prevalence table to the same taxa.
species_df <- fread("results/decontamination/read_matrix_n122.global_decontaminated.zeroed.csv")
nc <- colnames(species_df)[colnames(species_df) != "npm_research_id"]
# Reuse the prevalence table already in memory instead of parsing the same
# CSV a second time; select() keeps the row names.
species_prev <- raw_prev %>%
    select(all_of(nc))

meta <- fread("data/SG10K_Health_metadata.n10714.16March2021.parsed.csv")
meta_filt <- meta %>% filter(npm_research_id %in% raw_df$npm_research_id)

# Cohorts to build per-cohort graphs for; GUSTO and SSMP are excluded.
cohorts <- unique(meta_filt$site_supplying_sample)
cohorts <- cohorts[!cohorts %in% c("GUSTO", "SSMP")]

# Remove samples with less than 2 taxa.
# The retained samples are identified by ID (the prevalence table's row names)
# rather than by applying a positional logical mask to a different table, so
# the filter stays correct even if the two files are not in the same row order.
# Presumably the prevalence table holds presence/absence flags, making the row
# sum a taxon count — TODO confirm.
raw_keep_ids <- rownames(raw_prev)[which(rowSums(raw_prev) >= 2)]
raw_df <- raw_df %>% filter(npm_research_id %in% raw_keep_ids)

decon_keep_ids <- rownames(species_prev)[which(rowSums(species_prev) >= 2)]
species_df <- species_df %>% filter(npm_research_id %in% decon_keep_ids)

# Split the raw read matrix into a list with one element per cohort, named by
# cohort.
morsels <- foreach (cohort = cohorts, .combine = "c") %do% {
    id_filt <- (meta_filt %>% filter(site_supplying_sample == cohort))$npm_research_id
    X_filt <- raw_df %>% filter(npm_research_id %in% id_filt)
    temp_list <- list(X_filt)
    names(temp_list) <- cohort
    temp_list
}

cohort_data <- morsels
dim(species_df)
dim(raw_df)
length(cohorts)

In [66]:
# Build a thresholded SparCC correlation graph from a sample-by-taxon table.
#
# Args:
#   X:      data frame with an `npm_research_id` column plus one column per
#           taxon (presumably read counts — confirm against caller).
#   corr_t: cut-off on the absolute SparCC correlation; matrix entries with
#           |correlation| < corr_t are set to 0 before the graph is built.
#
# Returns:
#   The igraph object produced by SpiecEasi::adj2igraph() from the thresholded
#   matrix, with vertices named after the taxon columns and all vertices of
#   degree 0 removed.
get_graph <- function(X, corr_t) {
    # Fixed seed so repeated calls give the same graph (presumably because
    # sparcc() involves random sampling). Note this resets the global RNG.
    set.seed(666)

    X_filt <- X %>%
        select(-npm_research_id)
    taxa <- colnames(X_filt)

    # Run SparCC
    sparcc_fit <- sparcc(X_filt)

    ## Define arbitrary threshold for SparCC correlation matrix for the graph
    adj <- sparcc_fit$Cor
    adj[abs(adj) < corr_t] <- 0
    diag(adj) <- 0
    adj <- Matrix(adj, sparse = TRUE)

    ## Create igraph object
    # Pass the vertex names as list(name = ...), the form adj2igraph() documents.
    # The previous named integer vector was interpreted as one separate vertex
    # attribute per taxon instead of a single `name` attribute.
    ig <- adj2igraph(adj, vertex.attr = list(name = taxa))
    V(ig)$name <- taxa

    # Remove edgeless vertices
    isolated <- V(ig)[igraph::degree(ig) == 0]
    delete.vertices(ig, isolated)
}


### Intersection of cohort-level graphs made from raw taxonomic profiles

In [67]:
# Report how many samples each retained cohort contributes.
walk(cohorts, function(cohort_id) {
    sample_count <- nrow(cohort_data[[cohort_id]])
    print(str_glue("{cohort_id} has {sample_count} samples"))
})

HELIOS has 2281 samples
MEC has 2693 samples
PRISM has 1248 samples
TTSH has 628 samples
SERI has 590 samples


In [None]:
# Build one SparCC graph per cohort and intersect them.
#
# Reads the global `cohorts` (cohort names) and `cohort_data` (list of
# per-cohort tables keyed by cohort name) and calls get_graph() on each.
#
# Args:
#   corr_t: correlation cut-off forwarded to get_graph().
#
# Returns:
#   An unnamed list of length 2:
#     [[1]] the intersection of all per-cohort graphs, with
#           keep.all.vertices = FALSE;
#     [[2]] the list of per-cohort graphs, named by cohort.
get_ix_graph <- function(corr_t) {
    if (length(cohorts) == 0) {
        stop("No cohorts available to build graphs from.", call. = FALSE)
    }

    # For quick debugging the per-cohort table can be subsampled first,
    # e.g. with sample_n(30).
    raw_graphs <- lapply(cohorts, function(cohort) {
        get_graph(cohort_data[[cohort]], corr_t)
    })
    names(raw_graphs) <- cohorts

    # Intersect however many cohort graphs there are, instead of hard-coding
    # exactly five positional arguments. The list is unnamed before splicing
    # so the graphs are passed positionally.
    ix_graph <- do.call(
        igraph::intersection,
        c(unname(raw_graphs), list(keep.all.vertices = FALSE))
    )

    list(ix_graph, raw_graphs)
}

# Cross-cohort intersection graphs at three correlation cut-offs.
gx1_list <- get_ix_graph(corr_t = 0.05)
gx2_list <- get_ix_graph(corr_t = 0.1)
gx3_list <- get_ix_graph(corr_t = 0.2)

# First element of each result: the intersection graph.
gx1 <- pluck(gx1_list, 1)
gx2 <- pluck(gx2_list, 1)
gx3 <- pluck(gx3_list, 1)

# Second element: the per-cohort graphs (kept only for the 0.2 cut-off).
gx3_raw_graphs <- pluck(gx3_list, 2)

### Graphs made from decontaminated taxonomic profiles

In [6]:
# SparCC graphs on the decontaminated profiles at three correlation cut-offs.
g1 <- species_df %>% get_graph(corr_t = 0.05)
g2 <- species_df %>% get_graph(corr_t = 0.1)
g3 <- species_df %>% get_graph(corr_t = 0.2)

### Save plots

In [7]:
# Plot a graph in a circular layout and write it to a PNG file.
#
# Args:
#   g:         igraph object to draw. If it has a `weight` edge attribute,
#              edges are coloured by the sign of the weight.
#   save_path: path of the PNG file to write (6.67 x 6.67 in, 300 dpi).
#
# Returns:
#   The value of dev.off() after the PNG device is closed.
plot_and_save <- function(g, save_path) {
    # Color edges based on value: negative weights blue, positive weights green.
    # Skipped when the graph has no `weight` edge attribute or no edges, which
    # would otherwise yield a zero-length replacement.
    # NOTE(review): graphs from igraph::intersection() may carry suffixed
    # weight attributes (weight_1, ...) rather than `weight` — confirm whether
    # those should be coloured too.
    edge_w <- E(g)$weight
    if (length(edge_w) > 0) {
        E(g)$color[edge_w < 0] <- "steelblue4"
        E(g)$color[edge_w > 0] <- "darkolivegreen3"
    }

    # Seed retained for the commented-out Fruchterman-Reingold layout below.
    set.seed(666)
    png(file = save_path,
        width = 6.67,
        height = 6.67,
        units = 'in',
        res = 300)

    # Make sure the PNG device is closed even if plot() fails, so a failed
    # call does not leave an open device behind.
    device_open <- TRUE
    on.exit(if (device_open) dev.off(), add = TRUE)

    plot(g,
         margin = 0,
#          layout = layout.fruchterman.reingold(g),
         layout = layout_in_circle(g, order = V(g)),
         vertex.size = 8,
         vertex.label.cex = 1,
         vertex.label.color = "black",
         vertex.frame.color = NA,
         vertex.label.dist = 1)

    device_open <- FALSE
    dev.off()
}

# Graphs built from the decontaminated profiles.
plot_and_save(g = g1, save_path = "results/coocurrence/SparCC_0.05.png")
plot_and_save(g = g2, save_path = "results/coocurrence/SparCC_0.1.png")
plot_and_save(g = g3, save_path = "results/coocurrence/SparCC_0.2.png")

# Cross-cohort intersection graphs built from the raw profiles.
plot_and_save(g = gx1, save_path = "results/coocurrence/SparCC_intersection_0.05.2.png")
plot_and_save(g = gx2, save_path = "results/coocurrence/SparCC_intersection_0.1.png")
plot_and_save(g = gx3, save_path = "results/coocurrence/SparCC_intersection_0.2.png")

#### Plot graphs for each cohort

In [8]:
# Save one plot per cohort-level graph built at the 0.2 correlation cut-off.
cohort_names <- names(gx3_raw_graphs)

# Iterate over the names directly: seq(length(x)) yields c(1, 0) for an empty
# list, which would run the loop body with invalid indices.
for (cohort_name in cohort_names) {
    g_temp <- gx3_raw_graphs[[cohort_name]]
    plot_and_save(g_temp, str_glue("results/coocurrence/SparCC_{cohort_name}_0.2.png"))
}

### GUSTO only

In [10]:
# Metadata rows for the GUSTO cohort only.
gusto_meta <- filter(meta_filt, site_supplying_sample == "GUSTO")

# Decontaminated profiles of the GUSTO samples, graphed at a 0.3 cut-off.
gusto_ids <- gusto_meta$npm_research_id
gusto_profiles <- species_df %>%
    filter(npm_research_id %in% gusto_ids)
gusto <- get_graph(gusto_profiles, 0.3)

gusto_png <- str_glue("results/coocurrence/SparCC_GUSTO_decontaminated_0.3.png")
plot_and_save(gusto, gusto_png)