# Co-occurrence Analysis

This code is ported over from ISCHIA, specifically the `ISCHIA.cooccur` function.

In [51]:
using Muon
using RData
using Rmath
using RCall
using Random
using DataFrames
using Statistics
using ProgressMeter
using Combinatorics
using ISCHIA
using Revise

In [52]:
# Load the multimodal dataset and the ligand-receptor (LR) network table.
mdata = readh5mu("../data/mudata.h5mu")
lr_network = load("../data/lr_network.rds")

# Gene names of the "SCT" modality; used below to keep only LR pairs whose genes exist there.
gene_names = mdata["SCT"].var.name
# mdata["SCT"].var_names = gene_names

# Create the LR_Pairs column ("<from>_<to>") and keep only the three columns used below
lr_network[!, :LR_Pairs] = string.(lr_network.from, "_", lr_network.to);
lr_network = lr_network[:, [:from, :to, :LR_Pairs]]

# Keep only the LR pairs whose ligand (`from`) AND receptor (`to`) are both in `gene_names`
from_filter = in.(lr_network[:, :from], Ref(gene_names))
to_filter = in.(lr_network[:, :to], Ref(gene_names))
all_LR_network = lr_network[from_filter .& to_filter, :]

# To reduce the computation time for this example, the LR interactions can be randomly
# sub-sampled by enabling the two lines below (currently disabled).

# all_LR_network = all_LR_network[shuffle(1:size(all_LR_network, 1)), :]
# all_LR_network = all_LR_network[1501:min(2000, end), :]

# Unique genes appearing in any retained LR pair, restricted to those present in `gene_names`
all_LR_genes = unique(vcat(all_LR_network[:, :from], all_LR_network[:, :to]))
all_LR_genes_comm = intersect(all_LR_genes, collect(gene_names));

# LR pair labels, plus every unordered 2-gene combination joined with "_".
# NOTE(review): `all_combos` is not used anywhere else in the visible notebook.
LR_pairs = all_LR_network[:, :LR_Pairs]
all_combos = [join(combo, "_") for combo in combinations(all_LR_genes_comm, 2)];

# Analysis parameters.
adata = mdata["Spatial"]
# Composition cluster(s) of interest
COI = ["CC4"]
# Every distinct `orig.ident` value, i.e. no restriction by condition
Condition = unique(adata.obs[!, "orig.ident"])
LR_list = all_LR_genes_comm
# NOTE(review): self-assignment is a no-op
LR_pairs = LR_pairs
# A gene counts as "present" in a spot when its count is strictly greater than exp_th
exp_th = 1
# NOTE(review): `corr_th` is not used in the visible notebook
corr_th = 0.2

println("Preparing L-R presence/absence matrix")

# Subset the expression matrix for the interested ligands and receptors
# NOTE(review): the genes are matched against the "Spatial" modality's names here, while
# `LR_list` was filtered with the "SCT" names — confirm both modalities share gene names.
spatial_obj_exp_LR_subset_raw = adata[:, in.(adata.var.name, Ref(LR_list))]

# Binarize the expression matrix based on the expression threshold
spatial_obj_exp_LR_subset_raw_binary = spatial_obj_exp_LR_subset_raw.layers["counts"] .> exp_th
spatial_obj_exp_LR_subset_raw.layers["binary"] = spatial_obj_exp_LR_subset_raw_binary

# Masks of columns / rows of the binary matrix that contain at least one `true`
LR_subset_raw_binary_mask_col = vec(sum(spatial_obj_exp_LR_subset_raw_binary, dims=1) .> 0)
LR_subset_raw_binary_mask_row = vec(sum(spatial_obj_exp_LR_subset_raw_binary, dims=2) .> 0)

# Drop the all-absent rows and columns
LR_presence_absence = spatial_obj_exp_LR_subset_raw[LR_subset_raw_binary_mask_row, LR_subset_raw_binary_mask_col]


# Filter spots based on COI and Condition
# (`Condition` holds every distinct `orig.ident`, so only the COI test restricts here.)
mask = (adata.obs[:, "CompositionCluster_CC"] .∈ Ref(COI)) .& (adata.obs[:, "orig.ident"] .∈ Ref(Condition))
COI_spots = adata.obs_names[mask]
# NOTE(review): `rest_of_spots` is not used in the visible notebook
rest_of_spots = setdiff(adata.obs_names, COI_spots)

println("Calculating L-R pairs correlation")
# Correlation between the columns of the COI spots' count matrix; NaN entries are set to 0.0.
# NOTE(review): `COI_cors` is not used in the visible notebook.
COI_cors_adata = spatial_obj_exp_LR_subset_raw[mask, :]
COI_cors = cor(Array(COI_cors_adata.layers["counts"]))
COI_cors[isnan.(COI_cors)] .= 0.0

println("Preparing for cooccurrence")
# Restrict the presence/absence data to the COI spots and transpose the binary layer so
# that the table's columns are the spots (`common_spots` supplies the column names).
common_spots = intersect(LR_presence_absence.obs_names, COI_spots)
coocur_COI = LR_presence_absence[common_spots, :]
coocur_COI_exp = DataFrame(Matrix(transpose(coocur_COI.layers["binary"])), common_spots)

# Trailing `;` suppresses the summary output
describe(coocur_COI_exp);

Preparing L-R presence/absence matrix
Calculating L-R pairs correlation


Preparing for cooccurrence


In [53]:
# Run the co-occurrence analysis on the binary table built above (columns are the COI
# spots), labelling each row with the corresponding entry of `coocur_COI.var.name`.
# NOTE(review): `calculate_cooccurrence_stats_refactored` is not defined in this notebook —
# presumably it is exported by the ISCHIA package; verify.
cooccur_COI_res = calculate_cooccurrence_stats_refactored(Matrix(coocur_COI_exp), coocur_COI.var.name; spp_names=true)

[32mCalculate Incidence  37%|████████████                    |  ETA: 0:00:00[39m[K

[32mCalculate Incidence 100%|████████████████████████████████| Time: 0:00:00[39m[K


[32mCalculate Co-occurrences   2%|█                          |  ETA: 0:00:07[39m[K

[32mCalculate Co-occurrences  26%|███████                    |  ETA: 0:00:01[39m[K

[32mCalculate Co-occurrences  63%|██████████████████         |  ETA: 0:00:00[39m[K

[32mCalculate Co-occurrences 100%|███████████████████████████| Time: 0:00:00[39m[K


CooccurOutput([1m532×11 DataFrame[0m
[1m Row [0m│[1m sp1     [0m[1m sp2     [0m[1m sp1_inc [0m[1m sp2_inc [0m[1m obs_cooccur [0m[1m prob_cooccur [0m[1m exp_cooc[0m ⋯
     │[90m Integer [0m[90m Integer [0m[90m Integer [0m[90m Integer [0m[90m Integer     [0m[90m Real         [0m[90m Real    [0m ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │       2       81        1       14            1         0.071           ⋯
   2 │       3       81        1       14            1         0.071
   3 │       4       81        1       14            1         0.071
   4 │       6       81        1       14            1         0.071
   5 │      10       81        1       14            1         0.071           ⋯
   6 │      17       81        1       14            1         0.071
   7 │      49       81        1       14            1         0.071
   8 │      64       81        1       14            1         0.071
  ⋮  │    ⋮    

summarize_cooccur

In [47]:
summarize_cooccur(cooccur_COI_res);

Of 318801 species pair combinations, 318269 pairs (99.83%) were removed from the analysis because expected co-occurrence was < 1 and
532 pairs were analyzed

Cooccurrence Summary:

Species => 799
Non-random (%) => 9.6
Sites => 14
Negative => 0
Random => 422
Positive => 51
Unclassifiable => 59


In [4]:
cooccur_COI_res.spp_names

799-element Vector{String}:
 "AGRN"
 "TNFRSF4"
 "TNFRSF14"
 "TNFRSF25"
 "TNFRSF9"
 "CLSTN1"
 "AGTRAP"
 "TNFRSF1B"
 "EPHA2"
 "HSPG2"
 ⋮
 "EFNB1"
 "DLG3"
 "IL2RG"
 "CXCR3"
 "P2RY10"
 "AMOT"
 "GPC4"
 "L1CAM"
 "PLXNA3"

In [14]:
c.spp_key = cooccur_COI_res[:spp_key]

Row,num,spp
Unnamed: 0_level_1,Int64,String
1,1,AGRN
2,2,TNFRSF4
3,3,TNFRSF14
4,4,TNFRSF25
5,5,TNFRSF9
6,6,CLSTN1
7,7,AGTRAP
8,8,TNFRSF1B
9,9,EPHA2
10,10,HSPG2


In [16]:
c.spp_names = c.spp_key.num

799-element Vector{Int64}:
   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
   ⋮
 791
 792
 793
 794
 795
 796
 797
 798
 799

In [54]:
cooccur_COI_res

Dict{Symbol, Any} with 15 entries:
  :percent_sig          => 9.58647
  :pairs                => 532
  :pot_pairs            => 318801
  :spp_names            => ["AGRN", "TNFRSF4", "TNFRSF14", "TNFRSF25", "TNFRSF9…
  :sites                => [14 14 … 14 14; 14 14 … 14 14; … ; 14 14 … 14 14; 14…
  :species              => 799
  :true_rand_classifier => 0.1
  :negative             => 0
  :co_occurrences       => 51
  :random               => 422
  :unclassifiable       => 59
  :results              => [1m532×11 DataFrame[0m[0m…
  :positive             => 51
  :spp_key              => [1m799×2 DataFrame[0m[0m…
  :omitted              => 318269

In [16]:
axes(cooccur_COI_res.results, 1)

Base.OneTo(532)

In [17]:
size(cooccur_COI_res.results, 1)

532

In [29]:
cooccurr = cooccur_COI_res
ptab = cooccurr[:results]
"sp1_name" in names(ptab)

true

In [30]:
# Label each pair with species names when the results table carries them; otherwise fall
# back to the numeric species ids.
has_species_names = "sp1_name" in names(ptab)
sp1 = has_species_names ? string.(ptab.sp1_name) : ptab.sp1
sp2 = has_species_names ? string.(ptab.sp2_name) : ptab.sp2

532-element Vector{String}:
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "PIGR"
 "CD46"
 ⋮
 "RPS19"
 "TIMP1"
 "RPS19"
 "APOE"
 "RRBP1"
 "TNFRSF6B"
 "COL18A1"
 "MIF"
 "TIMP1"

In [34]:
length(sp2)

532

In [36]:
# Effect size of every analysed pair: observed minus expected co-occurrences. When
# `standardized` is true the difference is divided by the pair's entry in the
# species × species site-count matrix (`cooccurr[:sites]`).
# Relies on `cooccurr`, `sp1` and `sp2` from the previous cells.
standardized = true

rawtab = cooccurr[:results]
rawsp1 = rawtab.sp1
rawsp2 = rawtab.sp2
rawobs = rawtab.obs_cooccur
rawexp = rawtab.exp_cooccur

if standardized
    Nmat = cooccurr[:sites]
    # Typed comprehension keeps the `effects` column concretely typed (Float64)
    effects = Float64[
        (obs - expected) / Nmat[a, b] for
        (a, b, obs, expected) in zip(rawsp1, rawsp2, rawobs, rawexp)
    ]
else
    effects = Float64[obs - expected for (obs, expected) in zip(rawobs, rawexp)]
end

effs = DataFrame(sp1=sp1, sp2=sp2, effects=effects)

In [52]:
"""
    effect_sizes(cooccur; standardized=true)

Compute the effect size (observed minus expected co-occurrence) of every species pair
in a co-occurrence analysis result.

# Arguments
- `cooccur`: A co-occurrence analysis result that can be indexed with `:results` (a table
  with the columns `sp1`, `sp2`, `obs_cooccur`, `exp_cooccur` and, optionally, `sp1_name`
  and `sp2_name`) and with `:sites` (a species × species matrix of site counts).
- `standardized`: When `true`, each effect is divided by the `:sites` entry of the pair.

# Returns
A `DataFrame` with the columns `species1`, `species2` and `effects` (`Float64`), one row
per row of `cooccur[:results]`. Species are labelled with `sp1_name`/`sp2_name` when those
columns exist, otherwise with the numeric ids `sp1`/`sp2`.

# Example
```julia
effect_sizes(cooccur_object)
```
"""
function effect_sizes(cooccur; standardized=true)
    results = cooccur[:results]

    if "sp1_name" in names(results)
        species1 = string.(results.sp1_name)
        species2 = string.(results.sp2_name)
    else
        # No name columns: fall back to the numeric species ids
        species1 = results.sp1
        species2 = results.sp2
    end

    observed = results.obs_cooccur
    expected = results.exp_cooccur

    if standardized
        site_matrix = cooccur[:sites]
        # The site count is looked up with the numeric ids, never with the labels
        effects = Float64[
            (obs - ex) / site_matrix[a, b] for
            (a, b, obs, ex) in zip(results.sp1, results.sp2, observed, expected)
        ]
    else
        effects = Float64[obs - ex for (obs, ex) in zip(observed, expected)]
    end

    return DataFrame(species1=species1, species2=species2, effects=effects)
end

effect_sizes

In [53]:
effect_sizes(cooccur_COI_res; standardized=true)

Row,species1,species2,effects
Unnamed: 0_level_1,String,String,Float64
1,TNFRSF4,PIGR,0.0
2,TNFRSF14,PIGR,0.0
3,TNFRSF25,PIGR,0.0
4,CLSTN1,PIGR,0.0
5,HSPG2,PIGR,0.0
6,YARS,PIGR,0.0
7,ADAM15,PIGR,0.0
8,CD247,PIGR,0.0
9,PTPRC,PIGR,0.0
10,PIGR,CD46,0.0


## Co-occurrence Workbench

In [3]:
"""
    calculate_cooccurrence_matrix(mat)

Calculate the matrix N of potential co-occurrence sites from a binary species-site matrix.

For every pair of distinct species (rows) `i` and `j`, `N[i, j]` is the sum over sites
(columns) of `mat[i, site] * mat[j, site]`, i.e. the number of sites where both rows are 1.
In `mat`, 1 represents potential occupancy and 0 indicates that the species cannot occur.

# Arguments
- `mat::AbstractMatrix{<:Integer}`: A binary species by site matrix (`Int` or `Bool`).

# Returns
A symmetric species by species `Matrix{Int}`. The diagonal is left at 0.

# Examples
```julia
species_matrix = rand(Bool, 5, 10)
N = calculate_cooccurrence_matrix(species_matrix)
```
"""
function calculate_cooccurrence_matrix(mat::AbstractMatrix{<:Integer})
    num_species = size(mat, 1)
    cooccurrence_matrix = zeros(Int, num_species, num_species)

    for i in 1:num_species
        row_i = view(mat, i, :)
        for j in (i + 1):num_species
            shared_sites = sum(row_i .* view(mat, j, :))
            # Fill both triangles so lookups work regardless of index order
            cooccurrence_matrix[i, j] = shared_sites
            cooccurrence_matrix[j, i] = shared_sites
        end
    end

    return cooccurrence_matrix
end

# Example usage (disabled):
# finches = load("../data/finches.rda")["finches"]
# N_matrix = Int.(rand(Bool, nrow(finches), ncol(finches)))
# calculate_cooccurrence_matrix(N_matrix)

calculate_cooccurrence_matrix

In [4]:
"""
    calculate_conditional_probability(successes, min_successes, max_successes, total_trials)

Calculate a hypergeometric point probability using binomial coefficients:

    C(max_successes, successes) * C(total_trials - max_successes, min_successes - successes)
    ----------------------------------------------------------------------------------------
                              C(total_trials, min_successes)

The binomial coefficients are evaluated with arbitrary-precision integers, so the result
does not overflow for large `total_trials`.

# Arguments
- `successes::Int`: The number of successful trials.
- `min_successes::Int`: The minimum number of successful trials.
- `max_successes::Int`: The maximum number of successful trials.
- `total_trials::Int`: The total number of trials.

# Returns
The calculated conditional probability as a `Float64`.
"""
function calculate_conditional_probability(
    successes::Int, min_successes::Int, max_successes::Int, total_trials::Int
    )::Real

    # `big(...)` switches `binomial` to BigInt arithmetic; with `Int` arguments it throws
    # an OverflowError (and the product below could wrap) once the counts get large.
    numerator = binomial(big(max_successes), successes) *
        binomial(big(total_trials - max_successes), min_successes - successes)

    denominator = binomial(big(total_trials), min_successes)

    # BigInt / BigInt is a BigFloat; convert back to the Float64 callers expect
    return Float64(numerator / denominator)
end

# calculate_conditional_probability(4, 10, 20, 50)
# Verified

calculate_conditional_probability

In [5]:
"""
    ISCHIA_cooccur(mat, row_names; type="spp_site", thresh=true, spp_names=false,
                   true_rand_classifier=0.1, prob="hyper", site_mask=nothing,
                   only_effects=false, eff_standard=true, eff_matrix=false)

Pairwise co-occurrence analysis of a binary presence/absence table.

# Arguments
- `mat`: Presence/absence data; anything `Matrix(mat)` accepts (matrix or `DataFrame`).
- `row_names`: Species names, one per species row; only used when `spp_names` is `true`.

# Keywords
- `type`: `"spp_site"` when rows of `mat` are species, `"site_spp"` when rows are sites.
- `thresh`: Drop pairs whose expected co-occurrence is below 1.
- `spp_names`: Add `sp1_name`/`sp2_name` columns and a `:spp_key` table to the output.
- `true_rand_classifier`: A non-significant pair is counted as "random" when
  `|observed - expected| <= number_of_sites * true_rand_classifier`.
- `prob`: `"hyper"` (uses `phyper`) or `"comb"` (uses
  `calculate_conditional_probability`).
- `site_mask`: Optional 0/1 matrix with the same size as the species × site matrix; sites
  where either species of a pair has 0 are ignored for that pair.
- `only_effects`: When `true`, the probability table is filled with ones for every
  feasible co-occurrence count instead of real probabilities.
- `eff_standard`, `eff_matrix`: Accepted but currently unused.

# Returns
A `Dict{Symbol,Any}` with the keys `:results` (one row per analysed pair), `:positive`,
`:negative`, `:co_occurrences`, `:pairs`, `:random`, `:unclassifiable`, `:sites`,
`:species`, `:percent_sig`, `:true_rand_classifier`, `:spp_names`, plus `:spp_key` when
`spp_names` is `true` and `:omitted`/`:pot_pairs` when `thresh` is `true`.
"""
function ISCHIA_cooccur(mat, row_names; type="spp_site", thresh=true, spp_names=false,
    true_rand_classifier=0.1, prob="hyper", site_mask=nothing,
    only_effects=false, eff_standard=true, eff_matrix=false)

    type in ("spp_site", "site_spp") || error("Invalid 'type' parameter")
    # An unknown method would otherwise leave every probability at zero without warning
    prob in ("hyper", "comb") || error("Invalid 'prob' parameter")

    # Materialise the input once, oriented species × sites, and use this single matrix
    # everywhere below so that "site_spp" input is honoured consistently.
    presence = type == "spp_site" ? Matrix(mat) : permutedims(Matrix(mat))
    nspp, tsites = size(presence)
    spp_pairs = binomial(nspp, 2)

    spp_key = spp_names ? DataFrame(num=1:nspp, spp=row_names) : nothing

    if isnothing(site_mask)
        # No mask: every site is available to every species
        occupancy = ones(Int, nspp, tsites)
        N_matrix = fill(tsites, nspp, nspp)
    elseif size(site_mask) == size(presence)
        occupancy = site_mask
        N_matrix = calculate_cooccurrence_matrix(site_mask)
    else
        error("Incorrect dimensions for 'site_mask', aborting.")
    end

    # incidence[a, b]: number of sites available to both a and b where a is present
    incidence = zeros(Int, nspp, nspp)
    @showprogress "Calculate Incidence" for spp in 1:nspp
        for spp_next in (spp + 1):nspp
            shared = occupancy[spp, :] .* occupancy[spp_next, :]
            incidence[spp, spp_next] = sum(shared .* presence[spp, :])
            incidence[spp_next, spp] = sum(shared .* presence[spp_next, :])
        end
    end

    prob_occur = incidence ./ N_matrix

    # Each table has one row per pair: columns 1-2 hold the species ids, column 3 the value
    obs_cooccur = zeros(Int, spp_pairs, 3)
    prob_cooccur = zeros(spp_pairs, 3)
    exp_cooccur = zeros(spp_pairs, 3)

    row = 0
    @showprogress "Calculate Co-occurrences" for spp in 1:nspp
        for spp_next in (spp + 1):nspp
            shared = (occupancy[spp, :] .* occupancy[spp_next, :]) .== 1
            # Explicit parentheses: `.&` binds tighter than `.==`
            pairs = sum((presence[spp, shared] .== 1) .& (presence[spp_next, shared] .== 1))
            row += 1
            obs_cooccur[row, 1] = spp
            obs_cooccur[row, 2] = spp_next
            obs_cooccur[row, 3] = pairs
            prob_cooccur[row, 1] = spp
            prob_cooccur[row, 2] = spp_next
            prob_cooccur[row, 3] = prob_occur[spp, spp_next] * prob_occur[spp_next, spp]
            exp_cooccur[row, 1] = spp
            exp_cooccur[row, 2] = spp_next
            exp_cooccur[row, 3] = prob_cooccur[row, 3] * N_matrix[spp, spp_next]
        end
    end

    if thresh
        n_pairs = size(prob_cooccur, 1)
        keep = exp_cooccur[:, 3] .>= 1
        prob_cooccur = prob_cooccur[keep, :]
        obs_cooccur = obs_cooccur[keep, :]
        exp_cooccur = exp_cooccur[keep, :]
        n_omitted = n_pairs - size(prob_cooccur, 1)
    end

    output = DataFrame(sp1=Integer[], sp2=Integer[], sp1_inc=Integer[], sp2_inc=Integer[],
        obs_cooccur=Integer[], prob_cooccur=Real[], exp_cooccur=Real[], p_lt=Real[], p_gt=Real[])

    @showprogress "Main Comp" for row in 1:size(obs_cooccur, 1)
        sp1 = obs_cooccur[row, 1]
        sp2 = obs_cooccur[row, 2]
        observed = obs_cooccur[row, 3]
        sp1_inc = incidence[sp1, sp2]
        sp2_inc = incidence[sp2, sp1]
        max_inc = max(sp1_inc, sp2_inc)
        min_inc = min(sp1_inc, sp2_inc)
        nsite = N_matrix[sp1, sp2]

        # prob_share_site[j + 1]: probability of sharing exactly j sites
        prob_share_site = zeros(Float64, nsite + 1)

        if only_effects || prob == "comb"
            for j in 0:nsite
                # Only counts compatible with both incidences are feasible
                feasible = (sp1_inc + sp2_inc) <= (nsite + j) && j <= min_inc
                feasible || continue
                prob_share_site[j + 1] = only_effects ? 1 :
                    calculate_conditional_probability(j, min_inc, max_inc, nsite)
            end
        else
            # prob == "hyper": difference the cumulative distribution into point masses
            all_probs = phyper.(0:min_inc, min_inc, nsite - min_inc, max_inc)
            prob_share_site[1] = all_probs[1]
            for j in 2:length(all_probs)
                prob_share_site[j] = all_probs[j] - all_probs[j - 1]
            end
        end

        # Tail probabilities; both tails include the observed count
        p_lt = 0.0
        p_gt = 0.0
        for j in 0:nsite
            if j <= observed
                p_lt += prob_share_site[j + 1]
            end
            if j >= observed
                p_gt += prob_share_site[j + 1]
            end
        end

        p_lt = round(p_lt, digits=5)
        p_gt = round(p_gt, digits=5)
        prob_cooccur[row, 3] = round(prob_cooccur[row, 3], digits=3)
        exp_cooccur[row, 3] = round(exp_cooccur[row, 3], digits=1)

        # A tuple keeps the integer fields as integers (a vector literal would promote)
        push!(output, (sp1, sp2, sp1_inc, sp2_inc, observed,
                       prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt, p_gt))
    end

    if spp_names
        # Join on the numeric id; the `order` column restores the original row order
        sp1_name = leftjoin(DataFrame(order = 1:length(output.sp1), sp1 = output.sp1), spp_key, on = :sp1 => :num, makeunique = true)
        sp2_name = leftjoin(DataFrame(order = 1:length(output.sp2), sp2 = output.sp2), spp_key, on = :sp2 => :num, makeunique = true)

        output.sp1_name = sp1_name[sortperm(sp1_name.order), "spp"]
        output.sp2_name = sp2_name[sortperm(sp2_name.order), "spp"]
    end

    n_analysed = size(output, 1)
    n_positive = count(x -> x < 0.05, output.p_gt)
    n_negative = count(x -> x < 0.05, output.p_lt)
    n_significant = count(x -> x.p_gt < 0.05 || x.p_lt < 0.05, eachrow(output))
    true_rand = count(x -> (x.p_gt >= 0.05 && x.p_lt >= 0.05 && abs(x.obs_cooccur - x.exp_cooccur) <= (tsites * true_rand_classifier)), eachrow(output))

    output_dict = Dict(:results => output,
                       :positive => n_positive,
                       :negative => n_negative,
                       :co_occurrences => n_significant,
                       :pairs => n_analysed,
                       :random => true_rand,
                       :unclassifiable => n_analysed - (true_rand + n_positive + n_negative),
                       :sites => N_matrix,
                       :species => nspp,
                       :percent_sig => n_significant / n_analysed * 100,
                       :true_rand_classifier => true_rand_classifier)

    if spp_names
        output_dict[:spp_key] = spp_key
        output_dict[:spp_names] = row_names
    else
        output_dict[:spp_names] = 1:nspp
    end

    if thresh
        output_dict[:omitted] = n_omitted
        output_dict[:pot_pairs] = n_pairs
    end

    # NOTE: returning effect sizes when `only_effects` is true is not implemented yet;
    # the full statistics dictionary is always returned.
    return output_dict
end

ISCHIA_cooccur (generic function with 1 method)

In [6]:
# Run the notebook-local `ISCHIA_cooccur` on the binary table (columns are the COI spots),
# labelling each row with the corresponding entry of `coocur_COI.var.name`.
output = ISCHIA_cooccur(coocur_COI_exp, coocur_COI.var.name; spp_names=true)

[32mCalculate Co-occurrences   1%|█                          |  ETA: 0:00:17[39m[K

[32mCalculate Co-occurrences 100%|███████████████████████████| Time: 0:00:00[39m[K


Dict{Symbol, Any} with 15 entries:
  :percent_sig          => 25.0
  :pairs                => 36
  :pot_pairs            => 15576
  :spp_names            => ["AGRN", "TNFRSF14", "TNFRSF1B", "EPHB2", "PTCH2", "…
  :sites                => [13 13 … 13 13; 13 13 … 13 13; … ; 13 13 … 13 13; 13…
  :species              => 177
  :true_rand_classifier => 0.1
  :negative             => 0
  :co_occurrences       => 9
  :random               => 20
  :unclassifiable       => 7
  :results              => [1m36×11 DataFrame[0m[0m…
  :positive             => 9
  :spp_key              => [1m177×2 DataFrame[0m[0m…
  :omitted              => 15540

In [32]:
output[:results]

Row,sp1,sp2,sp1_inc,sp2_inc,obs_cooccur,prob_cooccur,exp_cooccur,p_lt,p_gt,sp1_name,sp2_name
Unnamed: 0_level_1,Integer,Integer,Integer,Integer,Integer,Real,Real,Real,Real,String?,String?
1,23,36,4,5,3,0.118,1.5,0.99301,0.11888,CXCR4,RPSA
2,23,85,4,5,3,0.118,1.5,0.99301,0.11888,CXCR4,CCL21
3,23,108,4,4,2,0.095,1.2,0.94825,0.35385,CXCR4,A2M
4,23,117,4,4,2,0.095,1.2,0.94825,0.35385,CXCR4,HSP90B1
5,23,129,4,8,3,0.189,2.5,0.9021,0.48951,CXCR4,THBS1
6,23,156,4,5,4,0.118,1.5,1.0,0.00699,CXCR4,C3
7,36,60,5,3,2,0.089,1.2,0.96503,0.31469,RPSA,IL7R
8,36,85,5,5,4,0.148,1.9,0.99922,0.03186,RPSA,CCL21
9,36,103,5,3,3,0.089,1.2,1.0,0.03497,RPSA,GSTP1
10,36,108,5,4,3,0.118,1.5,0.99301,0.11888,RPSA,A2M


In [None]:
"""
    calculate_cooccurrence_stats(mat, row_names, type="spp_site", thresh=true,
                                 spp_names=false, true_rand_classifier=0.1, prob="hyper",
                                 site_mask=nothing, only_effects=false, eff_standard=true,
                                 eff_matrix=false)

Compute pairwise co-occurrence statistics from a binary presence/absence matrix by
chaining a set of helper functions. All options are optional *positional* arguments.

The helpers called here are not defined in this notebook yet, so the function cannot
run until they are provided.

# Arguments
- `mat::Matrix{Bool}`: A binary species by site (or site by species) matrix.
- `row_names::Vector{String}`: Names of species corresponding to rows in the matrix.
- `type::String`: Orientation of `mat` ("spp_site" or "site_spp").
- `thresh::Bool`: Apply a threshold to exclude low co-occurrence pairs.
- `spp_names::Bool`: Include species names in the output.
- `true_rand_classifier::Float64`: Classifier for identifying truly random pairs.
- `prob::String`: Probability calculation method ("hyper" or "comb").
  NOTE(review): not forwarded to any helper in this sketch.
- `site_mask::Union{Nothing, Matrix{Int}}`: Binary site mask, or `nothing`.
- `only_effects::Bool`: Replace the result with effect sizes if true.
- `eff_standard::Bool`: Use standardized effect sizes.
- `eff_matrix::Bool`: Output effect sizes as a matrix.

# Returns
Whatever `create_output_dict` returns, or the result of `calculate_effect_sizes` when
`only_effects` is `true`.
"""
function calculate_cooccurrence_stats(
    mat::Matrix{Bool},
    row_names::Vector{String},
    type::String = "spp_site",
    thresh::Bool = true,
    spp_names::Bool = false,
    true_rand_classifier::Float64 = 0.1,
    prob::String = "hyper",
    site_mask::Union{Nothing, Matrix{Int}} = nothing,
    only_effects::Bool = false,
    eff_standard::Bool = true,
    eff_matrix::Bool = false
)
    # Orient the input; presumably species × sites afterwards — TODO confirm once
    # `get_species_site_matrix` exists.
    spp_site_mat = get_species_site_matrix(mat, type)
    nspp = size(spp_site_mat, 1)
    spp_pairs = binomial(nspp, 2)

    spp_key = spp_names ? DataFrame(num=1:nspp, spp=row_names) : nothing
    site_mask, N_matrix = create_site_mask_and_N_matrix(site_mask, spp_site_mat)

    # Work on the oriented matrix so its dimensions agree with `site_mask` and `N_matrix`
    incidence = calculate_incidence_matrix(spp_site_mat, site_mask)
    prob_occur = calculate_probability_matrix(incidence, N_matrix)

    obs_cooccur, prob_cooccur, exp_cooccur =
        calculate_cooccurrence_data(spp_site_mat, site_mask, spp_pairs, prob_occur)

    # Defined up front so the call below also works when no threshold is applied
    n_omitted = 0
    if thresh
        obs_cooccur, prob_cooccur, exp_cooccur, n_omitted =
            apply_threshold(obs_cooccur, prob_cooccur, exp_cooccur)
    end

    output = create_output_dataframe(obs_cooccur, prob_cooccur, exp_cooccur, spp_names, spp_key)

    # NOTE(review): `true_rand` is computed but not passed on to `create_output_dict`
    true_rand = count_true_random_pairs(output, nspp, spp_pairs, true_rand_classifier)

    output_dict = create_output_dict(
        output, spp_names, row_names, thresh, n_omitted, spp_site_mat, nspp,
        true_rand_classifier,
    )

    if only_effects
        output_dict = calculate_effect_sizes(output_dict, eff_standard, eff_matrix)
    end

    return output_dict
end

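# TODO: the helpers called by `calculate_cooccurrence_stats` still have to be defined:
# get_species_site_matrix, create_site_mask_and_N_matrix, calculate_incidence_matrix,
# calculate_probability_matrix, calculate_cooccurrence_data, apply_threshold,
# create_output_dataframe, count_true_random_pairs, create_output_dict and
# calculate_effect_sizes.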

# Example of usage:
# result = calculate_cooccurrence_stats(mat, row_names)