# Motif and Transcription Factor enrichment

The following code was used in the original paper to perform motif enrichment analysis on chromatin accessibility data. For more information, visit https://github.com/cantinilab/mowgli_reproducibility.

In addition, we provide example code to perform TF enrichment from gene expression using a TF-target database.

## Motif enrichment

In [None]:
# Python
# Number of top-weighted peaks to keep per dimension (passed to top_mowgli).
n_peaks = 100

In [None]:
# Python
# Get the peak dictionaries
# H_OT is read from the unstructured annotations (.uns) of the "atac" modality.
# NOTE(review): the code below indexes it as [peak, dimension] -- confirm that
# this is the orientation stored in .uns["H_OT"].
H_mowgli = mdata["atac"].uns["H_OT"]

In [None]:
# Python
def top_mowgli(dim, n):
    """
    Return the row indices of the n largest scaled weights in column `dim`.

    Each row of the global `H_mowgli` is first divided by its own sum, so
    the ranking uses a row's share of weight in `dim` rather than its raw
    weight. Indices are returned in decreasing order of scaled weight.
    """
    row_totals = H_mowgli.sum(axis=1, keepdims=True)
    scaled_column = (H_mowgli / row_totals)[:, dim]
    descending = scaled_column.argsort()[::-1]
    return descending[:n]

In [None]:
# Python
# Strip the "atac:" prefix from the peak names, then start the output table
# from a copy of the peak annotations.
atac = mdata["atac"]
atac.var_names = atac.var_names.str.replace("atac:", "")
top_in_mowgli = atac.var.copy()

# Add one boolean column per Mowgli dimension: True for the peaks returned
# by top_mowgli for that dimension, False for all the others.
for dim in range(H_mowgli.shape[1]):
    selected = top_in_mowgli.index[top_mowgli(dim, n_peaks)]
    top_in_mowgli[f"top_in_dim_{dim}"] = top_in_mowgli.index.isin(selected)

# Write the table (peak names as the index column) to the working directory.
top_in_mowgli.to_csv("top_in_mowgli.csv")

In [None]:
# R
# Imports.
library(GenomicRanges)
library(motifmatchr)
library(chromVAR)
library(TFBSTools)
library(JASPAR2022)
library(Signac)
library(BSgenome.Hsapiens.UCSC.hg38)
library(chromVARmotifs)
library(MuData)

In [None]:
# R
# Read atac file.
# NOTE(review): this is an absolute path on the original author's machine.
# The Python cell above writes "top_in_mowgli.csv" to its working directory,
# so point `in_atac` at that file when re-running the analysis.
in_atac <- "/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/top_in_mowgli.csv" # nolint
# `row.names = 2` takes the row names from the second column of the CSV.
# NOTE(review): presumably that column holds the peak names -- confirm against
# the CSV header.
peaks_csv <- read.csv(in_atac, row.names = 2)

In [None]:
# R
# R
# Remove exotic chromosomes.
# The previous version spelled the assignment as `< -`, which R parses as a
# "less than minus" comparison, so `peaks_csv` was never actually filtered.
# The contigs are now listed once and dropped with a single real assignment.
# NOTE(review): these look like unplaced/unlocalized scaffold accessions;
# extend the vector if other non-standard contigs appear in the peak file.
exotic_chromosomes <- c(
    "GL000194.1",
    "GL000205.2",
    "GL000219.1",
    "KI270721.1",
    "KI270726.1",
    "KI270713.1"
)

# `%in%` never returns NA, so rows are kept or dropped deterministically.
# `drop = FALSE` keeps a data frame even if a single column remains.
keep <- !(peaks_csv[["Chromosome"]] %in% exotic_chromosomes)
peaks_csv <- peaks_csv[keep, , drop = FALSE]

In [None]:
# R
# Convert the peaks to GRanges.
chromosomes <- peaks_csv["Chromosome"][, 1]
ranges <- IRanges::IRanges(
    start = peaks_csv["Start"][, 1],
    end = peaks_csv["End"][, 1]
)
peaks <- GenomicRanges::GRanges(seqnames = chromosomes, ranges = ranges)

In [None]:
# R
# Get JASPAR motifs.
opts <- list()
opts["species"] <- "Homo sapiens"
opts["collection"] <- "CORE"
motifs <- TFBSTools::getMatrixSet(JASPAR2022::JASPAR2022, opts)
motifs_pwm <- TFBSTools::toPWM(motifs)

# Get cisBP motifs.
data("human_pwms_v2")

# Fuse JASPAR and cisBP motifs.
for (name in names(motifs_pwm)) {
    human_pwms_v2[name] <- motifs_pwm[name]
}

In [None]:
# R
# Create a Signac object from the peaks.
# Actually giving peaks_csv is nonsense.
# But we only care about the rownames so it's fine.
assay <- Signac::CreateChromatinAssay(
    peaks_csv,
    ranges = peaks,
    sep = c(":", "-")
)

# Create statistics about peaks.
assay <- Signac::RegionStats(
    object = assay,
    genome = BSgenome.Hsapiens.UCSC.hg38
)

# Add the downloaded motif PWM annotation.
assay <- Signac::AddMotifs(
    object = assay,
    genome = BSgenome.Hsapiens.UCSC.hg38,
    pfm = human_pwms_v2
)

In [None]:
# R
# Define where to save the motif enrichment outputs.
out_motif <- "motifs_"

# Get all top peaks.
background <- c()
for (dim in 0:49) {

    # Get the top peaks for that dimension.
    features <- rownames(assay)[peaks_csv[paste0("top_in_dim_", dim)] == "True"]

    background <- c(background, features)
}

# Iterate over Mowgli's dimensions.
for (dim in 0:49) {

    # Get the top peaks for that dimension.
    features <- rownames(assay)[peaks_csv[paste0("top_in_dim_", dim)] == "True"]

    # Do motif enrichment analysis.
    enriched_motifs <- Signac::FindMotifs(
        object = assay,
        features = features,
        background = background
    )

    # Save the enrichment.
    write.csv(enriched_motifs, paste0(out_motif, dim, ".csv"))
}