In [1]:
import os
# Later cells open inputs such as "Lamp5_Gaba.CGN.bed" and "bed_names.txt" by
# relative path, so the notebook must run from the directory that holds them.
# The METHYL_PRE_DIR environment variable overrides the default location so the
# notebook can run on another machine without editing this cell.
work_dir = os.environ.get("METHYL_PRE_DIR", "/home/alsun/get/1_methyl_pre_070725/")
os.chdir(work_dir)
print(os.getcwd())

/home/alsun/get/1_methyl_pre_070725


In [2]:
import os
from pathlib import Path

from gcell._settings import get_setting
from preprocess_utils import (
    add_atpm,
    add_exp,
    create_peak_motif,
    download_motif,
    get_motif,
    query_motif,
)

# Resolve the directory that gcell is configured to use for annotation files;
# the motif BED and its index are looked up (and downloaded) there.
annotation_dir = Path(
    get_setting("annotation_dir")
)
print(
    "gcell currently using annotation directory:",
    annotation_dir,
)

gcell currently using annotation directory: /home/alsun/.gcell_data/annotations


In [7]:
# Remote locations of the archetype motif BED (.bed.gz) and its index (.tbi).
# NOTE(review): these are hg38 motif calls, while the peak coordinates printed
# by later cells (e.g. chrX:170860316) look like a mouse assembly. Confirm that
# the genome build of the motif file matches the peak files.
motif_bed_url = "https://resources.altius.org/~jvierstra/projects/motif-clustering/releases/v1.0/hg38.archetype_motifs.v1.0.bed.gz"
motif_bed_index_url = "https://resources.altius.org/~jvierstra/projects/motif-clustering/releases/v1.0/hg38.archetype_motifs.v1.0.bed.gz.tbi"

# Expected local copies inside the gcell annotation directory.
motif_bed_path = annotation_dir / "hg38.archetype_motifs.v1.0.bed.gz"
motif_bed_index_path = annotation_dir / "hg38.archetype_motifs.v1.0.bed.gz.tbi"

# Download unless BOTH files are already present. The previous check skipped
# the download as soon as either file existed, so a missing index (or a missing
# BED next to a stale index) was never fetched.
if (
    motif_bed_url
    and motif_bed_index_url
    and not (motif_bed_path.exists() and motif_bed_index_path.exists())
):
    download_motif(motif_bed_url, motif_bed_index_url, motif_dir=annotation_dir)

# The path is the same whether or not a download was needed.
motif_bed = str(motif_bed_path)

In [8]:
# All cell types share the same peak set, so motifs only need to be queried
# once, using the bed file of a single cell type.
peak_bed = "Lamp5_Gaba.CGN.bed"
# Look up motif hits for the peaks in the motif BED selected above.
peaks_motif = query_motif(peak_bed, motif_bed)
# get_motif reports saving its result to get_motif.bed and returns that path.
get_motif_output = get_motif(peak_bed, peaks_motif)

Peak motif extraction completed. Results saved in get_motif.bed


In [5]:
# Display the path returned by get_motif (input to create_peak_motif below).
get_motif_output

'get_motif.bed'

In [9]:
import zarr
import pandas as pd
import numpy as np
def _peak_names(frame):
    """Return a 'Chromosome:Start-End' label for every row of ``frame``."""
    return (
        frame["Chromosome"].astype(str)
        + ":"
        + frame["Start"].astype(str)
        + "-"
        + frame["End"].astype(str)
    )


def create_peak_motif(peak_motif_bed, output_zarr, peak_bed, reference_length=400):
    """
    Create a peak-by-motif zarr store from a peak motif bed file.

    The peak motif bed file is pivoted into one row per peak and one column per
    motif cluster, left-joined onto the peaks listed in ``peak_bed`` (chrM and
    chrY excluded), and every motif score is divided by
    ``peak length / reference_length``. The result is written to a zarr store
    containing three datasets:

    * 'data': dense float32 matrix of shape (n_peaks, n_motifs).
    * 'peak_names': peak labels formatted as 'Chromosome:Start-End'.
    * 'motif_names': motif cluster names, in the column order of 'data'.

    The list of motif clusters is read from the motif-clustering v1.0
    ``motif_annotations.xlsx`` spreadsheet over the network, so this function
    needs internet access and an Excel reader available to pandas.

    NOTE: in this notebook this definition shadows the ``create_peak_motif``
    imported from ``preprocess_utils``.

    Args:
        peak_motif_bed (str): Path to the peak motif bed file (tab separated,
            no header; columns Chromosome, Start, End, Motif_cluster, Score).
        output_zarr (str): Path to the output zarr store; it is opened with
            mode "w", so an existing store at that path is replaced.
        peak_bed (str): Path to the original peak bed file (tab separated, no
            header; columns Chromosome, Start, End, Score).
        reference_length (float): Length, in base pairs, that motif scores are
            rescaled to. Each score is divided by
            ``(End - Start) / reference_length``. Defaults to 400.

    Raises:
        ValueError: If ``reference_length`` is not positive, or if any peak in
            ``peak_bed`` has ``End <= Start`` (which would otherwise write
            inf/NaN values into the store).
    """
    from numcodecs import Blosc

    if reference_length <= 0:
        raise ValueError(f"reference_length must be positive, got {reference_length}")

    motif_annotations = pd.read_excel(
        "https://resources.altius.org/~jvierstra/projects/motif-clustering/releases/v1.0/motif_annotations.xlsx"
    )
    motif_cluster_ids = motif_annotations.Name.unique()

    # Read the peak motif bed file
    peak_motif = pd.read_csv(
        peak_motif_bed,
        sep="\t",
        header=None,
        names=["Chromosome", "Start", "End", "Motif_cluster", "Score"],
    )

    # One row per peak, one column per motif cluster; absent pairs become 0.
    peak_motif_pivoted = peak_motif.pivot_table(
        index=["Chromosome", "Start", "End"],
        columns="Motif_cluster",
        values="Score",
        fill_value=0,
    ).reset_index()

    # Add a column for every annotated motif cluster that never occurs in the
    # bed file, so the output always has the full, fixed set of motif columns.
    # NOTE(review): these columns are filled with 1, whereas motifs missing
    # from an individual peak are filled with 0 by the pivot above. Confirm
    # that 1 is intended rather than 0.
    for motif_cluster_id in motif_cluster_ids:
        if motif_cluster_id not in peak_motif_pivoted.columns:
            peak_motif_pivoted[motif_cluster_id] = 1

    # Label each peak and keep only the annotated motif columns, in annotation
    # order, plus the label used as the join key.
    peak_motif_pivoted["Name"] = _peak_names(peak_motif_pivoted)
    peak_motif_pivoted = peak_motif_pivoted[list(motif_cluster_ids) + ["Name"]]

    # Read the original peak bed file
    original_peaks = pd.read_csv(
        peak_bed, sep="\t", header=None, names=["Chromosome", "Start", "End", "Score"]
    )

    # Exclude chrM and chrY. The explicit copy makes the column assignment
    # below act on an independent frame (no SettingWithCopyWarning).
    original_peaks = original_peaks[
        ~original_peaks.Chromosome.isin(["chrM", "chrY"])
    ].copy()
    original_peaks["Name"] = _peak_names(original_peaks)

    # Left join keeps every original peak, in its original order; peaks with
    # no motif hit get NaN in the motif columns.
    merged_data = pd.merge(original_peaks, peak_motif_pivoted, on="Name", how="left")

    motif_columns = [
        col
        for col in merged_data.columns
        if col not in ["Chromosome", "Start", "End", "Score", "Name"]
    ]

    # Peak length expressed in units of `reference_length` base pairs.
    peak_length = (merged_data.End - merged_data.Start).values / reference_length
    if (peak_length <= 0).any():
        raise ValueError(
            f"{int((peak_length <= 0).sum())} peak(s) in {peak_bed} have End <= Start; "
            "cannot normalise motif scores by peak length"
        )

    # Peaks without motif hits score 0; all scores are then length-normalised.
    motif_data_matrix = (
        merged_data[motif_columns]
        .fillna(0)
        .div(peak_length, axis=0)
        .to_numpy(dtype=np.float32)
    )

    # Prepare labels for zarr storage
    name_values = list(merged_data["Name"].values)
    motif_values = motif_columns

    # Open zarr store and save data (dense matrix, chunked by 1000 peaks)
    z = zarr.open(output_zarr, mode="w")
    z.create_dataset(
        "data",
        data=motif_data_matrix,
        chunks=(1000, motif_data_matrix.shape[1]),
        dtype=np.float32,
        compressor=Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE),
        shape=motif_data_matrix.shape,
    )
    z.create_dataset("peak_names", data=name_values)
    z.create_dataset("motif_names", data=motif_values)

    print(f"Peak motif data saved to {output_zarr}")

# Every cell type uses the same peak set, so a single zarr store is created
# here and the per-cell-type tracks are added to it in later cells.
create_peak_motif(
    peak_motif_bed=get_motif_output,
    output_zarr="methyl_DMR.zarr",
    peak_bed=peak_bed,
)

Peak motif data saved to methyl_DMR.zarr


In [4]:
# Read the cell-type names (one per line) into a list.
with open("bed_names.txt", "r") as names_file:
    # Skip blank lines: an empty name (e.g. from a trailing newline) would
    # later be turned into the non-existent input path ".CGN.bed".
    celltype_for_modeling = [
        name for name in (line.strip() for line in names_file) if name
    ]

print(celltype_for_modeling)
len(celltype_for_modeling)

['Astro-NT_NN', 'Astro-TE_NN', 'CA1-ProS_Glut', 'CA3_Glut', 'CB_Granule_Glut', 'CBX_Golgi_Gly-Gaba', 'CBX_Purkinje_Gaba', 'COAa-PAA-MEA_Barhl2_Glut', 'DG_Glut', 'IC_Tfap2d_Maf_Glut', 'IT_AON-TT-DP_Glut', 'IT_EP-CLA_Glut', 'L2_3_IT_CTX_Glut', 'L2_3_IT_ENT_Glut', 'L2_3_IT_PPP_Glut', 'L2_3_IT_RSP_Glut', 'L2_IT_ENT-po_Glut', 'L5_6_IT_TPE-ENT_Glut', 'L5_ET_CTX_Glut', 'L5_IT_CTX_Glut', 'L5_NP_CTX_Glut', 'L6b_CT_ENT_Glut', 'L6b_CTX_Glut', 'L6_CT_CTX_Glut', 'L6_IT_CTX_Glut', 'LA-BLA-BMA-PA_Glut', 'Lamp5_Gaba', 'LSX_Prdm12_Zeb2_Gaba', 'Microglia_NN', 'MY_Lhx1_Gly-Gaba', 'NDB-SI-MA-STRv_Lhx8_Gaba', 'Oligo_NN', 'OPC_NN', 'PB_Lmx1a_Glut', 'PG-TRN-LRN_Fat2_Glut', 'PRT_Tcf7l2_Gaba', 'Pvalb_Gaba', 'RT-ZI_Gnb3_Gaba', 'SCig_Foxb1_Glut', 'SCs_Dmbx1_Gaba', 'SI-MPO-LPO_Lhx8_Gaba', 'Sncg_Gaba', 'SNc-VTA-RAmb_Foxa1_Dopa', 'SPA-SPFm-SPFp-POL-PIL-PoT_Sp9_Glut', 'Sst_Gaba', 'STR_D1_Gaba', 'STR_D1_Sema5a_Gaba', 'STR_D2_Gaba', 'STR-PAL_Chst9_Gaba', 'SUB-ProS_Glut', 'TH_Prkcd_Grin2c_Glut', 'Vip_Gaba', 'VLMC_NN', 

54

In [11]:
# Run add_atpm once per cell type against the shared zarr store, using that
# cell type's "<name>.CGN.bed" file.
# presumably add_atpm stores the bed file's aTPM column under the cell-type
# name; verify against preprocess_utils.
for cell_type in celltype_for_modeling:
    add_atpm("methyl_DMR.zarr", f"{cell_type}.CGN.bed", cell_type)

        Chromosome      Start        End      aTPM                      Name
0             chr1    3003639    3003641  0.115845      chr1:3003639-3003641
1             chr1    3005997    3005999  0.572754      chr1:3005997-3005999
2             chr1    3007169    3007171  0.882568      chr1:3007169-3007171
3             chr1    3007429    3007683  0.720459      chr1:3007429-3007683
4             chr1    3012839    3012841  0.702636      chr1:3012839-3012841
...            ...        ...        ...       ...                       ...
1781943       chrX  170844078  170844080  0.312500  chrX:170844078-170844080
1781944       chrX  170844465  170844467  0.163819  chrX:170844465-170844467
1781945       chrX  170849226  170849228  0.000000  chrX:170849226-170849228
1781946       chrX  170854561  170854563  0.000000  chrX:170854561-170854563
1781947       chrX  170860316  170860318  0.000000  chrX:170860316-170860318

[1781948 rows x 5 columns]
        Chromosome      Start        End      aT

In [14]:
# Add a single "Lamp5" track to the separate methyl_DMR_peak.zarr store.
# NOTE(review): this rebinds celltype_for_modeling, replacing the list loaded
# from bed_names.txt; re-running the all-cell-types loop after this cell would
# only process "Lamp5".
# NOTE(review): methyl_DMR_peak.zarr is not created by any cell visible in this
# notebook; confirm where that store comes from.
celltype_for_modeling = ["Lamp5"]
for cell_type in celltype_for_modeling:
    # The bed path is fixed; only the track name comes from the loop variable.
    add_atpm("methyl_DMR_peak.zarr", "Lamp5_Gaba.CGN.bed", cell_type)

       Chromosome      Start        End      aTPM                      Name
0            chr1    3003639    3003641  0.375000      chr1:3003639-3003641
1            chr1    3003639    3003641  0.375000      chr1:3003639-3003641
2            chr1    3027143    3027145  0.497559      chr1:3027143-3027145
3            chr1    3032861    3032863  0.455566      chr1:3032861-3032863
4            chr1    3034470    3034472  0.407349      chr1:3034470-3034472
...           ...        ...        ...       ...                       ...
634499       chrX  170849226  170849228  0.500000  chrX:170849226-170849228
634500       chrX  170854561  170854563  0.000000  chrX:170854561-170854563
634501       chrX  170854561  170854563  0.000000  chrX:170854561-170854563
634502       chrX  170860316  170860318  0.000000  chrX:170860316-170860318
634503       chrX  170860316  170860318  0.000000  chrX:170860316-170860318

[634504 rows x 5 columns]
