In [6]:
 # Switch the kernel's working directory to the project root; the data paths used
 # by later cells ("./10X_PBMC/...") are relative and depend on this.
 # NOTE(review): this is an absolute, machine-specific path - adjust when running elsewhere.
 %cd /sci/labs/yotamd/lab_share/avishai.wizel/eRNA/

/sci/labs/yotamd/lab_share/avishai.wizel/eRNA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [7]:
import os
import re
import warnings

import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import issparse, csr_matrix


In [8]:
# Load the ATAC and RNA AnnData objects from the counts directory.
# Paths are relative to the current working directory.
# NOTE(review): the captured output of this cell shows a duplicate-"var"-names
# warning; consider whether var names should be made unique before name-based
# filtering - TODO confirm which of the two objects triggers it.
adata_atac = ad.read_h5ad("./10X_PBMC/02_counts_data/adata_atac.h5ad")
adata_rna =  ad.read_h5ad("./10X_PBMC/02_counts_data/adata_rna.h5ad")

  utils.warn_names_duplicates("var")


In [9]:
def preprocess_atac_anndata(
    adata_atac: ad.AnnData,
    genome_assembly: str = "hg38", # "hg38" or "mm10"
    min_cells_per_peak: int = 5,
    max_cells_percentage_per_peak: float = 0.10
) -> ad.AnnData:
    """
    Binarize a scATAC-seq AnnData object and filter its peaks.

    The input is expected to have cells as rows (obs) and peaks as columns (var),
    with peak names of the form 'chrom:start-end'; only the text before the first
    ':' is used as the chromosome name.

    Steps (progress is printed for each one):
        1. Binarize the matrix: every non-zero entry becomes 1.
        2. Drop peaks located on sex chromosomes ('chrX', 'chrY').
        3. Merging of overlapping peaks is NOT performed; peaks are assumed to be
           non-overlapping already (or merged externally).
        4. Drop peaks that are open in fewer than 'min_cells_per_peak' cells.
        5. Drop peaks that are open in more than
           'max_cells_percentage_per_peak' (a fraction) of all cells.

    The caller's object is not modified: all work happens on a copy.

    Args:
        adata_atac (anndata.AnnData):
            The AnnData object for ATAC-seq data (cells x peaks).
        genome_assembly (str):
            Genome reference name ("hg38" or "mm10", case-insensitive) for sex
            chromosome identification. Any other value skips step 2 with a warning.
        min_cells_per_peak (int):
            Minimum number of cells a peak must appear in to be retained.
        max_cells_percentage_per_peak (float):
            Maximum FRACTION of cells (0-1, e.g. 0.10 for 10%) a peak can appear in
            to be retained. The cell-count threshold is the fraction times the
            number of cells, rounded down.

    Returns:
        anndata.AnnData: A new AnnData object containing the filtered and binarized
                         matrix and metadata. The matrix (adata.X) remains in
                         'cells x peaks' format.

    Raises:
        TypeError: If 'adata_atac' is not an AnnData object.
        ValueError: If 'max_cells_percentage_per_peak' is not within [0, 1].
    """

    if not isinstance(adata_atac, ad.AnnData):
        raise TypeError("Input 'adata_atac' must be an anndata.AnnData object.")

    # A value such as 10 (meant as "10%") would silently disable the upper filter,
    # so reject anything that is not a fraction.
    if not 0 <= max_cells_percentage_per_peak <= 1:
        raise ValueError(
            "'max_cells_percentage_per_peak' must be a fraction between 0 and 1 "
            f"(e.g. 0.10 for 10%), got {max_cells_percentage_per_peak!r}."
        )

    # Work on a private copy so that binarization never leaks into the caller's object.
    adata_atac = adata_atac.copy()

    # Ensure the matrix is sparse, as expected for ATAC data.
    if not issparse(adata_atac.X):
        warnings.warn("adata_atac.X is not sparse. Converting to CSR sparse matrix.")
        adata_atac.X = csr_matrix(adata_atac.X)

    print(f"Initial AnnData shape (cells x peaks): {adata_atac.shape}")
    print(f"Initial number of cells: {adata_atac.shape[0]}")
    print(f"Initial number of peaks: {adata_atac.shape[1]}\n")

    n_cells_total, n_peaks_total = adata_atac.shape

    # --- Filtering Steps ---

    ## 1. Binarize data: Replace all non-zero values with 1
    print("1. Binarizing data: Replacing all non-zero values with 1...")
    # sign() maps positives to 1 and negatives to -1; abs() folds the (unexpected)
    # negative case onto 1 as well, so the result is strictly 0/1.
    binary_X = abs(adata_atac.X.sign())
    binary_X.eliminate_zeros() # Remove explicitly stored zeros if any
    adata_atac.X = binary_X
    print(f"   Matrix non-zero elements after binarization: {binary_X.nnz}\n")

    # Number of cells in which each peak is open. Cells are never removed in this
    # function, so these counts stay valid for every peak filter below.
    # np.asarray(...).ravel() works for both sparse matrices and sparse arrays.
    n_cells_per_peak = np.asarray(binary_X.sum(axis=0)).ravel()

    ## 2. Remove peaks on sex chromosomes
    print("2. Removing peaks on sex chromosomes...")

    sex_chroms_map = {'hg38': ['chrX', 'chrY'], 'mm10': ['chrX', 'chrY']}
    assembly_key = genome_assembly.lower()

    if assembly_key not in sex_chroms_map:
        warnings.warn(f"Unsupported genome assembly '{genome_assembly}'. Skipping sex chromosome filtering.")
        is_sex_chrom = np.zeros(n_peaks_total, dtype=bool)
    else:
        sex_chroms = set(sex_chroms_map[assembly_key])
        # Extract chromosome from peak name (e.g., 'chr1:100-200' -> 'chr1')
        is_sex_chrom = np.fromiter(
            (str(peak).split(':', 1)[0] in sex_chroms for peak in adata_atac.var_names),
            dtype=bool,
            count=n_peaks_total,
        )

    keep_peak = ~is_sex_chrom
    print(f"   Peaks remaining after sex chromosome removal: {int(keep_peak.sum())}\n")

    ## 3. Merge overlapping peaks (not implemented)
    # Merging requires genomic interval handling (e.g. pyranges / pybedtools);
    # peaks are assumed to be non-overlapping or merged externally.
    print("3. Merging overlapping peaks... (This step requires dedicated genomic tools or advanced logic and is not fully implemented in this minimal example.)")
    print("   Assuming peaks are already non-overlapping or this step is handled externally for now.\n")

    ## 4. Remove peaks occurring in fewer than 'min_cells_per_peak' cells
    print(f"4. Removing peaks occurring in fewer than {min_cells_per_peak} cells...")
    keep_peak &= n_cells_per_peak >= min_cells_per_peak
    print(f"   Peaks remaining after minimum cell count filter: {int(keep_peak.sum())}\n")

    ## 5. Remove peaks occurring in more than 'max_cells_percentage_per_peak' of cells
    print(f"5. Removing peaks occurring in more than {max_cells_percentage_per_peak*100:.2f}% of cells...")
    # Maximum allowed cell count (rounded down).
    max_cells_count_threshold = int(n_cells_total * max_cells_percentage_per_peak)
    keep_peak &= n_cells_per_peak <= max_cells_count_threshold

    # Apply all peak filters at once: a single subset/copy instead of one per step.
    adata_atac = adata_atac[:, keep_peak].copy()
    print(f"   Peaks remaining after maximum cell percentage filter: {adata_atac.shape[1]}\n")

    print(f"Final AnnData shape (cells x peaks): {adata_atac.shape}")
    print(f"Final number of cells: {adata_atac.shape[0]}")
    print(f"Final number of peaks: {adata_atac.shape[1]}\n")

    return adata_atac


In [10]:
# Filter and binarize the ATAC matrix. The minimum-cell cutoff used here (100)
# is stricter than the function's default of 5.
filtered_atac_adata = preprocess_atac_anndata(
    adata_atac=adata_atac,
    genome_assembly="hg38",  # must match the reference genome of the data
    min_cells_per_peak=100,
    max_cells_percentage_per_peak=0.10,
)


Initial AnnData shape (cells x peaks): (11909, 108377)
Initial number of cells: 11909
Initial number of peaks: 108377

1. Binarizing data: Replacing all non-zero values with 1...
   Matrix non-zero elements after binarization: 85596796

2. Removing peaks on sex chromosomes...
   Peaks remaining after sex chromosome removal: 105502

3. Merging overlapping peaks... (This step requires dedicated genomic tools or advanced logic and is not fully implemented in this minimal example.)
   Assuming peaks are already non-overlapping or this step is handled externally for now.

4. Removing peaks occurring in fewer than 100 cells...
   Peaks remaining after minimum cell count filter: 84531

5. Removing peaks occurring in more than 10.00% of cells...
   Peaks remaining after maximum cell percentage filter: 67207

Final AnnData shape (cells x peaks): (11909, 67207)
Final number of cells: 11909
Final number of peaks: 67207



In [11]:
def filter_rna_anndata(
    adata_rna: ad.AnnData,
    genome_assembly: str = "hg38", # "hg38" or "mm10"
    min_genes_per_cell: int = 200
) -> ad.AnnData:
    """
    Filter a scRNA-seq AnnData object (cells x genes):
    1. Removes genes encoded on sex chromosomes.
    2. Removes cells expressing fewer than 'min_genes_per_cell' genes.

    Sex-chromosome genes are identified as follows:
      * If adata.var has a 'chromosome' column, genes whose value is 'chrX' or
        'chrY' are removed.
      * Otherwise a best-effort fallback is used: genes whose NAME contains
        'chrX'/'chrY', plus a short built-in list of well-known sex-linked genes.
        Sex-chromosome genes outside that list are kept, so this fallback is
        incomplete; a warning is always emitted when it is used.

    The caller's object is not modified; a new AnnData is returned. The number of
    expressed genes per cell (counted after step 1) is stored in
    obs['n_genes_after_sex_chrom_filter'] of the returned object.

    Args:
        adata_rna (anndata.AnnData):
            The AnnData object for RNA-seq data (cells x genes).
        genome_assembly (str):
            Genome reference name ("hg38" or "mm10", case-insensitive). Any other
            value skips the sex chromosome filter with a warning.
        min_genes_per_cell (int):
            Minimum number of genes a cell must express (value > 0) to be retained.

    Returns:
        anndata.AnnData: A new AnnData object containing the filtered matrix and
                         metadata. The matrix (adata.X) remains in 'cells x genes'
                         format and is sparse.

    Raises:
        TypeError: If 'adata_rna' is not an AnnData object.
    """

    if not isinstance(adata_rna, ad.AnnData):
        raise TypeError("Input 'adata_rna' must be an anndata.AnnData object.")

    print(f"Initial AnnData shape (cells x genes): {adata_rna.shape}")
    print(f"Initial number of cells: {adata_rna.shape[0]}")
    print(f"Initial number of genes: {adata_rna.shape[1]}\n")

    # --- 1. Remove genes encoded on sex chromosomes ---
    print("1. Removing genes on sex chromosomes...")

    sex_chroms_map = {'hg38': ['chrX', 'chrY'], 'mm10': ['chrX', 'chrY']}
    # Well-known sex-linked genes used only by the name-based fallback.
    known_sex_genes_map = {
        'hg38': ['XIST', 'TSIX', 'SRY', 'RPS4Y1', 'DDX3Y', 'KDM5C', 'EIF1AY'],
        'mm10': ['Xist', 'Tsix', 'Sry', 'Ddx3y', 'Kdm5c', 'Eif2s3y'],
    }
    assembly_key = genome_assembly.lower()
    n_genes_total = adata_rna.shape[1]

    if assembly_key not in sex_chroms_map:
        warnings.warn(f"Unsupported genome assembly '{genome_assembly}'. Skipping sex chromosome filtering.")
        is_sex_chrom_gene = np.zeros(n_genes_total, dtype=bool)
    elif 'chromosome' in adata_rna.var.columns:
        # Preferred path: an explicit per-gene chromosome annotation.
        # Converted to a plain boolean array so every branch yields the same mask type.
        is_sex_chrom_gene = np.asarray(
            adata_rna.var['chromosome'].isin(sex_chroms_map[assembly_key]), dtype=bool
        )
        print("   Using 'chromosome' column from adata.var for sex chromosome filtering.")
    else:
        # Fallback: guess from gene names. Plain gene symbols (e.g. 'XIST') carry no
        # chromosome information, so this only catches names containing 'chrX'/'chrY'
        # and the short built-in list above.
        current_sex_chroms = sex_chroms_map[assembly_key]
        name_mentions_sex_chrom = np.fromiter(
            (
                any(chrom in str(gene_name) for chrom in current_sex_chroms)
                for gene_name in adata_rna.var_names
            ),
            dtype=bool,  # explicit dtype keeps the mask boolean even with zero genes
            count=n_genes_total,
        )
        in_known_list = np.asarray(
            adata_rna.var_names.isin(known_sex_genes_map.get(assembly_key, [])), dtype=bool
        )
        is_sex_chrom_gene = name_mentions_sex_chrom | in_known_list

        n_matched = int(is_sex_chrom_gene.sum())
        if n_matched == 0:
            warnings.warn("   No 'chromosome' column found in adata.var, and no sex chromosome genes identified by name. Sex chromosome filtering might be ineffective.")
        else:
            # Make the limitation visible instead of silently reporting a
            # "sex chromosome removal" that only dropped a few marker genes.
            warnings.warn(
                f"   No 'chromosome' column found in adata.var; only {n_matched} gene(s) were "
                "identified by name (built-in list / 'chrX'/'chrY' in the gene name). "
                "Other sex chromosome genes are kept. Add a 'chromosome' column to "
                "adata.var for complete filtering."
            )

    # Subsetting + copy yields a new object, so the caller's AnnData stays untouched.
    adata_rna = adata_rna[:, ~is_sex_chrom_gene].copy() # Apply filter to var (columns)

    # Ensure the matrix is sparse (done on the copy, not on the caller's object).
    if not issparse(adata_rna.X):
        warnings.warn("adata_rna.X is not sparse. Converting to CSR sparse matrix.")
        adata_rna.X = csr_matrix(adata_rna.X)

    print(f"   Genes remaining after sex chromosome removal: {adata_rna.shape[1]}\n")

    # --- 2. Remove cells expressing fewer than 'min_genes_per_cell' genes ---
    print(f"2. Removing cells expressing fewer than {min_genes_per_cell} genes...")

    # Count genes with a positive value per cell on the *current* (sex-filtered) matrix.
    # np.asarray(...).ravel() works for both sparse matrices and sparse arrays.
    n_genes_per_cell = np.asarray((adata_rna.X > 0).sum(axis=1)).ravel()
    adata_rna.obs['n_genes_after_sex_chrom_filter'] = n_genes_per_cell

    cell_filter_mask = n_genes_per_cell >= min_genes_per_cell

    initial_cells = adata_rna.shape[0]
    adata_rna = adata_rna[cell_filter_mask, :].copy()
    print(f"   Cells remaining after minimum gene expression filter: {adata_rna.shape[0]} (removed {initial_cells - adata_rna.shape[0]} cells)\n")

    print(f"Final AnnData shape (cells x genes): {adata_rna.shape}")
    print(f"Final number of cells: {adata_rna.shape[0]}")
    print(f"Final number of genes: {adata_rna.shape[1]}\n")

    return adata_rna


In [12]:
# Drop sex-chromosome genes and low-complexity cells from the RNA object.
filtered_rna_adata = filter_rna_anndata(
    adata_rna=adata_rna, genome_assembly="hg38", min_genes_per_cell=200
)

Initial AnnData shape (cells x genes): (11909, 36601)
Initial number of cells: 11909
Initial number of genes: 36601

1. Removing genes on sex chromosomes...
   Genes remaining after sex chromosome removal: 36594

2. Removing cells expressing fewer than 200 genes...
   Cells remaining after minimum gene expression filter: 11852 (removed 57 cells)

Final AnnData shape (cells x genes): (11852, 36594)
Final number of cells: 11852
Final number of genes: 36594



  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [13]:
# Keep only the ATAC cells that survived RNA filtering, in the same order as the RNA object.
filtered_cell_barcodes = filtered_rna_adata.obs_names

# Fail with an explicit message if an RNA barcode has no ATAC counterpart,
# rather than relying on the indexing error raised further down.
barcodes_missing_in_atac = filtered_cell_barcodes.difference(filtered_atac_adata.obs_names)
if len(barcodes_missing_in_atac) > 0:
    raise KeyError(
        f"{len(barcodes_missing_in_atac)} RNA cell barcode(s) are missing from the ATAC object, "
        f"e.g. {list(barcodes_missing_in_atac[:5])}"
    )

# Label-based indexing both subsets and reorders, so a single copy is enough.
adata_atac_filtered = filtered_atac_adata[filtered_cell_barcodes, :].copy()

assert adata_atac_filtered.obs_names.equals(filtered_cell_barcodes), \
    "ATAC and RNA cells are not aligned"


In [14]:
# Reorder the ATAC cells so they follow the RNA barcode order.
# NOTE(review): the preceding cell already indexes by `filtered_cell_barcodes`,
# so this repeats the same selection and looks redundant - confirm before removing.
adata_atac_filtered = adata_atac_filtered[filtered_cell_barcodes, :].copy()

In [15]:
# Display the summary of the cell-matched ATAC object (bare last expression -> rich repr).
adata_atac_filtered

AnnData object with n_obs × n_vars = 11852 × 67207
    var: 'feature_id', 'feature_type', 'genome'

In [16]:
# Create the output directory if needed so the write does not fail on a fresh checkout.
os.makedirs("./10X_PBMC/03_filtered_data", exist_ok=True)
adata_atac_filtered.write_h5ad("./10X_PBMC/03_filtered_data/filtered_atac_adata.h5ad")

In [17]:
# Create the output directory if needed so the write does not fail on a fresh checkout.
os.makedirs("./10X_PBMC/03_filtered_data", exist_ok=True)
filtered_rna_adata.write_h5ad("./10X_PBMC/03_filtered_data/filtered_rna_adata.h5ad")