In [1]:
 # Switch the kernel's working directory to the eRNA project folder so that the
 # relative './10X_PBMC/...' paths used by the later load/save cells resolve.
 # NOTE(review): this is an absolute, machine-specific path — presumably it must be
 # adjusted when the notebook is run on another system; confirm.
 %cd /sci/labs/yotamd/lab_share/avishai.wizel/eRNA/

/sci/labs/yotamd/lab_share/avishai.wizel/eRNA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csc_matrix, issparse # csc_matrix or csr_matrix
import re # for identifying sex chromosomes
import warnings


In [15]:
# Matches peak names written as '<chrom>-<start>-<end>' or '<chrom>_<start>_<end>'.
# The chromosome part is lazy so contig names that themselves contain '_' or '-'
# (e.g. 'chr1_KI270706v1_random_100_200') are captured in full.
_PEAK_COORD_SUFFIX_RE = re.compile(r"^(.+?)[_-]\d+[_-]\d+$")


def _peak_chromosome(peak_name):
    """Return the chromosome/contig part of a peak name.

    Names containing ':' are split on the first ':' (``chr1:100-200`` -> ``chr1``).
    Otherwise a trailing ``-start-end`` / ``_start_end`` coordinate suffix is
    stripped if present; names matching neither form are returned unchanged.
    """
    name = str(peak_name)
    if ':' in name:
        return name.split(':')[0]
    match = _PEAK_COORD_SUFFIX_RE.match(name)
    return match.group(1) if match else name


def _cells_per_peak(cell_by_peak_matrix):
    """Return a 1-D array with the column sums (one value per peak) of a cells x peaks matrix."""
    # np.asarray(...).ravel() works whether .sum() returns np.matrix (sparse
    # matrices) or a plain ndarray (sparse arrays / dense arrays); '.A1' only
    # exists on np.matrix.
    return np.asarray(cell_by_peak_matrix.sum(axis=0)).ravel()


def filter_scATAC_df(
    df_matrix,
    genome_assembly="hg38", # "hg38" or "mm10"
    min_cells_per_peak=5,
    max_cells_percentage_per_peak=0.10
):
    """
    Binarize and filter a scATAC-seq peak-by-cell table and return it as AnnData.

    The input DataFrame is expected to have peak names as its index and cell
    barcodes as its columns. Progress and intermediate shapes are printed.

    Steps, in order:
        1. Binarize: every non-zero entry becomes 1.
        2. Drop peaks located on chrX / chrY.
        3. (Placeholder) overlapping-peak merging is NOT performed.
        4. Drop peaks present in fewer than ``min_cells_per_peak`` cells.
        5. Drop peaks present in more than ``max_cells_percentage_per_peak``
           (a fraction) of all cells.

    Args:
        df_matrix (pandas.DataFrame):
            The data matrix, with peaks as rows (index) and cells as columns.
            Peak names are parsed as ``chrom:start-end``; ``chrom-start-end``
            and ``chrom_start_end`` are also recognised.
        genome_assembly (str):
            Genome reference name ("hg38" or "mm10", case-insensitive) used to
            look up the sex chromosomes. Any other value triggers a warning
            and skips the sex-chromosome filter.
        min_cells_per_peak (int):
            Minimum number of cells a peak must appear in to be retained.
        max_cells_percentage_per_peak (float):
            Maximum FRACTION of cells (0.10 means 10%) a peak may appear in to
            be retained. The bound is inclusive. A value outside [0, 1] only
            triggers a warning.

    Returns:
        anndata.AnnData: Filtered, binarized data with cells as rows (``obs``)
        and peaks as columns (``var``); ``X`` is a sparse CSR matrix.

    Raises:
        TypeError: If ``df_matrix`` is not a pandas DataFrame.
    """

    # --- Initial Checks and Conversion to Sparse Matrix ---
    if not isinstance(df_matrix, pd.DataFrame):
        raise TypeError("Input 'df_matrix' must be a pandas DataFrame.")

    if not 0 <= max_cells_percentage_per_peak <= 1:
        # Most likely a percent (e.g. 10) was passed instead of a fraction (0.10).
        warnings.warn(
            "'max_cells_percentage_per_peak' is interpreted as a fraction of cells "
            f"(e.g. 0.10 for 10%); got {max_cells_percentage_per_peak}."
        )

    # Extract peak names and cell barcodes from the DataFrame
    peak_names_initial = df_matrix.index.to_numpy()
    cell_barcodes_initial = df_matrix.columns.to_numpy()

    print(f"Initial DataFrame shape (peaks x cells): {df_matrix.shape}")
    print(f"Initial number of peaks: {df_matrix.shape[0]}")
    print(f"Initial number of cells: {df_matrix.shape[1]}\n")

    # Convert the DataFrame to a sparse matrix for efficient processing
    # df_matrix is peaks x cells, so convert directly to csc_matrix
    sparse_matrix = csc_matrix(df_matrix.values)
    # Drop this function's reference to the dense table. Memory is only released
    # if the caller does not hold another reference to the same DataFrame.
    del df_matrix

    print("Converted DataFrame to sparse matrix for internal processing.\n")

    # Create an AnnData object for convenient handling during filtering.
    # AnnData's .X expects 'cells x features (peaks)' format.
    # Since input sparse_matrix is 'peaks x cells', we transpose it to 'cells x peaks'.
    adata = ad.AnnData(
        X=sparse_matrix.T.tocsr(), # Transpose and convert to CSR for efficient row (cell) slicing
        obs=pd.DataFrame(index=cell_barcodes_initial),
        var=pd.DataFrame(index=peak_names_initial)
    )
    print(f"AnnData object created for filtering. Shape (cells x peaks): {adata.shape}\n")

    # --- Filtering Steps ---

    ## 1. Binarize data: Replace all non-zero values with 1
    print("1. Binarizing data: Replacing all non-zero values with 1...")
    # sign() alone would map negative entries to -1 and corrupt the per-peak
    # counts below; abs() makes every non-zero entry exactly 1.
    binarized = abs(adata.X.sign())
    binarized.eliminate_zeros()
    adata.X = binarized
    print(f"   Matrix non-zero elements after binarization: {adata.X.nnz}\n")

    ## 2. Remove peaks on sex chromosomes
    print("2. Removing peaks on sex chromosomes...")

    sex_chroms_map = {'hg38': ['chrX', 'chrY'], 'mm10': ['chrX', 'chrY']}
    assembly_key = genome_assembly.lower()

    if assembly_key not in sex_chroms_map:
        warnings.warn(f"Unsupported genome assembly '{genome_assembly}'. Skipping sex chromosome filtering.")
        is_sex_chrom = np.zeros(adata.shape[1], dtype=bool)
    else:
        current_sex_chroms = sex_chroms_map[assembly_key]
        peak_chroms = np.array([_peak_chromosome(p) for p in adata.var_names])
        is_sex_chrom = np.isin(peak_chroms, current_sex_chroms)

    adata = adata[:, ~is_sex_chrom].copy()

    print(f"   Peaks remaining after sex chromosome removal: {adata.shape[1]}\n")

    ## 3. Merge overlapping peaks (conceptual step - full implementation complex)
    print("3. Merging overlapping peaks... (This step requires dedicated genomic tools or advanced logic and is not fully implemented in this minimal example.)")
    print("   Assuming peaks are already non-overlapping or this step is handled externally for now.\n")

    ## 4. Remove peaks occurring in fewer than 'min_cells_per_peak' cells
    print(f"4. Removing peaks occurring in fewer than {min_cells_per_peak} cells...")

    peaks_to_keep_min_cells = _cells_per_peak(adata.X) >= min_cells_per_peak
    adata = adata[:, peaks_to_keep_min_cells].copy()

    print(f"   Peaks remaining after minimum cell count filter: {adata.shape[1]}\n")

    ## 5. Remove peaks occurring in more than 'max_cells_percentage_per_peak' of cells
    print(f"5. Removing peaks occurring in more than {max_cells_percentage_per_peak*100:.2f}% of cells...")

    n_cells_total = adata.shape[0]
    if n_cells_total > 0:
        # Compare fractions directly instead of int(n_cells * fraction): the
        # product can land just below an integer (100 * 0.29 == 28.999...) and
        # truncation would then drop peaks sitting exactly on the bound.
        fraction_of_cells = _cells_per_peak(adata.X) / n_cells_total
        peaks_to_keep_max_cells = fraction_of_cells <= max_cells_percentage_per_peak
    else:
        # No cells: nothing can exceed the bound, keep whatever peaks remain.
        peaks_to_keep_max_cells = np.ones(adata.shape[1], dtype=bool)
    adata = adata[:, peaks_to_keep_max_cells].copy()

    print(f"   Peaks remaining after maximum cell percentage filter: {adata.shape[1]}\n")

    print(f"\nFinal DataFrame shape (cells x peaks): {adata.shape}")
    print(f"Final number of peaks: {adata.shape[1]}")
    print(f"Final number of cells: {adata.shape[0]}")

    return adata



In [13]:
# Load the raw scATAC-seq count table; filter_scATAC_df expects peaks as rows
# and cell barcodes as columns.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
sc_atac_data = pd.read_pickle(
    './10X_PBMC/02_counts_data/sc_atac_data.pkl'
)

# Binarize and filter the peaks (hg38 sex chromosomes removed; peaks kept when
# seen in at least 5 cells and in at most 10% of cells).
filtered_adata = filter_scATAC_df(
    sc_atac_data,
    genome_assembly="hg38",
    min_cells_per_peak=5,
    max_cells_percentage_per_peak=0.10,
)


Initial DataFrame shape (peaks x cells): (80234, 8633)
Initial number of peaks: 80234
Initial number of cells: 8633

Converted DataFrame to sparse matrix for internal processing.

AnnData object created for filtering. Shape (cells x peaks): (8633, 80234)

1. Binarizing data: Replacing all non-zero values with 1...
   Matrix non-zero elements after binarization: 51265105

2. Removing peaks on sex chromosomes...
   Peaks remaining after sex chromosome removal: 78219

3. Merging overlapping peaks... (This step requires dedicated genomic tools or advanced logic and is not fully implemented in this minimal example.)
   Assuming peaks are already non-overlapping or this step is handled externally for now.

4. Removing peaks occurring in fewer than 5 cells...
   Peaks remaining after minimum cell count filter: 77324

5. Removing peaks occurring in more than 10.00% of cells...
   Peaks remaining after maximum cell percentage filter: 62083


Final DataFrame shape (cells x peaks): (8633, 62083)


In [14]:
# Save the filtered, binarized AnnData object (cells x peaks) returned by
# filter_scATAC_df to disk in .h5ad format, relative to the current working directory.
# NOTE(review): presumably the ./10X_PBMC/03_filtered_data/ directory must already exist — confirm.
filtered_adata.write("./10X_PBMC/03_filtered_data/sc_atac_filtered.h5ad")