In [1]:
import pandas as pd
from pathlib import Path
import glob

def get_significant_genes_for_condition(condition, cell_type_group, rename_map, 
                                       csv_dir=".", adj_pval_threshold=0.05):
    """
    Collect the significant genes for a condition across a cell type group.

    Every original cell type that ``rename_map`` assigns to
    ``cell_type_group`` is looked up in ``csv_dir`` using the file name
    pattern ``{cell_type}_Assign{condition}_*.csv``. From each matching file
    the rows whose ``adj_pval`` is strictly below ``adj_pval_threshold`` are
    kept, and the per-file results are concatenated.

    Parameters:
    -----------
    condition : str
        Perturbation condition (e.g., "Cul1"). Matched literally.
    cell_type_group : str
        Grouped cell type name from rename_map values (e.g., "ctL6")
    rename_map : dict
        Mapping from original cell type names to grouped names
    csv_dir : str or Path
        Directory containing the CSV files
    adj_pval_threshold : float
        Adjusted p-value threshold for significance (default: 0.05)

    Returns:
    --------
    significant_genes : pd.DataFrame
        The rows that passed the threshold, with two added columns:
        ``original_cell_type`` and ``source_file``. Rows are NOT
        deduplicated: a gene appears once per file in which it is
        significant. When no file matches at all, an empty DataFrame
        without any columns is returned.

    Raises:
    -------
    KeyError
        If a matching CSV lacks the ``adj_pval`` column, or if the combined
        result lacks the ``Gene`` column.
    """
    csv_dir = Path(csv_dir)

    # Get all original cell types that map to this group
    original_cell_types = [
        original for original, group in rename_map.items()
        if group == cell_type_group
    ]

    print(f"Looking for {cell_type_group} (mapped from: {original_cell_types})")

    all_genes = []

    for cell_type in original_cell_types:
        # Escape glob metacharacters so that names are matched literally;
        # otherwise a label such as "X[1]" would be read as a character class
        # and match "X1..." instead of its own file.
        pattern = (
            f"{glob.escape(str(cell_type))}_Assign"
            f"{glob.escape(str(condition))}_*.csv"
        )

        # Path.glob yields entries in arbitrary (filesystem) order; sorting
        # makes the row order of the result reproducible across machines.
        for file_path in sorted(csv_dir.glob(pattern)):
            print(f"  Reading: {file_path.name}")

            df = pd.read_csv(file_path)

            # .copy() so the provenance columns below are written to an
            # independent frame rather than a view of ``df``.
            sig_genes = df[df['adj_pval'] < adj_pval_threshold].copy()

            # Record where each row came from
            sig_genes['original_cell_type'] = cell_type
            sig_genes['source_file'] = file_path.name

            all_genes.append(sig_genes)

    if not all_genes:
        print(f"No files found for {cell_type_group} + {condition}")
        return pd.DataFrame()

    # Combine all significant genes
    combined = pd.concat(all_genes, ignore_index=True)

    # The same gene may be significant in several cell types / files, so the
    # reported count is over distinct gene names, not rows.
    unique_genes = combined['Gene'].unique()

    print(f"Found {len(unique_genes)} unique significant genes")

    return combined

def get_gene_list_for_condition(condition, cell_type_group, rename_map, 
                                csv_dir=".", adj_pval_threshold=0.05):
    """
    Return the sorted, de-duplicated names of the significant genes.

    Thin wrapper around ``get_significant_genes_for_condition``: all
    arguments are forwarded unchanged and only the ``Gene`` column of the
    resulting table is kept.

    Parameters:
    -----------
    condition : str
        Perturbation condition (e.g., "Cul1")
    cell_type_group : str
        Grouped cell type name from rename_map values (e.g., "ctL6")
    rename_map : dict
        Mapping from original cell type names to grouped names
    csv_dir : str or Path
        Directory containing the CSV files
    adj_pval_threshold : float
        Adjusted p-value threshold for significance (default: 0.05)

    Returns:
    --------
    gene_list : list
        Unique gene names in ascending order; an empty list when nothing
        was found.
    """
    significant = get_significant_genes_for_condition(
        condition, cell_type_group, rename_map, csv_dir, adj_pval_threshold
    )

    # Guard first: the empty result may have no 'Gene' column at all.
    if significant.empty:
        return []

    gene_names = significant['Gene'].unique().tolist()
    gene_names.sort()
    return gene_names


# Usage example
# Keys are the original cell type labels used in the CSV file names;
# values are the grouped labels they are pooled into.
rename_map = {
    "L6_CT_CTX": "ctL6",
    "L6b_CTX": "ctL6",
    "L5_PT_CTX": "PT",
    "L5_NP_CTX": "NP",
    "L3_RSP-ACA": "itRSP",
    "L4_5_IT_CTX": "itRSP",
    "L2_3_IT_CTX-1": "itRSP",
    "L6_IT_CTX": "itRSP",
    "L5_IT_CTX": "itRSP",
    "L2_3_IT_PPP": "itRSP",
    "L2_3_IT_CTX-2": "itRSP",
    "Sst": "inhib",
    "Pvalb": "inhib",
}

# Both lookups below ask the same question (ctL6 + Cul1), so the arguments
# are spelled out once and shared.
example_query = {
    "condition": "Cul1",
    "cell_type_group": "ctL6",
    "rename_map": rename_map,
    "csv_dir": "/gpfs/home/asun/jin_lab/perturbench/raw_data/sig_deg",
}

# Full table of significant genes with their statistics
sig_genes_df = get_significant_genes_for_condition(**example_query)

# Or just the sorted list of unique gene names
gene_list = get_gene_list_for_condition(**example_query)

print(f"Significant genes: {gene_list[:10]}...")  # Show first 10

# Save to file
#sig_genes_df.to_csv("ctL6_Cul1_significant_genes.csv", index=False)

Looking for ctL6 (mapped from: ['L6_CT_CTX', 'L6b_CTX'])
  Reading: L6_CT_CTX_AssignCul1_0_sig.csv
Found 2100 unique significant genes
Looking for ctL6 (mapped from: ['L6_CT_CTX', 'L6b_CTX'])
  Reading: L6_CT_CTX_AssignCul1_0_sig.csv
Found 2100 unique significant genes
Significant genes: ['0610010F05Rik', '0610012G03Rik', '0610043K17Rik', '1110019D14Rik', '1110059E24Rik', '1500004A13Rik', '1700001O22Rik', '1700016P03Rik', '1700019D03Rik', '1700025G04Rik']...
