In [5]:
import os

def filter_gene_sets(input_file, hvg_file, output_file,
                     min_size=15, max_size=500, min_hvg=10):
    """Keep gene sets of acceptable size that overlap enough with an HVG list.

    The input is read line by line. Each line is split on tabs and must have
    at least three columns; the third column is a comma-separated gene list.
    Lines with fewer than three columns are skipped and not counted.

    A set is written to ``output_file`` (in input order) when:
      * its number of distinct genes is within ``[min_size, max_size]``, and
      * at least ``min_hvg`` of those genes appear in ``hvg_file``.

    Gene tokens are whitespace-stripped and empty tokens are dropped before
    counting, so ``"A, B"`` matches ``B`` and a trailing comma does not count
    as an extra gene.

    Args:
        input_file: Path to the tab-separated gene-set file.
        hvg_file: Path to a text file with one gene per line (blank lines
            are ignored).
        output_file: Path the retained lines are written to (overwritten).
        min_size: Minimum number of distinct genes in a set (inclusive).
        max_size: Maximum number of distinct genes in a set (inclusive).
        min_hvg: Minimum number of genes shared with the HVG list.

    Returns:
        None. A summary is printed to stdout. If a file cannot be found, an
        error message naming that file is printed and the function returns.
    """
    # 1. Load the High Variable Genes into a set for fast lookup
    print(f"Loading HVGs from {hvg_file}...")
    try:
        with open(hvg_file, 'r', encoding='utf-8') as f:
            hvg_set = {gene for gene in (raw.strip() for raw in f) if gene}
    except FileNotFoundError:
        print(f"Error: Could not find {hvg_file}")
        return

    print(f"Filtering {input_file}...")
    retained_count = 0
    total_processed = 0

    try:
        with open(input_file, 'r', encoding='utf-8') as f_in, \
             open(output_file, 'w', encoding='utf-8') as f_out:

            for line in f_in:
                parts = line.strip().split('\t')
                if len(parts) < 3:
                    continue

                total_processed += 1

                # Normalize tokens: "A, B" -> {"A", "B"}; "A,B," -> {"A", "B"}.
                gene_set = {token.strip() for token in parts[2].split(',')}
                gene_set.discard('')

                # --- FILTER 1: Total gene count (min_size to max_size) ---
                if not (min_size <= len(gene_set) <= max_size):
                    continue

                # --- FILTER 2: HVG intersection (at least min_hvg) ---
                if len(gene_set & hvg_set) >= min_hvg:
                    # A final input line may lack its newline; always
                    # terminate so the output stays one record per line.
                    f_out.write(line if line.endswith('\n') else line + '\n')
                    retained_count += 1

        print("\n" + "="*30)
        print("FILTERING RESULTS")
        print("="*30)
        print(f"Total sets processed:  {total_processed}")
        print(f"Sets retained:         {retained_count}")
        print(f"Sets discarded:        {total_processed - retained_count}")
        print(f"Output saved to:       {output_file}")

    except FileNotFoundError as exc:
        # Both the input and the output are opened inside this try block
        # (the output fails this way when its directory does not exist), so
        # report the path that actually failed rather than assuming input.
        missing = exc.filename if exc.filename is not None else input_file
        print(f"Error: Could not find {missing}")

# Script entry point: filter the enriched lncRNA gene sets against the scVI
# HVG list. Arguments are (input_file, hvg_file, output_file).
if __name__ == "__main__":
    filter_gene_sets(
        'enriched_lncrna_sets.txt',
        'scvi_HVGs.txt',
        'hvg_filtered_lncrna_sets.txt',
    )

Loading HVGs from scvi_HVGs.txt...
Filtering enriched_lncrna_sets.txt...

FILTERING RESULTS
Total sets processed:  7743
Sets retained:         4527
Sets discarded:        3216
Output saved to:       hvg_filtered_lncrna_sets.txt


In [6]:
import os

def filter_and_sort_gene_sets(input_file, hvg_file, output_file,
                              min_size=15, max_size=500, min_hvg=10):
    """Filter gene sets by size and HVG overlap, then sort by overlap.

    Each input line is split on tabs and must have at least three columns;
    the third column is a comma-separated gene list. Lines with fewer than
    three columns are skipped and not counted.

    A set is retained when its number of distinct genes is within
    ``[min_size, max_size]`` and at least ``min_hvg`` of them appear in
    ``hvg_file``. Retained lines (whitespace-stripped) are written to
    ``output_file`` ordered by HVG overlap, largest first; sets with equal
    overlap keep their input order because the sort is stable.

    Gene tokens are whitespace-stripped and empty tokens are dropped before
    counting, so ``"A, B"`` matches ``B`` and a trailing comma does not count
    as an extra gene.

    Args:
        input_file: Path to the tab-separated gene-set file.
        hvg_file: Path to a text file with one gene per line (blank lines
            are ignored).
        output_file: Path the sorted, retained lines are written to
            (overwritten).
        min_size: Minimum number of distinct genes in a set (inclusive).
        max_size: Maximum number of distinct genes in a set (inclusive).
        min_hvg: Minimum number of genes shared with the HVG list.

    Returns:
        None. A summary is printed to stdout. If a file cannot be found, an
        error message naming that file is printed and the function returns.
    """
    # 1. Load the High Variable Genes
    print(f"Loading HVGs from {hvg_file}...")
    try:
        with open(hvg_file, 'r', encoding='utf-8') as f:
            hvg_set = {gene for gene in (raw.strip() for raw in f) if gene}
    except FileNotFoundError:
        print(f"Error: Could not find {hvg_file}")
        return

    # 2. Process and Filter
    retained_data = []
    total_processed = 0

    print(f"Filtering {input_file}...")
    try:
        with open(input_file, 'r', encoding='utf-8') as f_in:
            for line in f_in:
                record = line.strip()
                parts = record.split('\t')
                if len(parts) < 3:
                    continue

                total_processed += 1

                # Normalize tokens: "A, B" -> {"A", "B"}; "A,B," -> {"A", "B"}.
                genes = {token.strip() for token in parts[2].split(',')}
                genes.discard('')

                # Filter 1: Total size (min_size to max_size)
                if not (min_size <= len(genes) <= max_size):
                    continue

                # Filter 2: HVG intersection (>= min_hvg)
                hvg_count = len(genes & hvg_set)
                if hvg_count >= min_hvg:
                    # Store the data and the count for sorting
                    retained_data.append({
                        'line': record,
                        'hvg_count': hvg_count
                    })

        # 3. Sort by HVG count (Descending: Most to Least)
        retained_data.sort(key=lambda entry: entry['hvg_count'], reverse=True)

        # 4. Write to Output
        with open(output_file, 'w', encoding='utf-8') as f_out:
            for entry in retained_data:
                f_out.write(entry['line'] + '\n')

        print("\n" + "="*30)
        print("FILTERING & SORTING RESULTS")
        print("="*30)
        print(f"Total sets processed:  {total_processed}")
        print(f"Sets retained:         {len(retained_data)}")
        if retained_data:
            # The list is sorted descending, so the ends hold the extremes.
            print(f"Max HVGs found:        {retained_data[0]['hvg_count']}")
            print(f"Min HVGs found:        {retained_data[-1]['hvg_count']}")
        print(f"Output saved to:       {output_file}")

    except FileNotFoundError as exc:
        # Both the input and the output are opened inside this try block
        # (the output fails this way when its directory does not exist), so
        # report the path that actually failed rather than assuming input.
        missing = exc.filename if exc.filename is not None else input_file
        print(f"Error: Could not find {missing}")

# Script entry point: filter the enriched lncRNA gene sets against the scVI
# HVG list and write them sorted by HVG overlap.
# Arguments are (input_file, hvg_file, output_file).
if __name__ == "__main__":
    filter_and_sort_gene_sets(
        'enriched_lncrna_sets.txt',
        'scvi_HVGs.txt',
        'sorted_filtered_lncrna_sets.txt',
    )

Loading HVGs from scvi_HVGs.txt...
Filtering enriched_lncrna_sets.txt...

FILTERING & SORTING RESULTS
Total sets processed:  7743
Sets retained:         4527
Max HVGs found:        270
Min HVGs found:        10
Output saved to:       sorted_filtered_lncrna_sets.txt
