## Step 1: Convert GTF files to custom BED files.

In [None]:
import re

def extract_attribute(attr_str, key):
    match = re.search(f'{key} "([^"]+)"', attr_str)
    return match.group(1) if match else "N.A."

def convert_gtf_to_bed(input_gtf, output_file):
    """
    Convert GTF file to BED format.
    """
    with open(input_gtf, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            if line.startswith('#'):
                continue

            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue

            # annotations
            chr_name = fields[0]
            start = str(int(fields[3]) - 1)  
            end = fields[4]
            score = '.'
            strand = fields[6]
            feature_type = fields[2]
            attributes = fields[8]


            gene_id = extract_attribute(attributes, 'gene_id')
            transcript_id = extract_attribute(attributes, 'transcript_id')
            gene_name = extract_attribute(attributes, 'gene_name')


            id_field = gene_id if feature_type == 'gene' else transcript_id

            bed_line = f"{chr_name}\t{start}\t{end}\t{id_field}\t{score}\t{strand}\t{gene_id}\t{feature_type}\t{gene_name}\n"
            f_out.write(bed_line)
if __name__ == "__main__":
   convert_gtf_to_bed("./test/gencode.v47.long_noncoding_RNAs.gtf", "./test/output/gencodev47.bed")
   convert_gtf_to_bed("./test/NONCODEv6_human_hg38_lncRNA.gtf", "./test/output/noncode.bed")
   convert_gtf_to_bed("./test/lncRNA_LncBookv2.0_GRCh38.gtf", "./test/output/lncbook.bed")

In [None]:
# for NCBI GTF, we use a specific script to handle the lncRNA-only conversion
!python3 conBed.py ./test/GCF_000001405.40_GRCh38.p14_genomic.gtf ./test/output/ncbi.bed --lncrna-only


### Step1.1: Generate files for subsequent steps

In [4]:
import pandas as pd
def convert_bed_to_gene_mapping(bed_file):
    """
    Convert BED file to gene mapping format.
    """
    with open(bed_file) as f:
        first_line = f.readline()
        num_columns = len(first_line.strip().split('\t'))

    # Select column names based on column count
    if num_columns == 9:
        col_names = ['chr', 'start', 'end', 'trans_id', 'score', 'strand', 'GeneID', 'gene_type', 'gene_name']
        df = pd.read_csv(bed_file, sep='\t', header=None, names=col_names)
    elif num_columns == 12:
        col_names = ['chr', 'start', 'end', 'name', 'score', 'strand', 'gene_id', 'transcript_id', 'feature', 'gene_name', 'gene_biotype', 'GeneID']
        df = pd.read_csv(bed_file, sep='\t', header=None, names=['chr', 'start', 'end', 'name', 'score', 'strand', 'gene_id','transcript_id', 'gene_type', 'gene_name','gene_biotype','GeneID'])
    else:
        raise ValueError(f"Unexpected number of columns: {num_columns}")
    
    # df['gene_name'] = df['gene_name']
    

    mask = (df['gene_type'] == 'gene') & (df['gene_name'] != 'N.A.'& df['gene_name'])
    filtered_df = df[mask]

    result_df = filtered_df[['gene_name', 'GeneID']]
    result_df.to_csv('gene_mapping.txt', sep='\t', mode='a', index=False, header=False)
convert_bed_to_gene_mapping('./test/output/lncbook.bed')
convert_bed_to_gene_mapping('./test/output/ncbi.bed')

  df = pd.read_csv(bed_file, sep='\t', header=None, names=['chr', 'start', 'end', 'name', 'score', 'strand', 'gene_id','transcript_id', 'gene_type', 'gene_name','gene_biotype','GeneID'])


## Step2:Map public database gene IDs and other annotation information.

In [49]:
import os
os.system('bedtools intersect -a ./test/output/crispr_all.bed -b ./test/output/lncbook.bed -wo -s -r -f 1 > ./test/output/lctemp.bed')
os.system('bedtools intersect -a ./test/output/crispr_all.bed -b ./test/output/noncode.bed -wo -s -r -f 1 > ./test/output/nctemp.bed')
os.system('bedtools intersect -a ./test/output/crispr_all.bed -b ./test/output/gencodev47.bed -wo -s -r -f 1 > ./test/output/gctemp.bed')
os.system('bedtools intersect -a ./test/output/crispr_all.bed -b ./test/output/ncbi.bed -wo -s -r -f 1 > ./test/output/nbtemp.bed')

0

In [44]:
# Process the temporary BED file to generate a TSV file; remove duplicates by cumulative overlapping length
from collections import defaultdict
import os
def bed2tsv(input_temp_bed_file, result_file):
    """
    Convert a temporary BED file to a TSV file with specific fields.
    """
     # 
    primary_groups = defaultdict(lambda: defaultdict(dict))
    sums = defaultdict(lambda: defaultdict(int))


    with open(input_temp_bed_file, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 16:  
                key = fields[3].rsplit('-', 1)[0]
                gene_id = fields[12]
                length = int(fields[15])
                transcript_id = fields[3].rsplit('-', 1)[1]


                col8 = fields[7]
                col9 = fields[8]
                deduplicate_key = (col8, col9)  

                value = (transcript_id, fields[9], fields[12], fields[14], fields[13])


                if deduplicate_key not in primary_groups[key][gene_id]:
                
                    primary_groups[key][gene_id][deduplicate_key] = value

                    sums[key][gene_id] += length


    with open('temp.tsv', 'w') as out:
        for key, gene_groups in primary_groups.items():

            if len(gene_groups) > 1:

                max_gene_id = max(gene_groups.keys(), key=lambda x: sums[key][x])

                for record in gene_groups[max_gene_id].values():
                    out.write(key+'\t'+'\t'.join(record) + '\n')
            else:

                for records in gene_groups.values():
                    for record in records.values():
                        out.write(key+'\t'+'\t'.join(record) + '\n')

    os.system('sort temp.tsv | uniq > ' + result_file)
if __name__ == "__main__":
    bed2tsv("./test/output/gctemp.bed", "./test/map/gencode_map.tsv")
    bed2tsv("./test/output/lctemp.bed", "./test/map/lncbook_map.tsv")
    bed2tsv("./test/output/nctemp.bed", "./test/map/noncode_map.tsv")


In [50]:
#for NCBI bed
!python3 pro1.py ./test/output/nbtemp.bed ./test/map/ncbi_map.tsv

Number of rows in new match data: 85358
Number of new targets: 356
Checking gene_id conflicts, only handling LOC/non-LOC mixed conflicts...
Handled 7 LOC/non-LOC mixed conflicts
✓ LOC conflict resolution report saved: loc_gene_conflict_resolution.csv
Calculating best gene mapping for each target (based on deduplicated exon cumulative length)...
Number of target-gene combinations: 362
✓ output: ./test/map/ncbi_map.tsv


In [52]:
# Define mapping between TSV files and their output BED files
tsv_to_bed = {
    './test/map/noncode_map.tsv': 'filtered_noncode_crispr.bed',
    './test/map/lncbook_map.tsv': 'filtered_lncbook_crispr.bed',
    './test/map/gencode_map.tsv': 'filtered_gencode_crispr.bed',
    './test/map/ncbi_map.tsv': 'filtered_ncbi_crispr.bed'
}

# Process each TSV file individually
for tsv_file, output_bed in tsv_to_bed.items():
    # Collect unique genes from first column of current TSV file
    exclude_genes = set()
    with open(tsv_file, 'r') as f:
    # Get unique genes from first column
        exclude_genes = set(line.split('\t')[0] for line in f)
    
    # Filter crispr_all.bed based on current exclusion list
    filtered_lines = []
    with open('./test/output/crispr_all.bed', 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if not fields:  # Skip empty lines
                continue
            
            # Extract gene name (part before last "-" in 4th column)
            try:
                gene_name = fields[3].rsplit('-', 1)[0]
            except IndexError:
                # Handle lines without "-" in 4th column
                gene_name = fields[3]
            
            # Keep line if gene not in current exclusion set
            if gene_name not in exclude_genes:
                filtered_lines.append(line)
    
    # Write filtered results to corresponding output file
    with open(output_bed, 'w') as f:
        f.writelines(filtered_lines)
    
    # Print statistics for current processing step
    print(f"Processed {tsv_file}:")
    print(f"  Excluded genes: {len(exclude_genes)}")
    print(f"  Output lines in {output_bed}: {len(filtered_lines)}\n")


Processed ./test/map/noncode_map.tsv:
  Excluded genes: 860
  Output lines in filtered_noncode_crispr.bed: 1640

Processed ./test/map/lncbook_map.tsv:
  Excluded genes: 1015
  Output lines in filtered_lncbook_crispr.bed: 1382

Processed ./test/map/gencode_map.tsv:
  Excluded genes: 760
  Output lines in filtered_gencode_crispr.bed: 2729

Processed ./test/map/ncbi_map.tsv:
  Excluded genes: 356
  Output lines in filtered_ncbi_crispr.bed: 7126



In [53]:
# Searching for gene entries where one exon is completely contained within another exon.
!bedtools intersect -a filtered_lncbook_crispr.bed -b ./test/output/lncbook.bed -wo -s -f 1 > res_lctemp.bed
!bedtools intersect -a filtered_noncode_crispr.bed -b ./test/output/noncode.bed -wo -s -f 1 > res_nctemp.bed
!bedtools intersect -a filtered_gencode_crispr.bed -b ./test/output/gencodev47.bed -wo -s -f 1 > res_gctemp.bed
!bedtools intersect -a filtered_ncbi_crispr.bed -b ./test/output/ncbi.bed -wo -s -f 1 > res_nbtemp.bed

In [None]:
# Process the temporary BED file to generate a TSV file; remove duplicates by cumulative overlapping length
from collections import defaultdict
import os
def bed2tsv(input_temp_bed_file, result_file):
    """
    Convert a temporary BED file to a TSV file with specific fields.
    """
     # 
    primary_groups = defaultdict(lambda: defaultdict(dict))
    sums = defaultdict(lambda: defaultdict(int))


    with open(input_temp_bed_file, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 16:  
                key = fields[3].rsplit('-', 1)[0]
                gene_id = fields[12]
                length = int(fields[15])
                transcript_id = fields[3].rsplit('-', 1)[1]


                col8 = fields[7]
                col9 = fields[8]
                deduplicate_key = (col8, col9)  

                value = (transcript_id, fields[9], fields[12], fields[14], fields[13])


                if deduplicate_key not in primary_groups[key][gene_id]:
                
                    primary_groups[key][gene_id][deduplicate_key] = value

                    sums[key][gene_id] += length


    with open('temp.tsv', 'w') as out:
        for key, gene_groups in primary_groups.items():

            if len(gene_groups) > 1:

                max_gene_id = max(gene_groups.keys(), key=lambda x: sums[key][x])

                for record in gene_groups[max_gene_id].values():
                    out.write(key+'\t'+'\t'.join(record) + '\n')
            else:

                for records in gene_groups.values():
                    for record in records.values():
                        out.write(key+'\t'+'\t'.join(record) + '\n')

    os.system('sort temp.tsv | uniq > ' + result_file)
if __name__ == "__main__":
    bed2tsv("res_gctemp.bed", "./test/map/res_gencode_map.tsv")
    bed2tsv("res_lctemp.bed", "./test/map/res_lncbook_map.tsv")
    bed2tsv("res_nctemp.bed", "./test/map/res_noncode_map.tsv")


In [54]:
!python3 pro1.py res_nbtemp.bed ./test/map/res_ncbi_map.tsv

Number of rows in new match data: 4904
Number of new targets: 209
Checking gene_id conflicts, only handling LOC/non-LOC mixed conflicts...
Handled 1 LOC/non-LOC mixed conflicts
✓ LOC conflict resolution report saved: loc_gene_conflict_resolution.csv
Calculating best gene mapping for each target (based on deduplicated exon cumulative length)...
Number of target-gene combinations: 212
✓ output: ./test/map/res_ncbi_map.tsv


## Delete temporary files

In [55]:
!rm -f temp.tsv res_lctemp.bed res_nctemp.bed res_gctemp.bed res_nbtemp.bed 
!rm -f ./test/output/lctemp.bed ./test/output/nctemp.bed ./test/output/gctemp.bed ./test/output/nbtemp.bed
!rm -f filtered_lncbook_crispr.bed filtered_noncode_crispr.bed filtered_gencode_crispr.bed filtered_ncbi_crispr.bed


### The following code is verification test code and can be not executed.

In [None]:
# Temporary verification code
from collections import defaultdict


def  bed2tsv_temp(input_temp_bed_file, output_file):
    """
    Convert a temporary BED file to a TSV file with specific fields.
    """
    groups = defaultdict(set)  # 使用set自动去重
    
    with open(input_temp_bed_file, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 15:
                key = fields[3].rsplit('-', 1)[0]
                transcript_id = fields[3].rsplit('-', 1)[1]
                value = (transcript_id, fields[9], fields[12], fields[14])
                
                groups[key].add(value)  # set自动去重
    
    with open(output_file, 'w') as out:
        for group_id, values in groups.items():
            for value in values:
                out.write(f'{group_id}\t{value[0]}\t{value[1]}\t{value[2]}\t{value[3]}\n')

if __name__ == "__main__":
    bed2tsv_temp("./test/output/gctemp.bed", "./test/map/gcmap.tsv")
    bed2tsv_temp("./test/output/lctemp.bed", "./test/map/lcmap.tsv")
    bed2tsv_temp("./test/output/nctemp.bed", "./test/map/ncmap.tsv")

In [None]:
# In the deduplication process,if multiple gene IDs are matched,manually select the one with the highest overlap ratio.
from collections import defaultdict

input_temp_tsv_file = './test/map/gcmap.tsv'
groups = defaultdict(set)


with open(input_temp_tsv_file, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) >= 5:
            group_key = fields[0]  
            gene_id = fields[3]    
            groups[group_key].add(gene_id)


with open('tocheck_id.txt', 'a') as out:
    for group_key, gene_ids in groups.items():
        if len(gene_ids) > 1:
            out.write(f"{group_key}\t{','.join(gene_ids)}\n")