## Step 1: Convert GTF files to custom BED files.

In [None]:
import re

def extract_attribute(attr_str, key):
    """
    Return the double-quoted value of ``key`` in a GTF attribute string.

    Args:
        attr_str: The attribute column of a GTF line, e.g.
            ``gene_id "G1"; transcript_id "T1";``.
        key: Attribute name to look up. It is matched literally and only as
            a whole name, so ``gene_id`` does not match ``ref_gene_id``.

    Returns:
        The first non-empty quoted value found for ``key``, or ``"N.A."``
        when the key is absent or its value is empty.
    """
    # (?<!\w) stops the key from matching as the tail of a longer attribute
    # name; re.escape keeps characters such as "." in the key literal.
    pattern = r'(?<!\w)' + re.escape(key) + r'\s+"([^"]+)"'
    match = re.search(pattern, attr_str)
    return match.group(1) if match else "N.A."

def convert_gtf_to_bed(input_gtf, output_file):
    """
    Write one tab-separated, 9-column BED-style line per GTF record.

    Lines starting with ``#`` and lines with fewer than 9 tab-separated
    fields are skipped. Output columns are: chromosome, start (GTF start
    minus 1), end, name, score (always ``.``), strand, gene_id, feature type
    and gene_name. The name column holds the gene_id for ``gene`` features
    and the transcript_id for every other feature.

    Args:
        input_gtf: Path of the GTF file to read.
        output_file: Path of the BED file to write (overwritten if present).
    """
    with open(input_gtf, 'r') as gtf, open(output_file, 'w') as bed:
        for raw_line in gtf:
            # Header / comment lines carry no records.
            if raw_line.startswith('#'):
                continue

            cols = raw_line.strip().split('\t')
            if len(cols) < 9:
                continue

            # GTF starts are 1-based; BED starts are 0-based.
            bed_start = str(int(cols[3]) - 1)
            feature = cols[2]
            attrs = cols[8]

            gene_id = extract_attribute(attrs, 'gene_id')
            transcript_id = extract_attribute(attrs, 'transcript_id')
            gene_name = extract_attribute(attrs, 'gene_name')

            # Gene rows are named after the gene, all others after the transcript.
            if feature == 'gene':
                name = gene_id
            else:
                name = transcript_id

            record = (
                cols[0],
                bed_start,
                cols[4],
                name,
                '.',
                cols[6],
                gene_id,
                feature,
                gene_name,
            )
            bed.write('\t'.join(record) + '\n')
if __name__ == "__main__":
    # (input GTF, output BED) pairs, converted in the order listed.
    for gtf_path, bed_path in (
        ("./test/gencode.v47.long_noncoding_RNAs.gtf", "./test/output/gencodev47.bed"),
        ("./test/NONCODEv6_human_hg38_lncRNA.gtf", "./test/output/noncode.bed"),
        ("./test/lncRNA_LncBookv2.0_GRCh38.gtf", "./test/output/lncbook.bed"),
    ):
        convert_gtf_to_bed(gtf_path, bed_path)

In [None]:
# for NCBI GTF, we use a specific script to handle the lncRNA-only conversion
!python3 conBed.py ./test/GCF_000001405.40_GRCh38.p14_genomic.gtf ./test/output/ncbi.bed --lncrna-only


### Step 1.1: Generate files for subsequent steps

In [None]:
import pandas as pd

# Step 1 writes the LncBook BED file to ./test/output/ and the bedtools calls
# in Step 2 read it from there, so read the same path here instead of a bare
# 'lncbook.bed' in the working directory.
lncbook_bed = './test/output/lncbook.bed'

df = pd.read_csv(
    lncbook_bed,
    sep='\t',
    header=None,
    names=['chr', 'start', 'end', 'trans_id', 'score', 'strand', 'gene_id', 'gene_type', 'gene_name'],
    # Missing attributes are written as the literal "N.A." by the converter;
    # disable pandas' default NA parsing so names such as "NA" stay strings.
    keep_default_na=False,
)
# When several comma-separated names are listed, keep only the first one.
df['gene_name'] = df['gene_name'].str.split(',').str[0]

# The 'gene_type' column holds the GTF feature type written in Step 1; keep
# gene-level rows that actually have a gene name.
mask = (df['gene_type'] == 'gene') & (df['gene_name'] != 'N.A.')
filtered_df = df[mask]

result_df = filtered_df[['gene_name', 'gene_id']]

# Two-column mapping (gene_name, gene_id), tab-separated, no header or index.
result_df.to_csv('gene_mapping.txt', sep='\t', index=False, header=False)

## Step 2: Map public database gene IDs and other annotation information.

In [None]:
import os
import subprocess

# Query BED file intersected against every database BED file.
crispr_bed = './test/output/crispr_all.bed'

# (database BED file, intersect output file) pairs.
intersect_jobs = [
    ('./test/output/lncbook.bed', './test/output/lctemp.bed'),
    ('./test/output/noncode.bed', './test/output/nctemp.bed'),
    ('./test/output/gencodev47.bed', './test/output/gctemp.bed'),
    ('./test/output/ncbi.bed', './test/output/nbtemp.bed'),
]

for database_bed, temp_bed in intersect_jobs:
    with open(temp_bed, 'w') as temp_out:
        # -wo: report the A and B records plus the overlap length,
        # -s: same strand only, -f 1: the A feature must be fully covered.
        # check=True raises CalledProcessError on a non-zero exit status
        # (and a missing bedtools binary raises FileNotFoundError) instead of
        # silently leaving an empty or partial output file behind.
        subprocess.run(
            ['bedtools', 'intersect',
             '-a', crispr_bed,
             '-b', database_bed,
             '-wo', '-s', '-f', '1'],
            stdout=temp_out,
            check=True,
        )

In [None]:
# Process the temporary BED file to generate a TSV file; remove duplicates by cumulative overlapping length
from collections import defaultdict
import os
def bed2tsv(input_temp_bed_file, result_file):
    """
    Convert a temporary intersect BED file to a de-duplicated, sorted TSV.

    Only lines with at least 16 tab-separated fields are used. From each line
    the function reads: field 4 (``<key>-<transcript_id>``, split on the last
    ``-``), fields 8 and 9 (start/end of the matched database record),
    field 10, field 13 (gene ID), field 14 (feature type), field 15 and
    field 16 (overlap length).
    NOTE(review): this presumably corresponds to a 6-column query record
    followed by the 9-column database BED record and the ``-wo`` overlap
    column - confirm against the query BED file.

    For every key and gene ID, records are de-duplicated on the database
    (start, end) pair; the first record seen wins, and the overlap length of
    each kept ``exon`` record is accumulated. When a key matches several gene
    IDs, only the gene with the largest accumulated exon overlap is kept;
    ties are broken by the larger ratio of that overlap to the span covered
    by the gene's matched records, then by first appearance.

    Each output line is ``key, transcript_id, field 10, gene ID, field 15,
    feature type``. Lines are de-duplicated and written in Python string
    sort order; no intermediate ``temp.tsv`` file or shell pipeline is used,
    so result paths containing spaces work and write errors are raised.

    Args:
        input_temp_bed_file: Path of the intersect output to read.
        result_file: Path of the TSV file to write (overwritten if present).
    """
    # primary_groups[key][gene_id][(start, end)] -> output record
    primary_groups = defaultdict(lambda: defaultdict(dict))
    # sums[key][gene_id] -> accumulated exon overlap length
    sums = defaultdict(lambda: defaultdict(int))
    # gene_regions[key][gene_id] -> [min start, max end] over matched records
    gene_regions = defaultdict(lambda: defaultdict(lambda: [float('inf'), -float('inf')]))

    with open(input_temp_bed_file, 'r') as f:
        print("Processing file:", input_temp_bed_file)
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 16:
                continue

            name_parts = fields[3].rsplit('-', 1)
            key = name_parts[0]
            gene_id = fields[12]
            feature = fields[13]
            length = int(fields[15])
            transcript_id = name_parts[1]

            deduplicate_key = (fields[7], fields[8])
            mapped_start = int(fields[7])
            mapped_end = int(fields[8])

            # The span is widened by every line, including duplicates.
            region = gene_regions[key][gene_id]
            region[0] = min(region[0], mapped_start)
            region[1] = max(region[1], mapped_end)

            records = primary_groups[key][gene_id]
            if deduplicate_key not in records:
                records[deduplicate_key] = (
                    transcript_id, fields[9], gene_id, fields[14], feature
                )
                # Count each distinct exon interval only once.
                if feature == 'exon':
                    sums[key][gene_id] += length

    def select_gene(key, gene_groups):
        """Return the gene ID of ``key`` with the best (overlap, ratio) rank."""
        def rank(gene_id):
            exon_overlap = sums[key][gene_id]
            region_start, region_end = gene_regions[key][gene_id]
            region_length = region_end - region_start
            coverage_ratio = exon_overlap / region_length if region_length > 0 else 0
            return (exon_overlap, coverage_ratio)
        # max() returns the first gene on a full tie (insertion order).
        return max(gene_groups, key=rank)

    print(len(primary_groups), "primary groups found")
    output_lines = set()
    for key, gene_groups in primary_groups.items():
        if len(gene_groups) > 1:
            chosen_gene = select_gene(key, gene_groups)
        else:
            chosen_gene = next(iter(gene_groups))
        for record in gene_groups[chosen_gene].values():
            output_lines.add(key + '\t' + '\t'.join(record) + '\n')

    with open(result_file, 'w') as out:
        out.writelines(sorted(output_lines))
    print(f"Results written to {result_file}")
if __name__ == "__main__":
    # (intersect output, mapping TSV) pairs, processed in the order listed.
    for temp_bed_path, map_tsv_path in (
        ("./test/output/gctemp.bed", "./test/map/gencode_map.tsv"),
        ("./test/output/lctemp.bed", "./test/map/lncbook_map.tsv"),
        ("./test/output/nctemp.bed", "./test/map/noncode_map.tsv"),
    ):
        bed2tsv(temp_bed_path, map_tsv_path)


In [None]:
# For the NCBI BED file
!python3 pro1.py ./test/output/nbtemp.bed ./test/map/ncbi_map.tsv

## Delete temporary files

In [None]:
!rm -f temp.tsv 
!rm -f ./test/output/lctemp.bed ./test/output/nctemp.bed ./test/output/gctemp.bed ./test/output/nbtemp.bed

### The following cells are optional verification code and do not need to be executed.

In [None]:
# Temporary verification code
from collections import defaultdict


def bed2tsv_temp(input_temp_bed_file, output_file):
    """
    Convert a temporary BED file to a TSV file with specific fields.

    Only lines with at least 15 tab-separated fields are used. Field 4 is
    split on its last ``-`` into a group key and a transcript ID; each output
    row is ``group key, transcript ID, field 10, field 13, field 15``.
    Identical rows within a group are written once. Groups are written in
    order of first appearance and the rows of a group in sorted order, so the
    output is the same on every run.

    Args:
        input_temp_bed_file: Path of the intersect output to read.
        output_file: Path of the TSV file to write (overwritten if present).
    """
    groups = defaultdict(set)  # a set drops duplicate rows automatically

    with open(input_temp_bed_file, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 15:
                name_parts = fields[3].rsplit('-', 1)
                key = name_parts[0]
                transcript_id = name_parts[1]
                value = (transcript_id, fields[9], fields[12], fields[14])

                groups[key].add(value)

    with open(output_file, 'w') as out:
        for group_id, values in groups.items():
            # Set iteration order of strings varies between interpreter runs;
            # sort so repeated runs produce identical files.
            for value in sorted(values):
                out.write(f'{group_id}\t{value[0]}\t{value[1]}\t{value[2]}\t{value[3]}\n')

if __name__ == "__main__":
    # (intersect output, verification TSV) pairs, processed in the order listed.
    for temp_bed_path, check_tsv_path in (
        ("./test/output/gctemp.bed", "./test/map/gcmap.tsv"),
        ("./test/output/lctemp.bed", "./test/map/lcmap.tsv"),
        ("./test/output/nctemp.bed", "./test/map/ncmap.tsv"),
    ):
        bed2tsv_temp(temp_bed_path, check_tsv_path)

In [None]:
# In the deduplication process, if multiple gene IDs are matched, manually select the one with the highest overlap ratio.
from collections import defaultdict

# Verification TSV to scan; rows need at least 5 tab-separated columns.
input_temp_tsv_file = './test/map/gcmap.tsv'
# groups[group key] -> set of gene IDs (column 4) seen for that key
groups = defaultdict(set)


with open(input_temp_tsv_file, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) >= 5:
            group_key = fields[0]
            gene_id = fields[3]
            groups[group_key].add(gene_id)


# Opened in append mode, so existing content of tocheck_id.txt is kept.
with open('tocheck_id.txt', 'a') as out:
    for group_key, gene_ids in groups.items():
        if len(gene_ids) > 1:
            # Sort the IDs: set iteration order of strings varies between
            # interpreter runs, which would make the report differ per run.
            out.write(f"{group_key}\t{','.join(sorted(gene_ids))}\n")