## Step 1: Convert GTF files to custom BED files.

In [21]:
import re

def extract_attribute(attr_str, key):
    """Return the quoted value of attribute ``key`` from a GTF attribute string.

    Parameters
    ----------
    attr_str : str
        The attribute column of a GTF line, e.g. ``gene_id "G1"; gene_name "X";``.
    key : str
        Exact attribute name to look up.

    Returns
    -------
    str
        The first non-empty quoted value found for ``key``, or ``"N.A."`` when
        the attribute is absent (or its value is empty).
    """
    # The key is escaped so it is matched literally, and the negative
    # look-behind stops it from matching the tail of a longer attribute name
    # (e.g. looking up gene_id must not pick up ref_gene_id).
    match = re.search(rf'(?<!\w){re.escape(key)} "([^"]+)"', attr_str)
    return match.group(1) if match else "N.A."
def gtf_to_bed(gtf_path, bed_path):
    """Convert a GTF annotation file into a 9-column tab-separated BED-like file.

    Output columns: chrom, start (GTF start minus one), end, ID, score ('.'),
    strand, gene_id, feature type, gene_name.  The ID column holds the gene_id
    for 'gene' features and the transcript_id for every other feature type.
    Attributes missing from a line are written as "N.A.".

    Lines starting with '#' and lines with fewer than 9 tab-separated fields
    are skipped.

    Parameters
    ----------
    gtf_path : str
        Path of the GTF file to read.
    bed_path : str
        Path of the BED file to write (overwritten if it exists).
    """
    with open(gtf_path, 'r') as f_in, open(bed_path, 'w') as f_out:
        for line in f_in:
            if line.startswith('#'):
                continue

            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue

            # annotations
            chr_name = fields[0]
            # GTF coordinates are 1-based; BED starts are 0-based, hence the -1.
            start = str(int(fields[3]) - 1)
            end = fields[4]
            score = '.'
            strand = fields[6]
            feature_type = fields[2]
            attributes = fields[8]

            gene_id = extract_attribute(attributes, 'gene_id')
            transcript_id = extract_attribute(attributes, 'transcript_id')
            gene_name = extract_attribute(attributes, 'gene_name')

            # Gene rows are identified by gene_id, all other rows by transcript_id.
            id_field = gene_id if feature_type == 'gene' else transcript_id

            bed_line = f"{chr_name}\t{start}\t{end}\t{id_field}\t{score}\t{strand}\t{gene_id}\t{feature_type}\t{gene_name}\n"
            f_out.write(bed_line)


input_gtf = 'gencode.v47.long_noncoding_RNAs.gtf' #gtf file path
output_file = 'gencodev47.bed'
# NOTE(review): later cells also read lncbook.bed and noncode.bed in this same
# layout; presumably they come from calling gtf_to_bed() on the matching GTF
# files - confirm and add those calls here if so.
gtf_to_bed(input_gtf, output_file)

### Step 1.1: Generate files for subsequent steps

In [1]:
import pandas as pd

# Nine-column layout of the annotation BED file.
bed_columns = ['chr', 'start', 'end', 'trans_id', 'score', 'strand', 'gene_id', 'gene_type', 'gene_name']
df = pd.read_csv('lncbook.bed', sep='\t', header=None, names=bed_columns)

# When gene_name is a comma-separated list, keep only its first entry.
df['gene_name'] = df['gene_name'].str.split(',').str.get(0)

# Keep gene-level rows that carry a real gene name ("N.A." marks a missing one).
is_gene_row = df['gene_type'].eq('gene')
has_gene_name = df['gene_name'].ne('N.A.')
mask = is_gene_row & has_gene_name
filtered_df = df.loc[mask]

# Two-column mapping: gene_name -> gene_id.
result_df = filtered_df.loc[:, ['gene_name', 'gene_id']]

# Written without header or index, tab-separated.
result_df.to_csv('gene_mapping.txt', sep='\t', index=False, header=False)

## Step 2: Map public database gene IDs and other annotation information.

In [None]:
import os
import subprocess

# Intersect the CRISPR target intervals with each annotation BED file.
# bedtools flags (per the bedtools intersect documentation):
#   -wo       write the A record, the B record and the overlap length in bp
#   -s        same-strand overlaps only
#   -f 1 -r   the overlap must cover 100% of A and, reciprocally, 100% of B
annotation_outputs = [
    ('lncbook.bed', 'lctemp.bed'),
    ('noncode.bed', 'nctemp.bed'),
    ('gencodev47.bed', 'gctemp.bed'),
]

for annotation_bed, temp_bed in annotation_outputs:
    with open(temp_bed, 'w') as out_handle:
        # check=True raises CalledProcessError when bedtools exits non-zero,
        # instead of silently leaving an empty or partial temp file behind.
        subprocess.run(
            ['bedtools', 'intersect',
             '-a', 'crispr_all.bed', '-b', annotation_bed,
             '-wo', '-s', '-r', '-f', '1'],
            stdout=out_handle,
            check=True,
        )

In [18]:
# Filter the results processed by bedtools: group the intersect hits by the
# name prefix of the A record and drop duplicate hits within each group.
from collections import defaultdict

input_temp_bed_file = 'gctemp.bed'
output_file = 'gcmap.tsv'

# group key -> list of unique value tuples, in first-seen order
groups = defaultdict(list)
# (key, value) pairs already stored; gives an O(1) duplicate check instead of
# scanning the group's list for every input line.
seen = set()

with open(input_temp_bed_file, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Lines with fewer than 15 columns are ignored.
        if len(fields) >= 15:
            # fields[3] has the form "<key>-<suffix>"; split on the LAST '-'.
            name_parts = fields[3].rsplit('-', 1)
            key = name_parts[0]
            transcript_id = name_parts[1]
            # NOTE(review): assumes a 6-column A file, so that fields[9],
            # fields[12] and fields[14] are the annotation's ID, gene_id and
            # gene_name columns - confirm against crispr_all.bed.
            value = (transcript_id, fields[3], fields[9], fields[12], fields[14])

            if (key, value) not in seen:
                seen.add((key, value))
                groups[key].append(value)

# One output row per unique hit: key followed by the five value columns.
with open(output_file, 'w') as out:
    for group_id, values in groups.items():
        for value in values:
            out.write(f'{group_id}\t{value[0]}\t{value[1]}\t{value[2]}\t{value[3]}\t{value[4]}\n')

In [20]:
# During deduplication, a group key may be matched to several gene IDs.
# This cell lists those keys so the gene ID with the highest overlap ratio
# can be selected manually.
from collections import defaultdict

input_temp_tsv_file = 'gcmap.tsv'
# group key (column 1) -> set of distinct gene IDs (column 5)
groups = defaultdict(set)

with open(input_temp_tsv_file, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) >= 5:
            group_key = fields[0]
            gene_id = fields[4]
            groups[group_key].add(gene_id)

# The file is opened in append mode, so re-running this cell adds to it.
# NOTE(review): presumably intentional so the cell can be run once per
# database - confirm; running it twice on the same input duplicates rows.
with open('tocheck_id.txt', 'a') as out:
    for group_key, gene_ids in groups.items():
        if len(gene_ids) > 1:
            # sorted(): set iteration order for strings changes between kernel
            # sessions, which would make the output file non-reproducible.
            out.write(f"{group_key}\t{','.join(sorted(gene_ids))}\n")

In [1]:
from collections import defaultdict
import os

# For every group key keep only the hits of the gene ID with the largest total
# overlap, then write the de-duplicated result.

# key -> gene_id -> list of record tuples
primary_groups = defaultdict(lambda: defaultdict(list))
# key -> gene_id -> sum of the overlap column (fields[15])
sums = defaultdict(lambda: defaultdict(int))

input_temp_bed_file = 'nctemp.bed'

with open(input_temp_bed_file, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Lines with fewer than 16 columns are ignored.
        if len(fields) >= 16:
            # fields[3] has the form "<key>-<suffix>"; split on the LAST '-'.
            name_parts = fields[3].rsplit('-', 1)
            key = name_parts[0]
            gene_id = fields[12]
            length = int(fields[15])
            transcript_id = name_parts[1]
            value = (transcript_id, fields[3], fields[9], fields[12], fields[14], fields[13])

            primary_groups[key][gene_id].append(value)
            sums[key][gene_id] += length

# Every key has at least one gene group, so max() also covers the single-gene
# case; on ties the first gene ID encountered in the input wins.
output_lines = []
for key, gene_groups in primary_groups.items():
    max_gene_id = max(gene_groups.keys(), key=lambda x: sums[key][x])
    for record in gene_groups[max_gene_id]:
        output_lines.append(key + '\t' + '\t'.join(record) + '\n')

with open('processed_results.tsv', 'w') as out:
    out.writelines(output_lines)

# Sort and de-duplicate in Python instead of shelling out to `sort | uniq`:
# same set of lines, but no ignored exit status and no dependence on the
# shell's locale for the ordering (lines are ordered by code point here).
with open('noncode_map.tsv', 'w') as out:
    out.writelines(sorted(set(output_lines)))


0

In [None]:
# Read and extract unique genes from noncode_map.tsv/lncbook_map.tsv/gencode_map.tsv
with open('noncode_map.tsv', 'r') as f:
    # First column of the mapping file = genes that are already mapped.
    exclude_genes = {line.split('\t')[0] for line in f}

# Keep the crispr_all.bed entries whose gene is NOT in the mapping file.
filtered_lines = []
with open('crispr_all.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Blank or header lines have no name column; skip them rather than
        # failing with IndexError on fields[3].
        if len(fields) < 4:
            continue
        # Extract gene name (part before last "-")
        gene_name = fields[3].rsplit('-', 1)[0]

        # Keep line only if gene not in exclude list
        if gene_name not in exclude_genes:
            filtered_lines.append(line)

# Write filtered results (lines are written back unchanged)
with open('filtered_non_crispr.bed', 'w') as f:
    f.writelines(filtered_lines)

# Print statistics
print(f"Total exclude genes: {len(exclude_genes)}")
print(f"Output lines: {len(filtered_lines)}")

In [30]:
# Searching for gene entries where one exon is completely contained within another exon.
# bedtools intersect flags used below (per the bedtools documentation):
#   -wo   write the A record, the overlapping B record and the overlap length in bp
#   -s    report same-strand overlaps only
#   -f 1  the overlap must cover 100% of the A record
# Unlike the Step 2 commands, -r is not passed, so the B record may be longer than A.
# NOTE(review): filtered_lnc_crispr.bed and filtered_gen_crispr.bed are not written by
# any cell shown here (only filtered_non_crispr.bed is) - presumably produced by
# re-running the preceding cell for each database; confirm.
!bedtools intersect -a filtered_lnc_crispr.bed -b lncbook.bed -wo -s -f 1 > res_lctemp.bed
!bedtools intersect -a filtered_non_crispr.bed -b noncode.bed -wo -s -f 1 > res_nctemp.bed
!bedtools intersect -a filtered_gen_crispr.bed -b gencodev47.bed -wo -s -f 1 > res_gctemp.bed

In [36]:
import pandas as pd

# For each gene, pick the mapped ID whose exon hits have the largest summed
# overlap. This cell is also meant to be run on res_nctemp.bed and res_lctemp.bed.
gene_hsalng_pairs = []
with open('res_gctemp.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Require all 16 columns before indexing (blank or truncated lines
        # would otherwise raise IndexError), then keep exon rows only.
        if len(fields) >= 16 and fields[13] == 'exon':
            # Gene = part of the A record name before its last '-'.
            gene = fields[3].rsplit('-', 1)[0]
            map_id = fields[12]
            length = int(fields[15])
            gene_name = fields[14]
            gene_hsalng_pairs.append([gene, map_id, gene_name, length])

# Convert to DataFrame for grouping operations
df = pd.DataFrame(gene_hsalng_pairs, columns=['gene', 'map_id', 'gene_name','length'])

# Total overlap per (gene, map_id, gene_name) combination
sums = df.groupby(['gene', 'map_id', 'gene_name'])['length'].sum().reset_index()
# Sort by total overlap (largest first) and keep the top row of every gene.
max_groups = (sums.sort_values('length', ascending=False)
             .groupby('gene')
             .first()
             .reset_index()
             [['gene', 'map_id', 'gene_name']])

# Save results. mode='a' appends, so results of successive runs accumulate in
# res_map.tsv; re-running on the same input duplicates its rows.
max_groups.to_csv('res_map.tsv', sep='\t', header=False, index=False,mode='a')

print(f"Processed pairs: {len(max_groups)}")

Processed pairs: 132
