## Step1: Coordinate Matching

In [None]:
import re

# Input locations: the list of target IDs and the GTF annotation to scan.
id_list = './test/crispri_id.txt'
gtf_file = './test/lncrna.gtf'
# Destination of the intermediate BED file; adjust as needed.
custom_bed_file = './test/output/crispri_temp.bed'

# Load the IDs (one per line, surrounding whitespace stripped) into a set.
crispr_ids = set()
with open(id_list) as id_handle:
    for raw_id in id_handle:
        crispr_ids.add(raw_id.strip())

# Helper for reading the GTF attribute column.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute *key* from a GTF attribute string.

    The key is matched literally and must not be preceded by a word
    character, so asking for ``gene_id`` does not pick up the value of a
    longer key such as ``ref_gene_id``.

    Args:
        attr_str: Attribute text of one GTF record, e.g.
            ``gene_id "G1"; transcript_id "T1";``.
        key: Name of the attribute to look up.

    Returns:
        The first non-empty double-quoted value that follows ``key`` and a
        single space, or ``None`` when no such attribute is present.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    match = re.search(rf'(?<!\w){re.escape(key)} "([^"]+)"', attr_str)
    return match.group(1) if match else None

# Convert the selected GTF records to six-column BED lines.

# Attribute that supplies the record's own ID for each feature type; any
# other feature type (e.g. gene) falls back to gene_id.
feature_id_keys = {'transcript': 'transcript_id', 'exon': 'exon_id'}

with open(gtf_file, 'r') as f_in, \
     open(custom_bed_file, 'w') as f_out:
    for line in f_in:
        # Skip comment/header lines.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Records with fewer than nine tab-separated columns are ignored.
        if len(fields) < 9:
            continue

        attributes = fields[8]

        # Keep only records whose gene_id is in the requested ID set. The
        # filter runs before any coordinate parsing so discarded rows cost
        # nothing more than this lookup.
        gene_id = extract_attribute(attributes, 'gene_id')
        if gene_id not in crispr_ids:
            continue

        chr_name = fields[0]
        feature_type = fields[2]
        start = str(int(fields[3]) - 1)  # BED 0-based
        end = fields[4]
        strand = fields[6]

        # NOTE(review): id_field is None (and is written as the text "None")
        # when the record lacks the expected attribute -- confirm whether
        # such records should be skipped instead.
        id_field = extract_attribute(
            attributes, feature_id_keys.get(feature_type, 'gene_id'))

        # Name column is "<gene_id>-<feature id>"; the score is fixed at 0.
        bed_line = f"{chr_name}\t{start}\t{end}\t{gene_id}-{id_field}\t0\t{strand}\n"
        f_out.write(bed_line)

In [None]:
import re

# Input locations: the list of target gene names and the GTF annotation.
id_list = './test/crispr_delete_id.txt'
gtf_file = './test/gencode.v19.long_noncoding_RNAs.gtf'
# Destination of the intermediate BED file; adjust as needed.
custom_bed_file = './test/output/crispr_delete_temp.bed'

# Load the entries (one per line, surrounding whitespace stripped) into a set.
crispr_ids = set()
with open(id_list) as id_handle:
    for raw_id in id_handle:
        crispr_ids.add(raw_id.strip())

# Helper for reading the GTF attribute column.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute *key* from a GTF attribute string.

    The key is matched literally and must not be preceded by a word
    character, so asking for ``gene_id`` does not pick up the value of a
    longer key such as ``ref_gene_id``.

    Args:
        attr_str: Attribute text of one GTF record, e.g.
            ``gene_id "G1"; gene_name "N1";``.
        key: Name of the attribute to look up.

    Returns:
        The first non-empty double-quoted value that follows ``key`` and a
        single space, or ``None`` when no such attribute is present.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    match = re.search(rf'(?<!\w){re.escape(key)} "([^"]+)"', attr_str)
    return match.group(1) if match else None

# Convert the selected GTF records to six-column BED lines.

# Attribute that supplies the record's own ID for each feature type; any
# other feature type (e.g. gene) falls back to gene_id.
feature_id_keys = {'transcript': 'transcript_id', 'exon': 'exon_id'}

with open(gtf_file, 'r') as f_in, \
     open(custom_bed_file, 'w') as f_out:
    for line in f_in:
        # Skip comment/header lines.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Records with fewer than nine tab-separated columns are ignored.
        if len(fields) < 9:
            continue

        attributes = fields[8]

        # Keep only records whose gene_name is in the requested set. The
        # filter runs before any coordinate parsing so discarded rows cost
        # nothing more than this lookup.
        gene_name = extract_attribute(attributes, 'gene_name')
        if gene_name not in crispr_ids:
            continue

        chr_name = fields[0]
        feature_type = fields[2]
        start = str(int(fields[3]) - 1)  # BED 0-based
        end = fields[4]
        strand = fields[6]

        # NOTE(review): id_field is None (and is written as the text "None")
        # when the record lacks the expected attribute -- confirm whether
        # such records should be skipped instead.
        id_field = extract_attribute(
            attributes, feature_id_keys.get(feature_type, 'gene_id'))

        # Name column is "<gene_name>-<feature id>"; the score is fixed at 0.
        bed_line = f"{chr_name}\t{start}\t{end}\t{gene_name}-{id_field}\t0\t{strand}\n"
        f_out.write(bed_line)

In [None]:
import re

# Input locations: the list of target gene names and the GTF annotation.
# Change these paths when switching between the crispr_splice and
# crispr_delete runs.
id_list = './test/crispr_splice_id.txt'
gtf_file = './test/gencode.v20.long_noncoding_RNAs.gtf'
# Destination of the intermediate BED file; adjust as needed.
custom_bed_file = './test/output/crispr_splice_temp.bed'

# Load the entries (one per line, surrounding whitespace stripped) into a set.
crispr_ids = set()
with open(id_list) as id_handle:
    for raw_id in id_handle:
        crispr_ids.add(raw_id.strip())

# Helper for reading the GTF attribute column.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute *key* from a GTF attribute string.

    The key is matched literally and must not be preceded by a word
    character, so asking for ``gene_id`` does not pick up the value of a
    longer key such as ``ref_gene_id``.

    Args:
        attr_str: Attribute text of one GTF record, e.g.
            ``gene_id "G1"; gene_name "N1";``.
        key: Name of the attribute to look up.

    Returns:
        The first non-empty double-quoted value that follows ``key`` and a
        single space, or ``None`` when no such attribute is present.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    match = re.search(rf'(?<!\w){re.escape(key)} "([^"]+)"', attr_str)
    return match.group(1) if match else None

# Write one six-column BED line per GTF record of a selected gene.
with open(gtf_file, 'r') as gtf_in, open(custom_bed_file, 'w') as bed_out:
    for record in gtf_in:
        # Comment/header lines carry no feature.
        if record.startswith('#'):
            continue

        columns = record.strip().split('\t')
        # Anything with fewer than nine tab-separated columns is ignored.
        if len(columns) < 9:
            continue

        chrom, feature, gtf_start, gtf_end, strand_sign, attrs = (
            columns[i] for i in (0, 2, 3, 4, 6, 8)
        )
        bed_start = str(int(gtf_start) - 1)  # BED starts are 0-based

        # Only genes named in the requested set are kept.
        symbol = extract_attribute(attrs, 'gene_name')
        if symbol not in crispr_ids:
            continue

        # Hard-coded exclusion: this gene name paired with this gene ID is
        # never written.
        ensembl_gene = extract_attribute(attrs, 'gene_id')
        if symbol == 'LINC00869' and ensembl_gene == 'ENSG00000277147.1':
            continue

        # Transcript and exon records are labelled with their own ID;
        # every other feature type is labelled with the gene ID.
        if feature == 'transcript':
            label_key = 'transcript_id'
        elif feature == 'exon':
            label_key = 'exon_id'
        else:
            label_key = 'gene_id'
        label = extract_attribute(attrs, label_key)

        bed_out.write(
            f"{chrom}\t{bed_start}\t{gtf_end}\t{symbol}-{label}\t0\t{strand_sign}\n"
        )

## Step2: Convert hg19 to hg38

In [None]:
import os

# Lift the intermediate BED files over to hg38, then drop excluded IDs.
# Every external command is checked so a failed step stops the workflow
# instead of leaving missing or partial files for the later steps.
import subprocess

chain_file = '../hg19ToHg38.over.chain.gz'

# liftOver arguments: <input BED> <chain file> <lifted output> <unmapped output>.
subprocess.run(
    ['../liftOver', './test/output/crispri_temp.bed', chain_file,
     './test/output/crispri38_temp.bed', './test/output/unmap_crispri.bed'],
    check=True,
)
subprocess.run(
    ['../liftOver', './test/output/crispr_delete_temp.bed', chain_file,
     './test/output/crispr_delete38.bed', './test/output/unmap_crispr_delete.bed'],
    check=True,
)

# Remove every line mentioning LH00477, LH02126 or LH14878 from the lifted
# CRISPRi file.
with open('./test/output/crispri38.bed', 'w') as filtered_out:
    grep_result = subprocess.run(
        ['grep', '-v', '-E', 'LH00477|LH02126|LH14878',
         './test/output/crispri38_temp.bed'],
        stdout=filtered_out,
    )
# grep exits with 1 when it selects no line, which is not a failure here;
# only statuses above 1 signal an error.
if grep_result.returncode > 1:
    raise subprocess.CalledProcessError(grep_result.returncode, grep_result.args)


## Step3: Merge Workflow


In [None]:
import os

# Merge workflow: record which entries coincide exactly between the screen
# BED files, and build the combined BED file of all screens.
import subprocess
from contextlib import suppress


def _exact_overlaps(a_bed, b_bed, out_path):
    """Write ``bedtools intersect -wo -s -r -f 1`` of two BED files to *out_path*.

    With these options bedtools reports pairs on the same strand whose
    overlap covers 100% of both records. Raises CalledProcessError if
    bedtools exits non-zero.
    """
    with open(out_path, 'w') as out:
        subprocess.run(
            ['bedtools', 'intersect', '-a', a_bed, '-b', b_bed,
             '-wo', '-s', '-r', '-f', '1'],
            stdout=out, check=True,
        )


def _concatenate(sources, out_path):
    """Write every line of *sources*, in order, to *out_path*."""
    with open(out_path, 'wb') as out:
        for source in sources:
            with open(source, 'rb') as src:
                for line in src:
                    # Terminate an unterminated last line so records from
                    # consecutive files never run together.
                    out.write(line if line.endswith(b'\n') else line + b'\n')


# delete vs. splice
_exact_overlaps('./test/output/crispr_delete38.bed',
                './test/output/crispr_splice_temp.bed', 'temp1.bed')
_concatenate(['./test/output/crispr_splice_temp.bed',
              './test/output/crispr_delete38.bed'], 'crispr_temp1.bed')

# (splice + delete) vs. CRISPRi
_exact_overlaps('crispr_temp1.bed', './test/output/crispri38.bed', 'temp2.bed')
_concatenate(['crispr_temp1.bed', './test/output/crispri38.bed'],
             'crispr_temp2.bed')

# (splice + delete + CRISPRi) vs. CasRx
_exact_overlaps('crispr_temp2.bed', './test/output/crispr_casrx38.bed',
                'temp3.bed')
_concatenate(['crispr_temp2.bed', './test/output/crispr_casrx38.bed'],
             './test/output/crispr_all.bed')

# Merge the three overlap tables into temp.bed, sorted and de-duplicated.
with open('temp.bed', 'w') as merged_out:
    subprocess.run(['sort', '-u', 'temp1.bed', 'temp2.bed', 'temp3.bed'],
                   stdout=merged_out, check=True)

# Delete the intermediate files. temp.bed in the working directory is kept
# because Step 4 reads it. Files that do not exist are skipped quietly.
for leftover in ('temp1.bed', 'temp2.bed', 'temp3.bed',
                 'crispr_temp1.bed', 'crispr_temp2.bed',
                 './test/output/crispri38_temp.bed',
                 './test/output/temp.bed'):
    with suppress(FileNotFoundError):
        os.remove(leftover)


## Step4: Merge lncRNA entries.
Identify entries that can be merged and entries that require further inspection.

In [None]:
from collections import defaultdict


# Group the overlap pairs in temp.bed: the key is
# "<column-4 name prefix>=<column-10 name prefix>" and the value is the list
# of column-4 ID suffixes seen for that key.
# NOTE(review): presumably columns 4 and 10 are the A-side and B-side BED
# names of a `bedtools intersect -wo` on six-column inputs -- confirm.
groups = defaultdict(list)
with open('temp.bed', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Both name columns (indexes 3 and 9) are needed; shorter lines are
        # skipped rather than raising IndexError.
        if len(fields) < 10:
            continue

        # Names have the form "<gene>-<id>"; the gene part may itself contain
        # '-' (e.g. TMEM9B-AS1), so split on the last '-' only.
        name_parts1 = fields[3].rsplit('-', 1)
        name_parts2 = fields[9].rsplit('-', 1)
        if len(name_parts1) != 2 or len(name_parts2) != 2:
            continue

        gene_name, ense_id = name_parts1  # e.g. TMEM9B-AS1, ENSE00002173967.1
        lh_id = name_parts2[0]
        mid = gene_name + "=" + lh_id
        groups[mid].append(ense_id)


# Classify each group and append it to the matching report file.
# Both files are opened in append mode, so earlier content is kept.
with open('tocheck.txt', 'a') as check_out, open('merge.txt', 'a') as merge_out:
    for pair_key, feature_ids in groups.items():
        # Every ID starts with ENSE: list each one for manual checking.
        if all(fid.startswith('ENSE') for fid in feature_ids):
            check_out.writelines(f"{pair_key}:{fid}\n" for fid in feature_ids)
            continue

        # Every ID starts with LH and contains a '.': one merge line per ID.
        if all(fid.startswith('LH') and '.' in fid for fid in feature_ids):
            merge_out.writelines(f"{pair_key}-{fid}\n" for fid in feature_ids)
            continue

        # Any other mix: a single merge line for the pair.
        merge_out.write(f"{pair_key}\n")