## Step1: Coordinate Matching

In [None]:
import re

# Inputs for the CRISPRi run: the list of target gene IDs and the lncRNA GTF.
# To process the crispr_splice or crispr_delete targets instead, point these
# paths at the corresponding files.
id_list = './test/crispri_id.txt'
gtf_file = './test/lncrna.gtf'
# Intermediate BED output; change the path as needed.
custom_bed_file = './test/output/crispri_temp.bed'

# Load the target IDs (one per line, surrounding whitespace removed) into a
# set so the per-record membership test below is cheap.
with open(id_list, 'r') as id_handle:
    crispr_ids = {entry.strip() for entry in id_handle}

#  Process the GTF file.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute ``key`` from a GTF attribute string.

    Args:
        attr_str: Attribute column text, e.g. 'gene_id "G1"; transcript_id "T1";'.
        key: Attribute name to look up. It is matched literally and as a whole
            name, so 'gene_id' does not match inside 'ref_gene_id'.

    Returns:
        The text between the double quotes that follow ``key`` (first
        occurrence), or None when the attribute is absent or its value is empty.
    """
    # re.escape keeps regex metacharacters in the key literal; the negative
    # lookbehind rejects matches that start in the middle of a longer name.
    pattern = rf'(?<!\w){re.escape(key)} "([^"]+)"'
    match = re.search(pattern, attr_str)
    return match.group(1) if match else None

# Convert the selected GTF records to a 6-column BED file.
# Only records whose gene_id is listed in `crispr_ids` are written. The BED
# name column is "<gene_id>-<feature id>", where the feature id is the
# transcript_id for transcript rows, the exon_id for exon rows and the gene_id
# for every other feature type.
with open(gtf_file, 'r') as f_in, \
     open(custom_bed_file, 'w') as f_out:
    for line in f_in:
        # Skip header/comment lines.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Ignore lines that do not have the 9 tab-separated columns read below.
        if len(fields) < 9:
            continue

        chr_name = fields[0]
        start = str(int(fields[3]) - 1)  # BED start is 0-based, so shift the GTF start by one
        end = fields[4]
        feature_type = fields[2]
        strand = fields[6]
        attributes = fields[8]

        # Keep only records that belong to a requested gene.
        gene_id = extract_attribute(attributes, 'gene_id')
        if gene_id not in crispr_ids:
            continue

        if feature_type == 'transcript':
            id_field = extract_attribute(attributes, 'transcript_id')
        elif feature_type == 'exon':
            id_field = extract_attribute(attributes, 'exon_id')
        else:  # gene rows, and any other feature type, fall back to the gene_id
            id_field = gene_id

        # gene_id was already extracted for the filter above; reuse it rather
        # than running the regex a second time for every written row.
        bed_line = f"{chr_name}\t{start}\t{end}\t{gene_id}-{id_field}\t0\t{strand}\n"
        f_out.write(bed_line)

In [None]:
import re

# Inputs for the CRISPR-deletion run: the list of target gene names and the
# GENCODE v19 lncRNA GTF.
id_list = './test/crispr_delete_id.txt'
gtf_file = './test/gencode.v19.long_noncoding_RNAs.gtf'
# Intermediate BED output; change the path as needed.
custom_bed_file = './test/output/crispr_delete_temp.bed'

# Load the targets (one per line, surrounding whitespace removed) into a set.
with open(id_list, 'r') as id_handle:
    crispr_ids = set(map(str.strip, id_handle))

#  Process the GTF file.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute ``key`` from a GTF attribute string.

    Args:
        attr_str: Attribute column text, e.g. 'gene_id "G1"; gene_name "N1";'.
        key: Attribute name to look up. It is matched literally and as a whole
            name, so 'gene_name' does not match inside 'ref_gene_name'.

    Returns:
        The text between the double quotes that follow ``key`` (first
        occurrence), or None when the attribute is absent or its value is empty.
    """
    # re.escape keeps regex metacharacters in the key literal; the negative
    # lookbehind rejects matches that start in the middle of a longer name.
    pattern = rf'(?<!\w){re.escape(key)} "([^"]+)"'
    match = re.search(pattern, attr_str)
    return match.group(1) if match else None

# Convert the selected GTF records to a 6-column BED file.
# Only records whose gene_name is listed in `crispr_ids` are written. The BED
# name column is "<gene_name>-<feature id>", where the feature id is the
# transcript_id for transcript rows, the exon_id for exon rows and the gene_id
# for every other feature type.
with open(gtf_file, 'r') as f_in, \
     open(custom_bed_file, 'w') as f_out:
    for line in f_in:
        # Skip header/comment lines.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Ignore lines that do not have the 9 tab-separated columns read below.
        if len(fields) < 9:
            continue

        chr_name = fields[0]
        start = str(int(fields[3]) - 1)  # BED start is 0-based, so shift the GTF start by one
        end = fields[4]
        feature_type = fields[2]
        strand = fields[6]
        attributes = fields[8]

        # Keep only records that belong to a requested gene name.
        gene_name = extract_attribute(attributes, 'gene_name')
        if gene_name not in crispr_ids:
            continue

        if feature_type == 'transcript':
            id_field = extract_attribute(attributes, 'transcript_id')
        elif feature_type == 'exon':
            # When generating sequences, switch this to `transcript_id`.
            id_field = extract_attribute(attributes, 'exon_id')
        else:  # gene rows, and any other feature type, use the gene_id
            id_field = extract_attribute(attributes, 'gene_id')

        # The original also extracted gene_id here without ever using it; that
        # dead per-row regex search has been dropped.
        bed_line = f"{chr_name}\t{start}\t{end}\t{gene_name}-{id_field}\t0\t{strand}\n"
        f_out.write(bed_line)

In [None]:
import re

# Inputs for the CRISPR-splice run: the list of target gene names and the
# GENCODE v20 lncRNA GTF.
id_list = './test/crispr_splice_id.txt'
gtf_file = './test/gencode.v20.long_noncoding_RNAs.gtf'
# Intermediate BED output; change the path as needed.
custom_bed_file = './test/output/crispr_splice_temp.bed'

# Load the targets (one per line, surrounding whitespace removed) into a set.
with open(id_list, 'r') as id_handle:
    crispr_ids = {entry.strip() for entry in id_handle}

#  Process the GTF file.
def extract_attribute(attr_str, key):
    """Return the quoted value of attribute ``key`` from a GTF attribute string.

    Args:
        attr_str: Attribute column text, e.g. 'gene_id "G1"; gene_name "N1";'.
        key: Attribute name to look up. It is matched literally and as a whole
            name, so 'gene_id' does not match inside 'ref_gene_id'.

    Returns:
        The text between the double quotes that follow ``key`` (first
        occurrence), or None when the attribute is absent or its value is empty.
    """
    # re.escape keeps regex metacharacters in the key literal; the negative
    # lookbehind rejects matches that start in the middle of a longer name.
    pattern = rf'(?<!\w){re.escape(key)} "([^"]+)"'
    match = re.search(pattern, attr_str)
    return match.group(1) if match else None

# Write a 6-column BED record for every GTF row whose gene_name is a target.
# The name column is "<gene_name>-<feature id>": transcript_id for transcript
# rows, exon_id for exon rows, gene_id for everything else.
# When generating sequences, map 'exon' to `transcript_id` instead.
id_attr_for_feature = {'transcript': 'transcript_id', 'exon': 'exon_id'}

with open(gtf_file, 'r') as f_in, open(custom_bed_file, 'w') as f_out:
    for line in f_in:
        # Header/comment lines carry no records.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Rows with fewer than the 9 columns read below are ignored.
        if len(fields) < 9:
            continue

        chr_name, feature_type, end, strand, attributes = (
            fields[0], fields[2], fields[4], fields[6], fields[8]
        )
        start = str(int(fields[3]) - 1)  # BED 0-based

        gene_name = extract_attribute(attributes, 'gene_name')
        if gene_name not in crispr_ids:
            continue

        gene_id = extract_attribute(attributes, 'gene_id')
        # Skip this specific gene_name / gene_id pair; it is not processed.
        if gene_name == 'LINC00869' and gene_id == 'ENSG00000277147.1':
            continue

        id_field = extract_attribute(
            attributes, id_attr_for_feature.get(feature_type, 'gene_id')
        )
        f_out.write(f"{chr_name}\t{start}\t{end}\t{gene_name}-{id_field}\t0\t{strand}\n")

In [None]:
# Run the external script process_casrx.py to process the raw CRISPR-CasRx
# file and convert it to a BED file.
# NOTE(review): presumably this is what writes ./test/output/crispr_casrx38.bed,
# which the merge step reads -- confirm against the script.
!python3 process_casrx.py

## Step2: Convert hg19 to hg38

In [None]:
import os

# Lift the hg19-based BED files from Step 1 over to hg38, then filter the
# lifted CRISPRi records.
import subprocess

liftover_jobs = [
    # (input BED, lifted output BED, records that could not be mapped)
    ('./test/output/crispri_temp.bed',
     './test/output/crispri38_temp.bed',
     './test/output/unmap_crispri.bed'),
    ('./test/output/crispr_delete_temp.bed',
     './test/output/crispr_delete38.bed',
     './test/output/unmap_crispr_delete.bed'),
]
for bed_in, bed_out, bed_unmapped in liftover_jobs:
    # check=True raises CalledProcessError on a non-zero exit status, so a
    # failed liftOver stops the notebook instead of silently leaving a missing
    # or partial file for the merge step (os.system ignored the status).
    subprocess.run(
        ['../liftOver', bed_in, '../hg19ToHg38.over.chain.gz', bed_out, bed_unmapped],
        check=True,
    )

# Drop every lifted CRISPRi record whose line contains one of these
# identifiers; same selection as `grep -v -E "LH00477|LH02126|LH14878"`,
# done in Python so a missing input raises instead of yielding an empty file.
excluded_tags = ('LH00477', 'LH02126', 'LH14878')
with open('./test/output/crispri38_temp.bed', 'r') as lifted, \
     open('./test/output/crispri38.bed', 'w') as filtered:
    for record in lifted:
        if not any(tag in record for tag in excluded_tags):
            filtered.write(record)


## Step3: Merge all bed files


In [None]:
# Concatenate the per-method BED files into ./test/output/crispr_all.bed.
# crispr_delete38.bed and crispri38.bed are the hg19 -> hg38 outputs of Step 2.
# crispr_splice_temp.bed is merged exactly as written in Step 1, without liftOver.
# NOTE(review): that assumes the gencode v20 annotation is already on hg38 -- confirm.
# NOTE(review): crispr_casrx38.bed is presumably produced by process_casrx.py -- confirm.
!cat ./test/output/crispr_splice_temp.bed ./test/output/crispr_delete38.bed ./test/output/crispri38.bed ./test/output/crispr_casrx38.bed > ./test/output/crispr_all.bed