## Step1: Handling the BED file format for easy processing

In [24]:
import pandas as pd

def get_group_key(nameid):
    """Return the grouping key for a BED name field.

    The name is returned unchanged, so the key is the name itself.
    """
    group_key = nameid
    return group_key
# find transcript
def identify_transcript(group):
    """Return the row of ``group`` with the largest span.

    The span of a row is column 2 minus column 1. The row is looked up by
    position, so the index labels of ``group`` do not matter.
    """
    spans = group[2] - group[1]
    widest_pos = spans.argmax()
    return group.iloc[widest_pos]

def process_group(group):
    """Collapse one group of BED rows into a single transcript row.

    The row with the largest span (chosen by ``identify_transcript``) is
    treated as the transcript; every other row in the group is treated as
    one of its exons.

    Args:
        group: DataFrame of BED rows sharing a group key. Column 1 holds the
            start coordinate and column 2 the end coordinate.

    Returns:
        A one-row DataFrame holding the transcript row, with column 6 set to
        the comma-separated exon starts relative to the transcript start and
        column 7 set to the comma-separated exon lengths (exons ordered by
        start). Returns ``None`` when the group contains no exon rows.
    """
    transcript = identify_transcript(group)

    # Every row other than the transcript row itself is an exon.
    exons = group[group.index != transcript.name]
    if exons.empty:
        return None

    exons = exons.sort_values(by=[1])
    rel_starts = (exons[1] - transcript[1]).astype(str)
    lengths = (exons[2] - exons[1]).astype(str)

    result_row = transcript.copy()
    # Use .loc so 6 and 7 are always treated as labels. With plain
    # ``row[6] = ...`` some pandas versions fall back to positional
    # assignment when the label is absent from a mixed int/str index, which
    # would overwrite another field or raise IndexError.
    result_row.loc[6] = ','.join(rel_starts)
    result_row.loc[7] = ','.join(lengths)

    return pd.DataFrame([result_row])

# Input: the hg38 BED file. Output: one collapsed row per group.
bed_file = 'crispr_casrx38.bed'
output_bed_file = 'seq_casrx.bed'

# The file is read without a header row, so columns are addressed by number.
df = pd.read_csv(bed_file, sep='\t', header=None)
df['group_key'] = df[3].apply(get_group_key)

# Collapse every group; groups for which process_group returns None are dropped.
results = [
    collapsed
    for collapsed in (process_group(rows) for _, rows in df.groupby('group_key'))
    if collapsed is not None
]

# The helper column is only needed for grouping, so remove it before writing.
result = pd.concat(results).drop('group_key', axis=1)
result.to_csv(output_bed_file, sep='\t', header=False, index=False)

## Step2: Generate sequence files at the gene level.

In [None]:
!python get_seq.py

## Step3: Trim and generate sequences

In [1]:
import pandas as pd

def parse_fasta_header(header):
    """Split a colon-separated FASTA header line into named fields.

    The first character of ``header`` is dropped and the remainder is split
    on ``:``. The first four fields are returned as a dict with keys
    ``gene_id``, ``strand``, ``start`` and ``end``; ``strand`` and ``end``
    are converted to ``int`` and ``start`` is converted to ``int`` and
    decremented by one. Any further fields are ignored.
    """
    parts = header[1:].split(':')
    gene_id = parts[0]
    strand = int(parts[1])
    start = int(parts[2]) - 1
    end = int(parts[3])
    return dict(gene_id=gene_id, strand=strand, start=start, end=end)

def _parse_int_list(value):
    """Parse a comma-separated list of integers such as ``"0,120,300"``."""
    # str() first: pandas may deliver a single-element list as a bare int
    # rather than a string. Empty tokens are skipped so a trailing comma
    # ("0,120,") is accepted.
    return [int(token) for token in str(value).split(',') if token.strip()]


def get_transcript_seq(gene_seq, rel_starts, lengths, strand):
    """Obtain a transcript sequence by joining exon slices of ``gene_seq``.

    Args:
        gene_seq: Sequence string the exons are sliced from.
        rel_starts: Comma-separated exon start offsets into the sequence
            (a bare int is accepted for a single exon).
        lengths: Comma-separated exon lengths, paired with ``rel_starts`` in
            order (a bare int is accepted for a single exon).
        strand: When equal to ``'-'``, ``gene_seq`` is reversed before
            slicing and the joined result is reversed again, so the offsets
            are applied to the reversed sequence. No complementing is done.

    Returns:
        The concatenated exon slices as a string.

    Raises:
        ValueError: If an entry of ``rel_starts`` or ``lengths`` is not an
            integer.
    """
    is_minus = strand == '-'
    if is_minus:
        gene_seq = gene_seq[::-1]

    starts = _parse_int_list(rel_starts)
    lens = _parse_int_list(lengths)

    transcript_seq = ''.join(
        gene_seq[start:start + length] for start, length in zip(starts, lens)
    )

    if is_minus:
        transcript_seq = transcript_seq[::-1]
    return transcript_seq

def _read_gene_sequences(fa_file):
    """Read a FASTA file into a ``{gene_id: sequence}`` dict."""
    chunks_by_gene = {}
    current_chunks = None

    with open(fa_file) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                gene_id = parse_fasta_header(line)['gene_id']
                # A repeated gene id starts over, replacing the earlier record.
                current_chunks = chunks_by_gene[gene_id] = []
            elif current_chunks is None:
                raise ValueError(
                    f"{fa_file}: sequence data found before the first '>' header"
                )
            else:
                current_chunks.append(line)

    # Join once per gene instead of growing a string line by line.
    return {gene_id: ''.join(chunks) for gene_id, chunks in chunks_by_gene.items()}


def process_files(fa_file, bed_file, out_file='lncRNA2.fa'):
    """Write one spliced transcript sequence per BED row to a FASTA file.

    Args:
        fa_file: FASTA file of gene sequences whose headers are understood
            by ``parse_fasta_header``.
        bed_file: Tab-separated file without a header row. Column 3 holds a
            name of the form ``<gene_id>-<transcript_id>`` (split at the
            last ``-``), column 5 the strand, column 6 the comma-separated
            exon starts and column 7 the comma-separated exon lengths.
        out_file: Path of the FASTA file to write. It is opened in append
            mode, so running this again adds records to an existing file.

    Rows whose gene id is not present in ``fa_file`` are skipped.

    Raises:
        ValueError: If ``fa_file`` contains sequence data before its first
            header line.
        IndexError: If a name in column 3 contains no ``-``.
    """
    gene_seqs = _read_gene_sequences(fa_file)
    df = pd.read_csv(bed_file, sep='\t', header=None)

    with open(out_file, 'a') as outf:
        for _, row in df.iterrows():
            name_parts = row[3].rsplit('-', 1)
            gene_id, transcript_id = name_parts[0], name_parts[1]

            gene_seq = gene_seqs.get(gene_id)
            if gene_seq is None:
                continue

            transcript_seq = get_transcript_seq(gene_seq, row[6], row[7], row[5])
            outf.write(f">{transcript_id}\n{transcript_seq}\n")

if __name__ == '__main__':
    # NOTE(review): the earlier comment here named seq_splice.bed as the file
    # generated in the previous step, but the path passed is seq_delete.bed.
    # Confirm which file is intended.
    process_files(
        bed_file='seq_delete.bed',
        fa_file='crispr_gene.fa',
    )