In [1]:
import ast
import re
from collections import defaultdict

import aminos

## setup io

In [5]:
# Input locations, relative to the notebook's working directory.
VCF_PATH = 'data/example.vcf.gz'
GFF_PATH = 'data/Homo_sapiens.GRCh38.110.chromosome.21.gff3.gz'
REFERENCE_FASTA_PATH = 'data/reference_sequences.fasta.gz'

# Open the three inputs through the aminos I/O helpers.
vcf = aminos.io.vcf.read_vcf(VCF_PATH)
gff = aminos.io.gff.GFF(GFF_PATH)
transcript_reference = aminos.io.fasta.init_transcript_reference(REFERENCE_FASTA_PATH)

# Transcript identifiers that drive the per-transcript loop further down.
gff_transcripts = gff.get_unique_transcripts()

## iterate through transcripts

In [6]:
# Lookup table: consequence label (first '|'-separated field of a BCSQ entry)
# -> one-character mutation code stored in an instruction.
# Labels missing from this table are skipped by process_csq.
mutation_dict = {
    # Labels carrying a leading '*'
    # (presumably the bcftools csq marker for consequences downstream of a
    # stop -- TODO confirm against the bcftools documentation).
    '*frameshift': 'R',
    '*frameshift&stop_retained': 'Q',
    '*inframe_deletion': 'C',
    '*inframe_insertion': 'J',
    '*missense': 'N',
    '*missense&inframe_altering': 'K',
    '*stop_gained': 'X',
    '*stop_gained&inframe_altering': 'A',

    # Frameshifts
    'frameshift': 'F',
    'frameshift&start_lost': 'V',
    'frameshift&stop_retained': 'B',

    # In-frame insertions / deletions
    'inframe_deletion': 'D',
    'inframe_deletion&stop_retained': 'P',
    'inframe_insertion': 'I',
    'inframe_insertion&stop_retained': 'Z',

    # Substitutions
    'missense': 'M',
    'missense&inframe_altering': 'Y',

    # Remaining labels
    'phi': 'E',
    'start_lost': '0',
    'start_lost&splice_region': 'U',
    'stop_gained': 'G',
    'stop_gained&inframe_altering': 'T',
    'stop_lost': 'L',
    'stop_lost&frameshift': 'W',
}

# take a bcftools csq string and return the transcript and instruction
# an instruction is defined as:
# (mutation_code, ref_pos, alt_pos, alt_seq, ref_seq_len)

# Amino-acid change as found in the sixth '|'-separated BCSQ field,
# e.g. "12P>12L". '*' is part of the residue class so that changes which
# involve a stop (e.g. "45Q>45*") are parsed instead of silently dropped.
_AA_CHANGE_RE = re.compile(r"(\d+)([A-Za-z*]+)>(\d+)([A-Za-z*]+)")


def process_csq(csq):
    """Parse one BCSQ consequence entry into ``(transcript, instruction)``.

    Parameters
    ----------
    csq : str
        A single '|'-separated consequence entry (one element of the
        comma-separated INFO/BCSQ value).

    Returns
    -------
    tuple
        ``(transcript, instruction)`` where ``transcript`` is the third field
        of the entry and ``instruction`` is
        ``(mutation_code, ref_pos, alt_pos, alt_seq, ref_seq_len)``.
        ``ref_pos`` and ``alt_pos`` are kept as the digit strings found in the
        entry; ``ref_seq_len`` is an int.
        ``(None, None)`` is returned when the consequence label is not in
        ``mutation_dict``, when the entry has fewer than six fields, or when
        the sixth field does not look like ``<pos><residues>><pos><residues>``.
    """
    fields = csq.split('|')

    # Unknown consequence labels (and single-field entries such as
    # back-references) are not turned into instructions.
    code = mutation_dict.get(fields[0])
    if code is None:
        return None, None

    # Guard the positional lookups below against truncated entries.
    if len(fields) < 6:
        return None, None

    match = _AA_CHANGE_RE.search(fields[5])
    if match is None:
        return None, None

    ref_pos, ref_seq, alt_pos, alt_seq = match.groups()
    return fields[2], (code, ref_pos, alt_pos, alt_seq, len(ref_seq))

def instructions_to_tasks(instructions_ids, id_to_instruction, reference_length=None):
    """Turn a haplotype's instruction IDs into an ordered list of copy tasks.

    A task is the tuple ``(stream_code, position, length, new_position)``:

    * ``stream_code`` -- source of the block: ``'R'`` for the reference,
      ``'A'`` for the alteration;
    * ``position``    -- start of the block in that input stream;
    * ``length``      -- length of the block;
    * ``new_position`` -- where the block starts in the new sequence.

    Parameters
    ----------
    instructions_ids : iterable of int
        IDs of the instructions carried by one haplotype.
    id_to_instruction : mapping
        ID -> instruction. An instruction is either the tuple
        ``(mutation_code, ref_pos, alt_pos, alt_seq, ref_seq_len)`` or the
        ``str()`` of such a tuple (the form produced by
        ``get_instruction_id``).
    reference_length : int, optional
        Length of the reference sequence, used to size the trailing copy.
        Defaults to ``len(transcript_reference)`` (notebook global).
        NOTE(review): confirm that ``len(transcript_reference)`` really is the
        length of *this* transcript's sequence.

    Returns
    -------
    list of tuple
        Tasks ordered by reference position. Zero-length reference copies
        are omitted.

    Notes
    -----
    NOTE(review): ``ref_pos`` is used as-is as the offset into the reference
    stream. If the positions are 1-based while the streams are 0-based, an
    offset of one is still needed -- the convention has to be fixed together
    with ``execute_tasks``.
    """
    if reference_length is None:
        reference_length = len(transcript_reference)

    # Decode every instruction first so they can be ordered by where they hit
    # the reference; instruction IDs only reflect first-seen order.
    decoded = []
    for instruction_id in instructions_ids:
        instruction = id_to_instruction[instruction_id]
        if isinstance(instruction, str):
            # literal_eval only accepts Python literals, unlike eval().
            instruction = ast.literal_eval(instruction)
        _code, ref_pos, alt_pos, alt_seq, ref_seq_len = instruction
        decoded.append((int(ref_pos), instruction_id, int(alt_pos), len(alt_seq), int(ref_seq_len)))
    decoded.sort(key=lambda item: (item[0], item[1]))

    tasks = []
    ref_cursor = 0  # next unread position in the reference stream
    new_cursor = 0  # next position to be written in the new sequence

    for ref_start, _instruction_id, alt_start, alt_len, ref_len in decoded:
        # Copy the untouched reference stretch that precedes this mutation.
        gap = ref_start - ref_cursor
        if gap > 0:
            tasks.append(('R', ref_cursor, gap, new_cursor))
            new_cursor += gap

        # The mutation itself comes from the alteration stream.
        tasks.append(('A', alt_start, alt_len, new_cursor))
        new_cursor += alt_len

        # The reference advances by the replaced length, the new sequence by
        # the inserted length; the two differ for insertions and deletions.
        ref_cursor = ref_start + ref_len

    # Copy whatever is left of the reference after the last mutation.
    remaining = reference_length - ref_cursor
    if remaining > 0:
        tasks.append(('R', ref_cursor, remaining, new_cursor))

    return tasks

def execute_tasks(reference, tasks):
    """Placeholder for applying ``tasks`` to ``reference``.

    TODO: not implemented yet. Both arguments are currently ignored and the
    ``Ellipsis`` singleton is returned in place of the new sequence.
    """
    return Ellipsis

# Notebook-wide registry of instructions: the next free ID, and the mapping
# from an instruction's string form to the ID it was given.
instruction_id_counter = 0
instruction_to_id = {}

def get_instruction_id(instruction):
    """Return the integer ID of ``instruction``, registering it on first use.

    The instruction is keyed by ``str(instruction)``. An instruction seen for
    the first time receives the current value of ``instruction_id_counter``,
    which is then incremented; later calls return the stored ID.
    """
    global instruction_id_counter

    key = str(instruction)
    known_id = instruction_to_id.get(key)
    if known_id is None:
        # First sighting: hand out the next free ID and advance the counter.
        known_id = instruction_id_counter
        instruction_to_id[key] = known_id
        instruction_id_counter = known_id + 1
    return known_id

# Process the instructions for each transcript and write one file per
# transcript to data/<transcript>.txt.
for transcript in gff_transcripts:
    # haplotype name ("<sample>_<0|1>") -> IDs of the instructions it carries
    instructions = defaultdict(set)

    # Named `chrom` so the builtin `chr` is not shadowed for the rest of the notebook.
    chrom, start, end = gff.get_transcript_range(transcript)
    for record in vcf(f'{chrom}:{start}-{end}'):
        bcsq = record.INFO.get('BCSQ')
        if bcsq is None:
            continue

        for csq in bcsq.split(','):
            csq_transcript, instruction = process_csq(csq)
            # Keep only parsable consequences that belong to this transcript.
            if csq_transcript != transcript or not instruction:
                continue

            instruction_id = get_instruction_id(instruction)
            for individual_call, individual in zip(record.genotypes, vcf.samples):
                for haplotype in (0, 1):
                    # NOTE(review): only allele index 1 is tested, and every
                    # consequence of the record is attributed to it -- confirm
                    # this is intended for multi-allelic records.
                    if individual_call[haplotype] == 1:
                        instructions[f'{individual}_{haplotype}'].add(instruction_id)

    # Flip the instruction_to_id dict so IDs can be resolved back to instructions.
    id_to_instruction = {
        known_id: instruction_key
        for instruction_key, known_id in instruction_to_id.items()
    }

    with open(f'data/{transcript}.txt', 'w') as file:
        # Haplotypes without any instruction; they share one record at the end.
        reference_individuals = []

        for individual in vcf.samples:
            for haplotype in (0, 1):
                sample_name = f'{individual}_{haplotype}'

                # .get() avoids inserting empty sets into the defaultdict.
                individual_instructions_ids = instructions.get(sample_name)

                if not individual_instructions_ids:
                    # Identical to the reference: list it in the shared
                    # reference header instead of writing its own record.
                    reference_individuals.append(sample_name)
                    continue

                tasks = instructions_to_tasks(individual_instructions_ids, id_to_instruction)
                new_sequence = execute_tasks(transcript_reference, tasks)

                file.write(f'>{sample_name}\n')
                file.write(f'{new_sequence}\n')

        # Reference record, headed by every haplotype that matches it.
        # NOTE(review): this writes str(transcript_reference) -- confirm it is
        # the sequence of this transcript only.
        file.write(f'>{",".join(reference_individuals)}\n')
        file.write(f'{transcript_reference}\n')
                
    # write reference:


KeyboardInterrupt: 