In [8]:
import re
from collections import defaultdict

import aminos

## Set up I/O

In [9]:
# Input locations, gathered in one place so they are easy to repoint.
VCF_PATH = 'data/example.vcf.gz'
GFF_PATH = 'data/Homo_sapiens.GRCh38.110.chromosome.21.gff3.gz'
REFERENCE_FASTA_PATH = 'data/reference_sequences.fasta.gz'

# Load the variant calls, the gene annotation and the reference sequence(s)
# through the aminos readers.
vcf = aminos.io.vcf.read_vcf(VCF_PATH)
gff = aminos.io.gff.GFF(GFF_PATH)
transcript_reference = aminos.io.fasta.read_transcript_references(REFERENCE_FASTA_PATH)

# Transcript identifiers taken from the GFF; the processing loop iterates over these.
gff_transcripts = gff.get_unique_transcripts()

## Iterate through transcripts

In [15]:
# Process the instructions for each transcript.
#
# For every transcript listed in the GFF:
#   1. scan the VCF records overlapping the transcript's range and collect, per
#      sample haplotype, the instructions parsed from the BCSQ INFO field;
#   2. validate the collected instructions;
#   3. write one FASTA entry per altered haplotype, followed by a single entry
#      holding the reference for all haplotypes that had no instructions.
for transcript in gff_transcripts:

    instruction_store = aminos.processing.instructions.Instructions()

    # Named `chrom` (not `chr`) so the builtin chr() is not shadowed in the
    # notebook's global namespace for every later cell.
    chrom, start, end = gff.get_transcript_range(transcript)
    for record in vcf(f'{chrom}:{start}-{end}'):
        bcsq = record.INFO.get('BCSQ')
        if bcsq is None:
            # Record carries no consequence annotation; nothing to apply.
            continue

        # A BCSQ value is a comma-separated list of consequence entries.
        for csq in bcsq.split(','):
            csq_transcript, instruction = aminos.processing.csq.process_csq(csq)

            # Keep only consequences for the current transcript that produced
            # a usable instruction.
            if csq_transcript != transcript or not instruction:
                continue

            for individual_call, individual in zip(record.genotypes, vcf.samples):
                for haplotype in (0, 1):
                    # NOTE(review): only a value of 1 is treated as carrying the
                    # variant, so higher allele indices at multi-allelic sites are
                    # skipped. If a call holds fewer than two alleles followed by a
                    # phasing flag, index 1 would read that flag (True == 1) —
                    # confirm calls are always diploid here.
                    if individual_call[haplotype] == 1:
                        instruction_store.add_sample_transcript_instruction(f'{individual}_{haplotype}', instruction)

    instruction_store.validate_all_instructions()

    file = aminos.io.fasta.Writer('data', transcript)

    # Haplotypes without any instruction share one reference entry, written last.
    reference_individuals = []

    # try/finally guarantees the writer is closed even when task conversion,
    # execution or writing raises part-way through a transcript.
    try:
        for individual in vcf.samples:
            for haplotype in (0, 1):
                sample_name = f'{individual}_{haplotype}'

                individual_instructions = instruction_store.get_sample_transcript_instructions(sample_name, transcript)

                if not individual_instructions:
                    reference_individuals.append(sample_name)
                    continue

                tasks = aminos.processing.tasks.instructions_to_tasks(individual_instructions)
                # NOTE(review): `transcript_reference` is passed unchanged for every
                # transcript — confirm it should not be looked up per transcript.
                new_sequence = aminos.processing.tasks.execute_individual_tasks(transcript_reference, tasks)

                file.write_header(sample_name)
                file.write_sequence(new_sequence)

        # NOTE(review): the header receives a list here but a string above —
        # presumably the writer accepts both; verify against aminos.io.fasta.Writer.
        file.write_header(reference_individuals)
        file.write_sequence(transcript_reference)
    finally:
        file.close()

inframe_deletion|TPTE|ENST00000622113|protein_coding|+|201IR>201I|10569526TAAG>T ('D', 201, 201, 'I', 2)
missense|TPTE|ENST00000622113|protein_coding|+|204R>203W|10569534C>T ('M', 204, 203, 'W', 1)
stop_gained|TPTE|ENST00000622113|protein_coding|+|211R>210*|10569701C>T ('G', 211, 210, '*', 1)
*missense|TPTE|ENST00000622113|protein_coding|+|368K>367E|10592359A>G ('N', 368, 367, 'E', 1)
missense|TPTE|ENST00000622113|protein_coding|+|368K>368E|10592359A>G ('M', 368, 368, 'E', 1)
*missense|TPTE|ENST00000622113|protein_coding|+|452L>451P|10602110T>C ('N', 452, 451, 'P', 1)
missense|TPTE|ENST00000622113|protein_coding|+|452L>452P|10602110T>C ('M', 452, 452, 'P', 1)
*missense|TPTE|ENST00000622113|protein_coding|+|464Y>463S|10602146A>C ('N', 464, 463, 'S', 1)
missense|TPTE|ENST00000622113|protein_coding|+|464Y>464S|10602146A>C ('M', 464, 464, 'S', 1)
inframe_deletion|TPTE|ENST00000427445|protein_coding|+|181IR>181I|10569526TAAG>T ('D', 181, 181, 'I', 2)
missense|TPTE|ENST00000427445|protein_co