In [21]:
import json
from collections import defaultdict
from typing import TypedDict

from src.utils import get_project_root

# Location of the BLAST result file (JSON) inside the project tree.
blast_json_path = (
    get_project_root()
    / "inputs/msa"
    / "ncbiblast-R20250707-171143-0126-58639215-p1m.json"
)

# Pin the encoding: JSON text is conventionally UTF-8, and without an explicit
# value open() falls back to the platform's locale encoding.
with open(blast_json_path, "r", encoding="utf-8") as f:
    # Parse the whole document into plain Python objects.
    blast_data = json.load(f)

# Show only the first 500 characters of the repr to keep the cell output short.
str(blast_data)[:500] + "..."

"{'program': 'blastp', 'version': 'BLASTP 2.16.0+', 'command': 'singularity exec $APPBIN/singularity/ncbiblast:2.16.0 /ncbiblast/bin/blastp -db &quot;uniprotkb_refprotswissprot&quot; -query ncbiblast-R20250707-171143-0126-58639215-p1m.sequence  -num_threads 32 -outfmt 11 -out ncbiblast-R20250707-171143-0126-58639215-p1m.archive -matrix BLOSUM62 -max_target_seqs 250 -evalue 10 -seg no -comp_based_stats F', 'query_def': 'EMBOSS_001', 'query_stype': 'protein', 'query_len': 976, 'db_count': 1, 'db_nu..."

In [22]:
# Survey of the input schema: for every key seen on a hit (key_types) and on
# an HSP (hsps_key_types), collect the names of all value types encountered.
key_types = defaultdict(set)
hsps_key_types = defaultdict(set)

for hit_record in blast_data["hits"]:
    # Hit-level keys.
    for field, field_value in hit_record.items():
        key_types[field].add(type(field_value).__name__)

    # Keys of every HSP nested under this hit.
    for hsp_record in hit_record["hit_hsps"]:
        for field, field_value in hsp_record.items():
            hsps_key_types[field].add(type(field_value).__name__)

key_types, hsps_key_types

(defaultdict(set,
             {'hit_num': {'int'},
              'hit_def': {'str'},
              'hit_db': {'str'},
              'hit_id': {'str'},
              'hit_acc': {'str'},
              'hit_desc': {'str'},
              'hit_url': {'str'},
              'hit_xref_url': {'str'},
              'hit_dbfetch_url': {'str'},
              'hit_os': {'str'},
              'hit_uni_de': {'str'},
              'hit_uni_os': {'str'},
              'hit_uni_ox': {'str'},
              'hit_uni_pe': {'str'},
              'hit_uni_sv': {'str'},
              'hit_len': {'int'},
              'hit_hsps': {'list'},
              'hit_uni_gn': {'str'}}),
 defaultdict(set,
             {'hsp_num': {'int'},
              'hsp_score': {'int'},
              'hsp_bit_score': {'float'},
              'hsp_expect': {'float'},
              'hsp_align_len': {'int'},
              'hsp_identity': {'float'},
              'hsp_positive': {'float'},
              'hsp_gaps': {'int'},
           

In [23]:
from pydantic import BaseModel


class HitHsps(BaseModel):
    """Typed record for one element of a BLAST hit's ``hit_hsps`` list.

    Field names mirror the ``hsp_*`` keys of the input JSON. Any biological
    meaning mentioned in the comments below is inferred from the key names
    only -- TODO confirm against the documentation of the BLAST JSON output.
    """

    hsp_num: int
    # Declared as float although the surveyed input file only carried ints
    # for this key; int values are accepted and stored as float.
    hsp_score: float
    hsp_bit_score: float
    hsp_expect: float
    hsp_align_len: int
    hsp_identity: float
    hsp_positive: float
    hsp_gaps: int
    hsp_query_frame: str
    hsp_hit_frame: str
    hsp_strand: str
    # Start/end positions on the query and on the hit; presumably 1-based and
    # inclusive -- verify before using them for slicing.
    hsp_query_from: int
    hsp_query_to: int
    hsp_hit_from: int
    hsp_hit_to: int
    # Sequence strings of the alignment: hsp_qseq is used downstream as the
    # query sequence and hsp_hseq as the hit sequence; hsp_mseq is presumably
    # the match line between them -- verify.
    hsp_qseq: str
    hsp_mseq: str
    hsp_hseq: str


class BlastHit(BaseModel):
    """Typed record for one element of the BLAST JSON ``hits`` list.

    Field names mirror the ``hit_*`` keys of the input JSON so that a raw hit
    dict can be passed straight in as keyword arguments. Fields with a
    ``None`` default may be absent from the input.
    """

    hit_num: int
    hit_def: str
    hit_db: str
    hit_id: str
    hit_acc: str
    hit_desc: str
    hit_url: str
    hit_xref_url: str
    hit_dbfetch_url: str
    hit_os: str
    hit_uni_de: str
    hit_uni_os: str
    hit_uni_ox: str
    hit_uni_pe: str | None = None
    hit_uni_sv: str | None = None
    hit_len: int
    hit_hsps: list[HitHsps]
    # The input key is "hit_uni_gn". The field used to be spelled
    # "hit_uni_gen", which never matched the input: the value was silently
    # discarded as an unknown key and the field was always None.
    hit_uni_gn: str | None = None

    @property
    def hit_uni_gen(self) -> str | None:
        """Read-only alias kept for code that used the old, misspelled name."""
        return self.hit_uni_gn


# Validate every raw hit dict from the parsed JSON into a typed BlastHit.
blast_hits = [
    BlastHit(**raw_hit)
    for raw_hit in blast_data["hits"]
]

In [24]:
def blast_to_fasta(
    hits: list["BlastHit"],
) -> str:
    """Render BLAST hits as FASTA-formatted text.

    The first record is named ``query`` and carries ``hsp_qseq`` of the first
    HSP of the first hit that has any HSPs. It is followed by one record per
    hit, named by ``hit_def`` and carrying ``hsp_hseq`` of that hit's first
    HSP. Only the first HSP of each hit is used, and the sequence strings are
    emitted exactly as stored, without any re-alignment.

    Args:
        hits: Hits to convert, in output order.

    Returns:
        The FASTA text without a trailing newline, or an empty string when
        ``hits`` is empty or no hit has an HSP.
    """
    # A hit without HSPs has no sequence to emit; skipping it avoids the
    # IndexError that ``hit_hsps[0]`` would otherwise raise.
    usable_hits = [hit for hit in hits if hit.hit_hsps]
    if not usable_hits:
        return ""

    records = [f">query\n{usable_hits[0].hit_hsps[0].hsp_qseq}"]
    records.extend(
        f">{hit.hit_def}\n{hit.hit_hsps[0].hsp_hseq}" for hit in usable_hits
    )
    return "\n".join(records)


# FASTA text for all validated hits (query record first).
fasta_content = blast_to_fasta(hits=blast_hits)

In [26]:
# Destination of the generated sequences inside the project tree.
a3m_path = get_project_root() / "inputs/msa" / "ryr1_rabit.a3m"

# Mode "w" creates the file or overwrites an existing one.
with open(a3m_path, "w") as a3m_file:
    a3m_file.write(fasta_content)