In [1]:
# Run the annotation protocol on the downloaded sequences

In [1]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path

In [2]:
# Directory layout: base_dir is the parent of the current working directory,
# and all inputs/outputs live under its 'data' sub-directory.
base_dir = Path.cwd().parent
data_dir = base_dir.joinpath('data')

# Reference genome (FASTA) and its annotation (GTF).
reference_fasta_fname = data_dir.joinpath('reference', 'K03455.1.fasta')
reference_gtf_fname = data_dir.joinpath('reference', 'K03455.1.gtf')

# Per-accession input sequences; this directory must already exist.
sequence_dir = data_dir.joinpath('sequences')
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

# Annotation results are written here (created if missing).
outdir = data_dir.joinpath('annotation')
outdir.mkdir(exist_ok=True)

# Accession list and the tab-separated download summary.
accession_list_fname = data_dir.joinpath('complete_sequences.2022.accessions')
summary_file = data_dir.joinpath("download.summary.tsv")

In [3]:
# Locations of the external tools, all relative to <base_dir>/soft.
soft_dir = base_dir.joinpath('soft')
vira_bin = soft_dir.joinpath('vira')                                 # used later as cwd / PYTHONPATH entry
sam2gtf_bin = soft_dir.joinpath("sam2gtf/target/release/sam2gtf")    # sam2gtf executable
miniprot_bin = soft_dir.joinpath('miniprot/miniprot')                # miniprot executable

In [7]:
# Run vira on every accession listed in the download summary.
#
# Each summary row is unpacked as (accession, fasta flag, gff flag). A row is skipped
# when it is the K03455 (HXB2) entry, when its output GTF already exists, or when its
# FASTA flag is false. stdout/stderr of each run are written next to the output GTF,
# and accessions whose run exits with a non-zero status are collected in
# `failed_accessions`.

# Cell texts that mean "not available".
_FALSE_FLAGS = {"", "0", "false", "no", "none", "nan"}


def _parse_flag(value):
    """Interpret a summary-table cell as a boolean.

    ``bool("False")`` and ``bool("0")`` are both True, so the text is compared
    explicitly; anything not recognised as a false value counts as True.
    NOTE(review): the flag encoding used in the summary file is not visible
    here -- confirm it against the code that writes download.summary.tsv.
    """
    return value.strip().lower() not in _FALSE_FLAGS


# Put vira on PYTHONPATH exactly once. Doing this inside the loop prepended the same
# entry on every iteration (and again on every re-run of the cell).
_pythonpath_entries = [p for p in os.environ.get('PYTHONPATH', '').split(os.pathsep) if p]
if str(vira_bin) not in _pythonpath_entries:
    os.environ['PYTHONPATH'] = os.pathsep.join([str(vira_bin), *_pythonpath_entries])

failed_accessions = []

# newline="" is what the csv module expects for files handed to csv.reader
with open(summary_file, "r", newline="") as inFP:
    reader = csv.reader(inFP, delimiter="\t")
    next(reader, None)  # skip header (tolerates an empty file)
    for row in reader:
        if not row:  # blank line
            continue
        accid, fasta, gff = row
        if accid == "K03455":  # skip HXB2
            continue

        acc_outdir = outdir / accid
        output_gtf_fname = acc_outdir / f"{accid}.vira.gtf"
        # already annotated in a previous run
        if output_gtf_fname.exists():
            continue
        print(accid)

        if not _parse_flag(fasta):
            print(f"Skipping {accid} as fasta file is not available")
            continue

        acc_outdir.mkdir(exist_ok=True)
        stdout_fname = acc_outdir / f"{accid}.vira.stdout"
        stderr_fname = acc_outdir / f"{accid}.vira.stderr"
        tmp_dir = acc_outdir / f"{accid}.tmp"

        target_fasta_fname = sequence_dir / f"{accid}.fasta"
        # the guide annotation is optional: only used when the gff flag is set
        guide_gtf_fname = sequence_dir / f"{accid}.gtf" if _parse_flag(gff) else None

        cmd = [sys.executable, '-m', 'vira',
               "--annotation", str(reference_gtf_fname),
               "--genome", str(reference_fasta_fname),
               "--target", str(target_fasta_fname),
               "--output", str(output_gtf_fname),
               "--sam2gtf", str(sam2gtf_bin),
               "--miniprot", str(miniprot_bin),
               "--tmp-dir", str(tmp_dir)]
        # Add --guide only when a guide exists. It used to be passed unconditionally
        # as well, which produced "--guide None" or a duplicated option.
        if guide_gtf_fname is not None:
            cmd.extend(["--guide", str(guide_gtf_fname)])

        with open(stdout_fname, "w") as stdoutFP, open(stderr_fname, "w") as stderrFP:
            result = subprocess.run(
                cmd,
                cwd=vira_bin,
                stdout=stdoutFP,  # Write stdout to file
                stderr=stderrFP   # Write stderr to file
            )

        # subprocess.run() without check=True never raises CalledProcessError, so the
        # exit status has to be inspected explicitly. stderr went to a file, so point
        # the reader there instead of printing it.
        if result.returncode != 0:
            failed_accessions.append(accid)
            print(f"Error occurred: vira exited with status {result.returncode} for {accid}")
            print(f"Error output: see {stderr_fname}")

if failed_accessions:
    print(f"{len(failed_accessions)} accession(s) failed: {', '.join(failed_accessions)}")

# move deletion up or down by 1 position, making sure there is always an M next to the N

KU168281
AY169812
KU168288
KU168296
