In [1]:
# running annotation protocol

In [1]:
import csv
import os
import random
import re
import shutil
import subprocess
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [3]:
# Directory layout: everything is resolved relative to the parent of the
# current working directory.
base_dir = Path.cwd().parent
data_dir = base_dir / 'data'

# Reference genome (FASTA) and annotation (GTF) for each supported taxid;
# the annotation loop indexes this mapping with int(taxid).
reference_data = {
    11676: dict(fasta=data_dir / "reference/K03455.1.fasta",
                gtf=data_dir / "reference/K03455.1.gtf"),
    11723: dict(fasta=data_dir / "reference/M33262.1.fasta",
                gtf=data_dir / "reference/M33262.1.gtf"),
}

# Inputs: accession list, per-accession download summary, and the directory
# holding the downloaded sequences (must already exist).
accession_list_fname = data_dir / 'complete_sequences.2022.accessions'
summary_file = data_dir / "download.summary.tsv"
sequence_dir = data_dir / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

# Outputs: one sub-directory per accession is created under outdir; the run
# log lives next to them.
outdir = data_dir / 'annotation'
outdir.mkdir(exist_ok=True)
logfile = outdir / 'log.txt'

In [4]:
# External tools. `vira` and `snapper` are given as bare command names;
# miniprot is referenced by an explicit path below the project's soft/ directory.
soft_dir = base_dir / 'soft'
miniprot_bin = soft_dir / 'miniprot/miniprot'
vira_bin, snapper_bin = 'vira', 'snapper'

In [5]:
# Put the soft/vira directory (sibling of the notebook's working directory)
# on sys.path so that the `vira` package imports below resolve.
# NOTE(review): this rebinds `soft_dir`, which the previous cell set to
# <base_dir>/soft; `miniprot_bin` was already derived from that earlier value,
# so it is unaffected as long as the cells run in order.
soft_dir = Path().resolve().parent / 'soft/vira'  # directory that contains the `vira` package
sys.path.append(str(soft_dir))

# Transcriptome is used by compute_genome_score; the remaining names are
# imported but not referenced in the cells visible here.
from vira.classes.txgroup import Transcriptome, Gene, Bundle
from vira.classes.transcript import Transcript, Object

In [6]:
def compute_genome_score(query_gtf_fname, target_gtf_fname, junction_stats_fname):
    """Score how well a target annotation reproduces a query annotation.

    Two components are computed and averaged into ``genome_score``:

    * **Junction score** - for each row of the junction-stats table whose
      ``query_position`` is a donor (or acceptor) site of the query
      annotation, ``map_consistency`` multiplied by the fraction of positions
      at which ``sequence`` and ``query_sequence`` agree is accumulated. The
      total is divided by the number of distinct query donor plus acceptor
      sites.
    * **Transcript score** - for each transcript of the target annotation, the
      mean of its ``vira_tx_aln_score``, ``identity_miniprot`` and
      ``identity_guide`` attributes (a missing attribute counts as 0) and a
      0/1 flag telling whether the query transcript with the same id agrees
      on having a CDS. The total is divided by ``len(qry_tome)``.

    Parameters
    ----------
    query_gtf_fname : path-like
        GTF of the query annotation.
    target_gtf_fname : path-like
        GTF of the annotation being scored.
    junction_stats_fname : path-like
        Tab-separated table with the columns ``type``, ``query_position``,
        ``map_consistency``, ``sequence`` and ``query_sequence``.

    Returns
    -------
    dict
        Keys ``genome_score``, ``junction_score``, ``transcript_score``,
        ``miniprot_score`` and ``guide_score``. A component whose denominator
        would be zero (no query splice sites / empty query annotation) is
        reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises
    ------
    AssertionError
        If any of the three input files does not exist.
    """
    assert os.path.exists(query_gtf_fname), f"query_gtf_fname does not exist: {query_gtf_fname}"
    assert os.path.exists(target_gtf_fname), f"target_gtf_fname does not exist: {target_gtf_fname}"
    assert os.path.exists(junction_stats_fname), f"junction_stats_fname does not exist: {junction_stats_fname}"

    qry_tome = Transcriptome()
    qry_tome.build_from_file(query_gtf_fname)

    trg_tome = Transcriptome()
    trg_tome.build_from_file(target_gtf_fname)

    # Distinct splice-site coordinates of the query annotation. The donor is
    # stored as intron start - 1 so it can be compared with `query_position`
    # (presumably the coordinate convention of the junction-stats table -
    # TODO confirm against its producer).
    qry_donors = set()
    qry_acceptors = set()
    for qry_tx in qry_tome:
        for intron in qry_tx.introns_it():
            qry_donors.add(intron[0] - 1)
            qry_acceptors.add(intron[1])

    junctions_df = pd.read_csv(junction_stats_fname, sep='\t')
    matching_donors = 0
    matching_acceptors = 0
    for _, row in junctions_df.iterrows():
        site_type = row["type"]
        if site_type == "donor":
            known_sites = qry_donors
        elif site_type == "acceptor":
            known_sites = qry_acceptors
        else:
            continue
        if row["query_position"] not in known_sites:
            continue

        # Fraction of flank positions that agree. An empty cell is read by
        # pandas as NaN (a float) and an empty string has length zero; both
        # carry no evidence, so they contribute a similarity of 0 rather than
        # raising TypeError / ZeroDivisionError.
        flank = row["sequence"]
        qry_flank = row["query_sequence"]
        if isinstance(flank, str) and isinstance(qry_flank, str) and flank:
            similarity_score = sum(a == b for a, b in zip(flank, qry_flank)) / len(flank)
        else:
            similarity_score = 0.0

        total_pos_score = row["map_consistency"] * similarity_score
        if site_type == "donor":
            matching_donors += total_pos_score
        else:
            matching_acceptors += total_pos_score

    # A query annotation without introns has no splice sites to recover.
    n_query_sites = len(qry_donors) + len(qry_acceptors)
    if n_query_sites:
        junction_score = (matching_donors + matching_acceptors) / n_query_sites
    else:
        junction_score = 0.0

    # Per-transcript evidence is read from the attributes of the target GTF.
    transcript_score = 0
    miniprot_score = 0
    guide_score = 0
    for tx in trg_tome:
        qry_tx = qry_tome.get_by_tid(tx.tid)

        vira_tx_aln_score = tx.get_attr("vira_tx_aln_score")
        vira_tx_aln_score = float(vira_tx_aln_score) if vira_tx_aln_score is not None else 0

        miniprot_identity = tx.get_attr("identity_miniprot")
        miniprot_identity = float(miniprot_identity) if miniprot_identity is not None else 0
        miniprot_score += miniprot_identity

        guide_identity = tx.get_attr("identity_guide")
        guide_identity = float(guide_identity) if guide_identity is not None else 0
        guide_score += guide_identity

        # 1 when query and target agree on whether the transcript is coding.
        has_cds = 1 if qry_tx.has_cds() == tx.has_cds() else 0

        transcript_score += (vira_tx_aln_score + miniprot_identity + guide_identity + has_cds) / 4

    # Normalise by the size of the query annotation, so query transcripts that
    # are absent from the target lower the averages.
    n_query_tx = len(qry_tome)
    if n_query_tx:
        transcript_score = transcript_score / n_query_tx
        miniprot_score = miniprot_score / n_query_tx
        guide_score = guide_score / n_query_tx
    else:
        transcript_score = 0.0
        miniprot_score = 0.0
        guide_score = 0.0

    genome_annotation_score = (junction_score + transcript_score) / 2

    return {
            "genome_score": genome_annotation_score,
            "junction_score": junction_score,
            "transcript_score": transcript_score,
            "miniprot_score": miniprot_score,
            "guide_score": guide_score
            }

In [None]:
# Annotate every usable, non-reference genome listed in the download summary
# with vira, score the result against the reference annotation, and append one
# row per accession to the CSV log.

# Create the log with its header on first use; later runs append to it.
if not os.path.exists(logfile):
    with open(logfile, "w", newline="") as logFP:
        writer = csv.writer(logFP, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["accid", "organism", "status", "genome_score", "junction_score", "transcript_score", "miniprot_score", "guide_score"])

# newline="" lets the csv module do its own newline handling, as its
# documentation recommends for file objects.
with open(summary_file, "r", newline="") as inFP, open(logfile, "a", newline="") as logFP:
    reader = csv.reader(inFP, delimiter="\t")
    log_writer = csv.writer(logFP)
    # skip header
    next(reader)
    for lno, (accid, is_ref_genome, organism, taxid, fasta, gff, is_valid, ref_start, ref_end) in enumerate(reader):
        # The int() conversions double as validation of the summary row.
        is_ref_genome = int(is_ref_genome)
        is_valid = bool(int(is_valid))
        gff = bool(int(gff))
        fasta = bool(int(fasta))
        ref_start = int(ref_start)
        ref_end = int(ref_end)

        print(accid)

        # The reference genomes themselves are not annotated or logged.
        if is_ref_genome:
            continue

        # A genome needs a sequence, a guide annotation and a valid flag.
        if not (fasta and gff and is_valid):
            log_writer.writerow([accid, organism, "skipped", "0", "0", "0", "0", "0"])
            logFP.flush()
            continue

        # Resolved only for genomes that are actually annotated, so a skipped
        # row whose taxid has no configured reference cannot abort the batch.
        reference = reference_data[int(taxid)]
        reference_fasta_fname = reference["fasta"]
        reference_gtf_fname = reference["gtf"]

        acc_outdir = outdir / accid
        acc_outdir.mkdir(exist_ok=True)
        output_gtf_fname = acc_outdir / f"{accid}.vira.gtf"
        stdout_fname = acc_outdir / f"{accid}.vira.stdout"
        stderr_fname = acc_outdir / f"{accid}.vira.stderr"
        tmp_dir = acc_outdir / f"{accid}.tmp"

        # Drop the result of a previous run before starting a new one.
        if output_gtf_fname.exists():
            output_gtf_fname.unlink()

        target_fasta_fname = sequence_dir / f"{accid}.fasta"
        # gff is always True here (rows without it were skipped above); the
        # conditional is kept so the guide stays optional if that rule changes.
        guide_gtf_fname = sequence_dir / f"{accid}.gtf" if gff else None

        cmd = [vira_bin,
               "--annotation", str(reference_gtf_fname),
               "--genome", str(reference_fasta_fname),
               "--target", str(target_fasta_fname),
               "--output", str(output_gtf_fname),
               "--miniprot", str(miniprot_bin),
               "--snapper", str(snapper_bin),
               "--tmp-dir", str(tmp_dir)]

        if guide_gtf_fname is not None:
            cmd.extend(["--guide", str(guide_gtf_fname)])

        try:
            with open(stdout_fname, "w") as stdoutFP, open(stderr_fname, "w") as stderrFP:
                subprocess.run(cmd, stdout=stdoutFP, stderr=stderrFP, check=True)
            # The junction statistics are read from the file sharing the
            # output GTF's stem: <accid>.vira.junction_stats.tsv
            junction_stats_fname = str(output_gtf_fname).rsplit(".", 1)[0] + ".junction_stats.tsv"
            scores = compute_genome_score(reference_gtf_fname, output_gtf_fname, junction_stats_fname)
            log_writer.writerow([accid, organism, "complete", scores["genome_score"], scores["junction_score"], scores["transcript_score"], scores["miniprot_score"], scores["guide_score"]])
        except subprocess.CalledProcessError:
            # vira exited non-zero; its full stderr stays in stderr_fname for
            # diagnosis, the log only records the failure.
            log_writer.writerow([accid, organism, "failed", "0", "0", "0", "0", "0"])
        finally:
            logFP.flush()
            # Runs on every exit path (including an exception raised while
            # scoring) and also removes nested directories.
            if tmp_dir.exists():
                shutil.rmtree(tmp_dir)


K03455.1
MH705157.1
JQ403028.1
DQ396400.1
DQ676872.1
MH746258.1
MH746253.1
KU168256.1
MH705158.1
MH705151.1
MH705153.1
MH705133.1
KU168305.1
MN153491.1
KP718918.1
KP718928.1
MT349418.1
FJ388893.1
FJ388903.1
FJ388909.1
FJ388925.1
FJ388938.1
FJ388942.1
FJ388943.1
JF683759.1
JF683760.1
JF683767.1
JF683779.1
JF683782.1
JF683783.1
FJ670519.1
FJ670523.1
KY496622.1
MF109677.1
MF109427.1
MF109613.1
MF109623.1
KT152846.1
KT152839.1
KT152840.1
KT152841.1
KT152844.1
AF457052.1
AF457053.1
AF457055.1
AF457066.1
AF457067.1
AF457068.1
AF457069.1
AF457070.1
AF457077.1
AF457079.1
AF457080.1
AF457081.1
AF457083.1
AF457084.1
AF457086.1
AF457089.1
EU110095.1
EU110088.1
EU110092.1
EU110094.1
KT022360.1
KT022361.1
KT022363.1
KT022364.1
KT022365.1
KT022367.1
KT022368.1
KT022369.1
KT022370.1
KT022372.1
KT022373.1
KT022374.1
KT022375.1
KT022376.1
KT022377.1
KT022378.1
KT022380.1
KT022381.1
KT022382.1
KT022383.1
FJ623487.1
FJ623481.1
FJ623475.1
FJ623476.1
FJ623480.1
FJ623485.1
FJ623483.1
FJ623488.1
FJ623478.1
F