In [2]:
import os
import re
import sys
import csv
import time
import shutil
import random
import sqlite3
import requests
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO

In [9]:
base_dir = Path.cwd().parent.parent
data_dir = base_dir / 'HIV_Atlas_Creation/data'

reference_data = {
    11676: {"accid": "K03455.1",
            "organism": "HIV-1",
            "fasta": data_dir / "reference/K03455.1.fasta",
            "gtf": data_dir / "reference/K03455.1.gtf",
            "description": 'Human immunodeficiency virus type 1 (HXB2), complete genome; HIV1/HTLV-III/LAV reference genome'},
    11723: {"accid": "M33262.1",
            "organism": "SIV",
            "fasta": data_dir / "reference/M33262.1.fasta",
            "gtf": data_dir / "reference/M33262.1.gtf",
            "description": 'Simian (macaque) immunodeficiency virus, isolate 239, complete proviral genome.'}
}
reference_ids = set([x["accid"] for x in reference_data.values()])

sequence_dir = data_dir / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = data_dir / 'annotation'
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

outdir = data_dir / 'conservation'
outdir.mkdir(exist_ok=True)

accession_list_fname = data_dir / 'complete_sequences.2022.accessions'
summary_file =data_dir / "download.summary.tsv"

annotation_log_fname = data_dir / 'annotation/log.txt'
# load the annotation log file to extract relevant data into the database tsv
log_df = pd.read_csv(annotation_log_fname)

complete_log_df = log_df[log_df["status"] == "complete"].reset_index(drop=True)
complete_log_df.head()

Unnamed: 0,accid,organism,status,genome_score,junction_score,transcript_score,miniprot_score,guide_score
0,MH705157.1,HIV-1,complete,0.871767,0.944444,0.79909,0.6616,0.559152
1,DQ396400.1,HIV-1,complete,0.901976,0.958333,0.845619,0.661978,0.720496
2,KU168256.1,HIV-1,complete,0.878658,0.9375,0.819815,0.55941,0.71985
3,MH705158.1,HIV-1,complete,0.874285,0.958333,0.790237,0.407212,0.753737
4,MH705151.1,HIV-1,complete,0.82361,0.914352,0.732868,0.391839,0.564241


In [18]:
# load the junction information

# for every donor/acceptor site on the reference genome
# we should load counts of every nucleotide around it
# load list of genomes

def load_junctions(junction_stats_fname,sites):
    # loads data from the junction_stats_fname file and organizes it into the sites dictionary
    with open(junction_stats_fname, "r") as inFP:
        reader = csv.reader(inFP, delimiter="\t")
        # skip header
        next(reader)
        for lno, (position, query_position, site_type, map_consistency, sequence, query_sequence) in enumerate(reader):
            query_position = int(query_position)
            # sequence is offset+2bp site+offset
            assert len(sequence)%2==0, f"Error: sequence length is not even: {len(sequence)}"
            offset = int((len(sequence)-2)/2)
            start_pos = query_position - offset
            for pos in range(start_pos, start_pos+len(sequence)):
                sites[site_type].setdefault(pos, {"A":0,"C":0,"G":0,"T":0,"N":0})
                nt = sequence[pos-start_pos]
                if not nt in "ACGT":
                    nt = "N"
                sites[site_type][pos][nt] += 1
            

sites = {"acceptor":{},
         "donor":{}}
with open(summary_file,"r") as inFP:
    reader = csv.reader(inFP, delimiter="\t")
    # skip header
    next(reader)
    for lno, (accid, is_ref_genome, organism, taxid, fasta, gff, is_valid, ref_start, ref_end) in enumerate(reader):
        if accid in reference_ids:
            continue

        if (annotation_dir / accid / f"{accid}.vira.gtf").exists():
            # load line from the annotation log to get the score
            if len(log_df[log_df["accid"]==accid]) == 0:
                print(f"Warning: {accid} not found in log file, skipping")
                continue
            
            log_row = log_df[log_df["accid"]==accid]
            assert len(log_row) == 1, f"Error: multiple rows found for {accid}"

            status = log_row["status"].values[0]
            if not status == "complete":
                print(f"Warning: {accid} is not complete, skipping")
                continue
            
            if not int(taxid) == 11676:
                continue
            
            acc_outdir = annotation_dir / accid
            acc_outdir.mkdir(exist_ok=True)
            junction_stats_fname = acc_outdir / f"{accid}.vira.junction_stats.tsv"
            
            load_junctions(junction_stats_fname, sites)

In [22]:
# write this information out to files (donors and acceptors separately perhaps)
for site_type in sites:
    outfname = outdir / f"junctions_{site_type}.tsv"
    with open(outfname, "w") as outFP:
        outFP.write("seqid\tposition\tA\tC\tG\tT\tN\n")
        for pos in sorted(sites[site_type]):
            outFP.write(f"{reference_data[11676]['accid']}\t{pos}")
            for nt in "ACGTN":
                outFP.write(f"\t{sites[site_type][pos][nt]}")
            outFP.write("\n")