In [1]:
# getting data

In [1]:
import os
import re
import sys
import csv
import time
import pysam
import shutil
import random
import requests
import subprocess
from pathlib import Path

from Bio import SeqIO, Align, Entrez
from Bio.Align import substitution_matrices

import time

In [2]:
soft_dir = Path().resolve().parent / 'soft/vira/vira'
sys.path.append(str(soft_dir))

from utils.common import *

In [3]:
base_dir = Path.cwd().parent.parent / 'HIV_Atlas_Creation'
data_dir = base_dir / 'data'
outdir = data_dir / 'sequences'
outdir.mkdir(exist_ok=True)

hiv_reference_fasta_fname = data_dir / "reference/K03455.1.fasta"
hiv_reference_gtf_fname = data_dir / "reference/K03455.1.gtf"

siv_reference_fasta_fname = data_dir / "reference/M33262.1.fasta"
siv_reference_gtf_fname = data_dir / "reference/M33262.1.gtf"

references = {
    "HIV-1": {
        "accession_id": "K03455.1",
        "taxonomy_id": 11676,
        "fasta": hiv_reference_fasta_fname,
        "gtf": hiv_reference_gtf_fname
    },
    "SIV": {
        "accession_id": "M33262.1",
        "taxonomy_id": 11723,
        "fasta": siv_reference_fasta_fname,
        "gtf": siv_reference_gtf_fname
    }
}

Entrez.email = "ales.varabyou@jhu.edu"

accession_list_fname = data_dir / 'complete_sequences.2022.accessions'
summary_file = data_dir / "download.summary.tsv"

In [4]:
accessions = []
with open(accession_list_fname) as inFP:
    accessions = inFP.read().splitlines()
print(accessions[:5])

['K03455', 'MH705157', 'JQ403028', 'DQ396400', 'DQ676872']


In [5]:
# if any of the reference files are not in the accession list, add them
for ref in references.values():
    if ref["accession_id"] not in accessions:
        accessions.append(ref["accession_id"])

In [6]:
def get_data_from_all_accessions(accessions):
    """
    Fetch data for all accessions in a single query.
    """
    try:
        print(f"Fetching data for {len(accessions)} accessions...")
        # Join all accession IDs into a single request
        handle = Entrez.efetch(db="nucleotide", id=",".join(accessions), rettype="gb", retmode="xml")
        records = Entrez.read(handle)
        handle.close()

        org_data_list = []

        for rec in records:
            try:
                org_data = {
                    "ACCESSION": rec["GBSeq_accession-version"],
                    "ORGANISM": rec["GBSeq_organism"],
                    "TAXID": None,
                }

                for feature in rec["GBSeq_feature-table"]:
                    if feature["GBFeature_key"] == "source":
                        for qualifier in feature["GBFeature_quals"]:
                            if qualifier["GBQualifier_name"] == "db_xref" and qualifier["GBQualifier_value"].startswith("taxon:"):
                                org_data["TAXID"] = int(qualifier["GBQualifier_value"].split(":")[1])
                                break

                if org_data["ORGANISM"] == "Human immunodeficiency virus 1":
                    org_data["ORGANISM"] = "HIV-1"
                elif org_data["ORGANISM"] == "Simian immunodeficiency virus":
                    org_data["ORGANISM"] = "SIV"
                else:
                    print(f"Organism {org_data['ORGANISM']} is not supported.")
                    continue

                if org_data["TAXID"] != references[org_data["ORGANISM"]]["taxonomy_id"]:
                    print(f"Taxonomy ID mismatch for {org_data['ORGANISM']}. Skipping.")
                    continue

                org_data_list.append(org_data)
            except Exception as rec_error:
                print(f"Error processing record: {rec_error}")

        return org_data_list

    except Exception as e:
        print(f"Error fetching data: {e}")
        return []


accessions = get_data_from_all_accessions(accessions)

Fetching data for 5382 accessions...
Taxonomy ID mismatch for HIV-1. Skipping.
Organism HIV-1 CRF03_AB is not supported.
Organism HIV-1 CRF04_cpx is not supported.
Organism HIV-1 M_02CD.KS069 is not supported.
Organism HIV-1 M_02CD.LBTB084 is not supported.
Organism HIV-1 M_02CD.MBTB047 is not supported.
Organism HIV-1 M_97CD.KTB119 is not supported.
Organism Human immunodeficiency virus type 1 04CD.FR.KZS is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism Human immunodeficiency virus is not supported.
Organism HIV-1 M_02CD.LBTB032 is not supported.
Organism HIV-1 M_97CD.KFE267 is not supported.
Organism HIV-1 M_97CD.MBFE250

In [7]:
fasta_url_template = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=fasta&id={}"
gff3_url_template = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id={}"

def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            return True
        else:
            return False
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        return False
    except Exception as err:
        print(f"An error occurred: {err}")
        return False
    
def load_min_max_features(gtf_fname):
    # load minimum and maximum for each coordinate type
    minmax = dict()
    with open(gtf_fname,"r") as inFP:
        for line in inFP:
            if line[0] == "#":
                continue
            lcs = line.strip().split("\t")
            minmax.setdefault(lcs[2],[int(lcs[3]),int(lcs[4])])
            minmax[lcs[2]][0] = min(minmax[lcs[2]][0],int(lcs[3]))
            minmax[lcs[2]][1] = max(minmax[lcs[2]][1],int(lcs[4]))

    # add min donor and max acceptor
    introns = get_intervals(gtf_fname,"exon",invert=True)
    for k, data in introns.items():
        for intron, tids in data.items():
            minmax.setdefault("donor",[intron[0],intron[0]])
            minmax.setdefault("acceptor",[intron[1],intron[1]])
            minmax["donor"][0] = min(minmax["donor"][0],intron[0])
            minmax["donor"][1] = max(minmax["donor"][1],intron[0])
            minmax["acceptor"][0] = min(minmax["acceptor"][0],intron[1])
            minmax["acceptor"][1] = max(minmax["acceptor"][1],intron[1])

    return minmax

    
def is_valid_genome(qry_fasta_fname, ref_fasta_fname, ref_gtf_fname, outdir) -> bool:
    # maps query to reference.
    # finds 3' and 5' clipping
    # checks if clipping includes any annotated regions
    # return False if genome is incomplete

    # map to alignments and add missing 5' and 3' information

    # load min/max intron and cds
    minmax = load_min_max_features(ref_gtf_fname)

    # align two genomes
    cmd = ["minimap2","-ax","map-ont",ref_fasta_fname,qry_fasta_fname]
    q2t_sam_fname = outdir / "q2t.sam"
    with open(q2t_sam_fname,"w+") as outFP:
        subprocess.call(cmd,stdout=outFP,stderr=subprocess.DEVNULL)

    q2t_bam_fname = outdir / "q2t.bam"
    cmd = ["samtools","view","-S","-b","-o",q2t_bam_fname,q2t_sam_fname]
    subprocess.call(cmd,stdout=subprocess.DEVNULL,stderr=subprocess.DEVNULL)

    cmd = ["samtools","index",q2t_bam_fname]
    subprocess.call(cmd,stdout=subprocess.DEVNULL,stderr=subprocess.DEVNULL)

    # find 3' and 5' clippings
    bamfile = pysam.AlignmentFile(q2t_bam_fname, "rb")

    ref_start = None
    ref_end = None
    for read in bamfile:
        if not read.is_unmapped and not read.is_secondary and not (read.flag & 2048):
            ref_start = read.reference_start
            ref_end = read.reference_end

            break # should be just one alignment - exit after first. if mor ethan one - than likely fragmented and should not pass verification either
    
    if ref_start is None or ref_end is None or \
       ref_start > minmax["donor"][0] or \
       ref_end < minmax["acceptor"][1] or \
       ref_start > minmax["CDS"][0] or \
       ref_end < minmax["CDS"][1]:
        return False, ref_start, ref_end
    
    return True, ref_start, ref_end

In [9]:
summary_data = []
for acc in accessions:
    accid = acc["ACCESSION"]
    organism = acc["ORGANISM"]
    taxid = acc["TAXID"]
    
    is_ref_genome = accid in [ref["accession_id"] for ref in references.values()]
    
    fasta_url = fasta_url_template.format(accid)
    gff3_url = gff3_url_template.format(accid)

    fasta_file = outdir / f"{accid}.fasta"
    gff3_file = outdir / f"{accid}.gff3"

    fasta_downloaded = download_file(fasta_url, fasta_file)
    gff3_downloaded = False

    # load full accession id with version from the fasta file
    if not fasta_downloaded:
        continue

    tmpdir = outdir / f"{accid}.tmp"
    tmpdir.mkdir(exist_ok=True)
    is_valid, ref_start, ref_end = is_valid_genome(fasta_file,references[organism]["fasta"],references[organism]["gtf"],tmpdir)

    shutil.rmtree(tmpdir) # remove tmpdir
    if not is_valid:
        # remove fasta file
        os.remove(fasta_file)
        fasta_downloaded = False
        gff3_downloaded = False
    else:
        gff3_downloaded = download_file(gff3_url, gff3_file)

    is_ref_genome = 1 if is_ref_genome else 0
    is_valid = 1 if is_valid else 0
    fasta_downloaded = 1 if fasta_downloaded else 0
    gff3_downloaded = 1 if gff3_downloaded else 0
    ref_start = ref_start if ref_start else -1
    ref_end = ref_end if ref_end else -1
    summary_data.append([accid, is_ref_genome, organism, taxid, fasta_downloaded, gff3_downloaded, is_valid, ref_start, ref_end])

with open(summary_file, 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(["accession_id", "is_ref_genome", "organism", "taxid", "fasta", "gff", "is_valid", "ref_start", "ref_end"])
    writer.writerows(summary_data)

In [10]:
def cleanup_hiv_gtf(input_gtf, output_gtf):
    # get cds chains for each transcript
    seqid = ""
    transcripts = {} # collect all unique CDS chains for each transcript
    tid2gid = {} # map transcript id to gene id
    with open(input_gtf, 'r') as inFP:
        for line in inFP:
            if line.startswith("#"):
                continue
        
            lcs = line.split("\t")

            attrs = extract_attributes(lcs[8])
            
            if lcs[2] == "transcript":
                gid = attrs["gene_id"]
                tid = attrs["transcript_id"]
                tid2gid[tid] = gid
                
            if lcs[2] != "CDS":
                continue

            seqid = lcs[0]
            tid = attrs["transcript_id"]
            if tid not in transcripts:
                transcripts[tid] = set()
            transcripts[tid].add(tuple([lcs[3],lcs[4],lcs[7]])) # start, end, phase

    # collect all unique CDS chains for each gene
    genes = {} # collect all unique CDS chains for each gene
    for tid, cds_chain in transcripts.items():
        gid = tid2gid[tid]
        if gid not in genes:
            genes[gid] = cds_chain
        assert genes[gid] == cds_chain, f"Gene {gid} has different CDS chains for different transcripts"

    # write out standardized GTF file
    with open(output_gtf+".tmp", 'w') as outFP:
        for gid, cds_chain in genes.items():
            for i, (start, end, phase) in enumerate(cds_chain):
                lcs = [seqid, "cleanup", "CDS", start, end, ".", "+", phase, "."]
                attrs = {"gene_id":gid,"transcript_id":gid}
                lcs[8] = to_attribute_string(attrs)
                outFP.write("\t".join(lcs)+"\n")

    # run gffread to generate the final gtf file
    cmd = ["gffread", "-E", "-T", "--force-exons", "-o", output_gtf, output_gtf+".tmp"]
    subprocess.run(cmd)

    # cleanup
    shutil.rmtree(output_gtf+".tmp")

In [None]:
# iterate over the gff files and standardize them:
# - run gffread
# - parse atributes and assign appropriate gene_id and transcript_id

with open(summary_file,"r") as inFP:
    reader = csv.reader(inFP, delimiter="\t")
    # skip header
    next(reader)
    for (accid, is_ref_genome, organism, taxid, fasta, gff, is_valid, ref_start, ref_end) in reader: 
        is_valid = int(is_valid)
        if not is_valid:
            continue

        is_ref_genome = int(is_ref_genome)
        taxid = int(taxid)
        fasta = int(fasta)
        gff = int(gff)
        ref_start = int(ref_start)
        ref_end = int(ref_end)
        
        if is_ref_genome:
            continue
        
        if not fasta or not gff or not is_valid:
            print(f"Skipping {accid}")
            continue
        
        gff3_file = os.path.join(outdir, f"{accid}.gff3")
        gffread_fname = os.path.join(outdir, f"{accid}.gffread.gtf")
        cmd = ["gffread", "-E", "-T", "-o", gffread_fname, gff3_file]
        subprocess.call(cmd)
        
        gtf_fname = os.path.join(outdir, f"{accid}.gtf")
        
        cleanup_hiv_gtf(gffread_fname, gtf_fname)

        # cleanup
        os.remove(gffread_fname)

Command line was:
gffread -E -T -o /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gffread.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gff3
   .. loaded 10 genomic features from /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gff3
Command line was:
gffread -E -T --force-exons -o /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gtf.tmp
   .. loaded 8 genomic features from /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/MH705157.1.gtf.tmp
Command line was:
gffread -E -T -o /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/DQ396400.1.gffread.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/DQ396400.1.gff3
   .. loaded 8 genomic features from /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Creation/data/sequences/DQ396400.1.gff3
Command line