In [None]:
# Build a database for the web app, populating it with all sequences

In [2]:
import csv
import os
import random
import re
import shlex
import shutil
import sqlite3
import subprocess
import sys
import time
from pathlib import Path

import requests

In [10]:
# --- Path configuration ------------------------------------------------------
# All locations are derived from the current working directory; base_dir is
# its parent.
base_dir = Path.cwd().parent
data_dir = base_dir / 'data'

# FASTA and GTF for K03455.1, which a later cell stages as the reference genome.
reference_fasta_fname = data_dir / 'reference' / 'K03455.1.fasta'
reference_gtf_fname = data_dir / 'reference' / 'K03455.1.gtf'

# Input sequences; later cells read <accid>.fasta from here. Must already exist.
sequence_dir = data_dir / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

# Annotation directory; later cells look for <accid>/<accid>.vira.gtf here.
# Created if missing (mkdir is called without parents=True, so data_dir must exist).
outdir = data_dir / 'annotation'
outdir.mkdir(exist_ok=True)

# Release location: the HIV_Atlas_Data directory two levels above the current
# working directory. It must already exist; its 'data' sub-directory is created
# on demand.
release_dir = Path.cwd().parent.parent / 'HIV_Atlas_Data'
assert release_dir.exists(), f"release_dir does not exist: {release_dir}"
release_data_dir = release_dir / 'data'
release_data_dir.mkdir(exist_ok=True)
# TSV index of released genomes, written by a later cell.
release_db_file = release_dir / 'hiv_atlas.db.tsv'

# NOTE(review): accession_list_fname is not referenced by the cells visible here.
accession_list_fname = data_dir / 'complete_sequences.2022.accessions'
# Tab-separated summary with a header row; later cells unpack each data row
# into three columns (accid, fasta, gff).
summary_file =data_dir / "download.summary.tsv"


In [4]:
def get_accid_from_fasta(fasta_fname):
    """Return the accession ID found on the first FASTA header line.

    The ID is the first whitespace-delimited token of the first line that
    starts with '>', with the leading '>' stripped off.

    Args:
        fasta_fname: path of the FASTA file to read.

    Returns:
        The accession ID string, or None when the file has no header line.
    """
    with open(fasta_fname, 'r') as handle:
        # Only the first header matters; stop reading as soon as it is found.
        first_header = next((row for row in handle if row.startswith('>')), None)

    if first_header is None:
        return None
    leading_token = first_header.split()[0]
    return leading_token[1:]

In [5]:
def build_genome_files(fasta_fname, gtf_fname, out_base_name):
    """Stage a genome's FASTA/GTF and prepare the commands that index them.

    The FASTA and GTF are copied to ``<out_base_name>.fasta`` and
    ``<out_base_name>.gtf``. The indexing commands (samtools faidx, gffread +
    sort, bgzip, tabix) are only *built* here and returned as text; this
    function does not execute them.

    Args:
        fasta_fname: path of the source FASTA file.
        gtf_fname: path of the source GTF file.
        out_base_name: output path prefix (str or Path) without extension;
            every produced file name is this prefix plus a suffix.

    Returns:
        dict of strings with keys:
            'fasta', 'gtf' - the copied files,
            'fai'          - FASTA index the commands will create,
            'gff', 'tbi'   - bgzip-compressed GFF and its tabix index the
                             commands will create,
            'cmds'         - newline-joined shell commands.

    Raises:
        OSError: if either input file cannot be copied.
    """
    new_fasta_fname = f"{out_base_name}.fasta"
    new_gtf_fname = f"{out_base_name}.gtf"
    # Built from out_base_name alone so the GFF always sits beside the copied
    # FASTA/GTF and the function does not depend on any notebook-level global.
    gff_fname = f"{out_base_name}.gff"
    gff_gz_fname = f"{gff_fname}.gz"

    # create copies of fasta and gtf files
    shutil.copy(fasta_fname, new_fasta_fname)
    shutil.copy(gtf_fname, new_gtf_fname)

    # Quote every path so spaces or shell metacharacters in directory names
    # cannot break (or alter) the generated commands; plain paths are unchanged.
    q_fasta = shlex.quote(new_fasta_fname)
    q_gtf = shlex.quote(new_gtf_fname)
    q_gff = shlex.quote(gff_fname)
    q_gff_gz = shlex.quote(gff_gz_fname)

    cmds = [
        # create fasta index
        f"samtools faidx {q_fasta}",
        # convert to GFF and sort by sequence name, then numerically by start
        f"gffread -F --keep-exon-attrs -o- {q_gtf} | sort -k1,1 -k4,4n > {q_gff}",
        # compress the gff file with bgzip - keep the original (-k)
        f"bgzip -k {q_gff}",
        # create tabix index
        f"tabix -p gff {q_gff_gz}",
    ]

    # return dictionary of file names
    return {
        'fasta': new_fasta_fname,
        'fai': f"{new_fasta_fname}.fai",
        'gtf': new_gtf_fname,
        'gff': gff_gz_fname,
        'tbi': f"{gff_gz_fname}.tbi",
        'cmds': "\n".join(cmds)
    }

In [11]:
# Stage K03455.1 as the reference genome in its own release sub-directory.
genome_db_outdir = release_data_dir / 'K03455.1'
genome_db_outdir.mkdir(exist_ok=True)

genome_data = build_genome_files(
    reference_fasta_fname,
    reference_gtf_fname,
    genome_db_outdir / 'K03455.1',
)

# Start commands.sh afresh (mode 'w' truncates any earlier content) with the
# reference genome's indexing commands, terminated by a newline.
with open(release_data_dir / 'commands.sh', 'w') as outFP:
    print(genome_data['cmds'], file=outFP)

In [None]:
# Walk the download summary TSV and, for every accession that has an
# annotation (<accid>.vira.gtf), stage its files in the release directory and
# append its indexing commands to commands.sh.
with open(release_data_dir / 'commands.sh', 'a') as outFP:
    # newline="" is the csv-module recommended way to open files for csv.reader
    with open(summary_file, "r", newline="") as inFP:
        reader = csv.reader(inFP, delimiter="\t")
        # skip header (a default avoids StopIteration on an empty file)
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines
                continue
            # still raises ValueError if a row does not have exactly 3 columns
            accid, fasta, gff = row
            if accid == "K03455": # skip HXB2
                continue

            genome_gtf_fname = outdir / accid / f"{accid}.vira.gtf"
            if not genome_gtf_fname.exists():
                # no annotation available for this accession
                continue

            genome_fasta_fname = sequence_dir / f"{accid}.fasta"
            fasta_accid = get_accid_from_fasta(genome_fasta_fname)
            assert fasta_accid is not None, f"Could not get accession id from fasta file: {genome_fasta_fname}"

            # the release sub-directory is named after the accession in the FASTA header
            genome_db_outdir = release_data_dir / fasta_accid
            genome_db_outdir.mkdir(exist_ok=True)

            genome_data = build_genome_files(genome_fasta_fname, genome_gtf_fname, genome_db_outdir / fasta_accid)
            # write the commands through the handle that is already open for appending
            outFP.write(genome_data['cmds']+"\n")
            


In [None]:
# Build the genome index TSV (hiv_atlas.db.tsv) listing every released genome

def get_description(fasta_fname):
    """Get the description from the first header line of a FASTA file.

    The description is everything after the first whitespace-delimited token
    (the '>accession' part) of the first line starting with '>'.

    Args:
        fasta_fname: path of the FASTA file to read.

    Returns:
        The description string, or "" when the header carries no description
        or the file has no header line at all.
    """
    with open(fasta_fname) as inFP:
        for line in inFP:
            if line.startswith('>'):
                # Split on any whitespace, matching how get_accid_from_fasta
                # isolates the accession token.
                fields = line.strip().split(None, 1)
                # A header with only '>accession' has no description; do not
                # hand back the accession token itself.
                return fields[1] if len(fields) > 1 else ""

    return ""


# Write the genome index TSV, one row per genome: accid, is_reference, description.
# NOTE(review): each record below also carries a 'dir' entry, but it is not
# written to the file - confirm whether consumers expect a fourth column.
genome_db_outdir = release_data_dir / 'K03455.1'
genome_db_outdir.mkdir(exist_ok=True)

reference_genome = {
    'accession_id': 'K03455.1',
    'description': 'Human immunodeficiency virus type 1 (HXB2), complete genome; HIV1/HTLV-III/LAV reference genome',
    'dir': str(genome_db_outdir),
    'is_reference': True
}

# The context manager guarantees the TSV is flushed and closed even if an
# assertion or an I/O error interrupts the loop.
with open(release_db_file, "w") as db_tsvFP:
    db_tsvFP.write(f"{reference_genome['accession_id']}\t1\t{reference_genome['description']}\n")

    # read summary tsv line by line using csv module
    # (newline="" is the csv-module recommended way to open files for csv.reader)
    with open(summary_file, "r", newline="") as inFP:
        reader = csv.reader(inFP, delimiter="\t")
        # skip header (a default avoids StopIteration on an empty file)
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines
                continue
            # still raises ValueError if a row does not have exactly 3 columns
            accid, fasta, gff = row
            if accid == "K03455": # skip HXB2
                continue

            # only genomes with an annotation are listed
            if not (outdir / accid / f"{accid}.vira.gtf").exists():
                continue

            genome_fasta_fname = sequence_dir / f"{accid}.fasta"
            fasta_accid = get_accid_from_fasta(genome_fasta_fname)
            assert fasta_accid is not None, f"Could not get accession id from fasta file: {genome_fasta_fname}"

            # release sub-directory named after the accession in the FASTA header
            genome_db_outdir = release_data_dir / fasta_accid
            genome = {
                'accession_id': fasta_accid,
                'description': get_description(genome_fasta_fname),
                'dir': str(genome_db_outdir),
                'is_reference': False
            }

            db_tsvFP.write(f"{genome['accession_id']}\t0\t{genome['description']}\n")

Existing database deleted.
Database schema created.
Inserted: K03455.1


In [None]:
# Remove the uncompressed .gff and .gtf files from every genome directory under
# the release data directory; non-directory entries are ignored.
# NOTE(review): the commands written to commands.sh read these .gtf/.gff files,
# so they presumably have to be run before this cell - the cells shown here do
# not execute them.
for genome_dir in (release_data_dir / entry for entry in os.listdir(release_data_dir)):
    if genome_dir.is_dir():
        for leftover in os.listdir(genome_dir):
            if leftover.endswith((".gff", ".gtf")):
                os.remove(genome_dir / leftover)