Prepare genome files (FASTA, GTF, etc.) and build indices for use in other tests.

In [None]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

In [4]:
# Project root: two directory levels above the notebook's working directory.
prj_base_dir = Path.cwd().parent.parent

# Helper scripts shipped with the experiments repository.
soft_dir = prj_base_dir.joinpath('HIV_Atlas_Experiments', 'soft')

# Inputs produced by the atlas-creation project; fail early if they are absent.
sequence_dir = prj_base_dir.joinpath('HIV_Atlas_Creation', 'data', 'sequences')
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = prj_base_dir.joinpath('HIV_Atlas_Creation', 'data', 'annotation')
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

# Root for everything this notebook writes.
data_dir = prj_base_dir.joinpath('HIV_Atlas_Experiments', 'data')

# Host (human hg38) genome and CHESS annotation, kept under the user's home directory.
hg38_dir = Path('~/genomes/human/hg38').expanduser()
hg38_fasta_fname = (hg38_dir / 'hg38_p12_ucsc.fa').resolve()
chess_gtf_fname = (hg38_dir / 'annotations' / 'chess3.1.3.GRCh38.gtf').resolve()

In [5]:
# Enable IPython's autoreload extension. Per the IPython documentation, mode 1
# reloads only the modules that were registered with %aimport.
%load_ext autoreload
%autoreload 1

# Put <soft_dir>/genomic_scripts first on the import path
# (presumably where the `definitions` module lives - confirm).
sys.path.insert(0, str(soft_dir / "genomic_scripts"))
# Import `definitions` and mark it for autoreload.
%aimport definitions

In [8]:
def _append_gtf_records(gtf_fname: Path, out_fp) -> None:
    """Copy every non-comment line of ``gtf_fname`` to the open handle ``out_fp``.

    Lines starting with ``#`` are skipped. A final line that lacks a trailing
    newline gets one appended, so that records written afterwards from another
    file are not glued onto it.
    """
    with open(gtf_fname, 'r') as in_fp:
        for line in in_fp:
            if line.startswith('#'):
                continue
            if not line.endswith('\n'):
                line += '\n'
            out_fp.write(line)


def generate_genome_indices(
    fn_out_dir: Union[str, Path],
    host_fasta_fname: Union[str, Path, None] = None,
    host_gtf_fname: Union[str, Path, None] = None,
    pathogen_fasta_fname: Union[str, Path, None] = None,
    pathogen_gtf_fname: Union[str, Path, None] = None,
    threads: int = 64,
):
    """Build a joint host + pathogen reference and its derived files in ``fn_out_dir``.

    Any of the four inputs may be omitted (``None``); only the inputs that are
    given are concatenated. The following files are written into ``fn_out_dir``:

    * ``reference.fasta``             - host records followed by pathogen records
    * ``reference.gtf``               - host then pathogen annotation, ``#`` lines dropped
    * ``reference.junctions``         - stdout of ``hisat2_extract_splice_sites.py``
    * ``reference.exons``             - stdout of ``hisat2_extract_exons.py``
    * ``reference.t2g.tsv``           - table from ``definitions.get_attribute(..., "gene_id")``
                                        with column ``tid`` renamed to ``tx_id``
    * ``reference.transcripts.fasta`` - transcript sequences extracted by ``gffread -w``

    ``samtools faidx`` is also run on ``reference.fasta``. The ``hisat2-build``
    command is only printed, not executed.

    Parameters
    ----------
    fn_out_dir : str or Path
        Output directory; created (with parents) if it does not exist.
    host_fasta_fname, pathogen_fasta_fname : str or Path, optional
        FASTA files to concatenate. A leading ``~`` is expanded.
    host_gtf_fname, pathogen_gtf_fname : str or Path, optional
        GTF files to concatenate. A leading ``~`` is expanded.
    threads : int, default 64
        Value passed to ``-p`` in the printed ``hisat2-build`` command.

    Raises
    ------
    AssertionError
        If a supplied input file does not exist.
    subprocess.CalledProcessError
        If samtools, a hisat2 extraction script or gffread exits non-zero.
    """
    # Normalise inputs to expanded Path objects up front, so that plain strings
    # and "~"-prefixed paths work for both the existence checks and later reads.
    if host_fasta_fname is not None:
        host_fasta_fname = Path(host_fasta_fname).expanduser()
        assert os.path.exists(host_fasta_fname), f"host_fasta_fname does not exist: {host_fasta_fname}"
    if host_gtf_fname is not None:
        host_gtf_fname = Path(host_gtf_fname).expanduser()
        assert os.path.exists(host_gtf_fname), f"host_gtf_fname does not exist: {host_gtf_fname}"
    if pathogen_fasta_fname is not None:
        pathogen_fasta_fname = Path(pathogen_fasta_fname).expanduser()
        assert os.path.exists(pathogen_fasta_fname), f"pathogen_fasta_fname does not exist: {pathogen_fasta_fname}"
    if pathogen_gtf_fname is not None:
        pathogen_gtf_fname = Path(pathogen_gtf_fname).expanduser()
        assert os.path.exists(pathogen_gtf_fname), f"pathogen_gtf_fname does not exist: {pathogen_gtf_fname}"

    # create the out_dir if it does not exist
    fn_out_dir = Path(fn_out_dir)
    fn_out_dir.mkdir(parents=True, exist_ok=True)

    fn_reference_fasta_fname = fn_out_dir / 'reference.fasta'
    fn_reference_gtf_fname = fn_out_dir / 'reference.gtf'

    # create joint reference fasta (host first, then pathogen)
    with open(fn_reference_fasta_fname, 'w') as out_fp:
        for fasta_fname in (host_fasta_fname, pathogen_fasta_fname):
            if fasta_fname is not None:
                SeqIO.write(SeqIO.parse(fasta_fname, 'fasta'), out_fp, 'fasta')

    # create joint reference gtf (host first, then pathogen)
    with open(fn_reference_gtf_fname, 'w') as out_fp:
        for gtf_fname in (host_gtf_fname, pathogen_gtf_fname):
            if gtf_fname is not None:
                _append_gtf_records(gtf_fname, out_fp)

    # index the new reference fasta; check=True so a failed index is not silently ignored
    subprocess.run(['samtools', 'faidx', str(fn_reference_fasta_fname)], check=True)

    # build hisat2 index
    # extract junctions and exons files for the index construction
    fn_junctions_fname = fn_out_dir / 'reference.junctions'
    fn_exons_fname = fn_out_dir / 'reference.exons'

    # The output handles are opened in a `with` block so they are flushed and closed.
    with open(fn_junctions_fname, 'w') as junctions_fp:
        subprocess.run(["hisat2_extract_splice_sites.py", str(fn_reference_gtf_fname)], stdout=junctions_fp, check=True)
    print(f"Splice sites extracted to {fn_junctions_fname}")

    with open(fn_exons_fname, 'w') as exons_fp:
        subprocess.run(["hisat2_extract_exons.py", str(fn_reference_gtf_fname)], stdout=exons_fp, check=True)
    print(f"Exons extracted to {fn_exons_fname}")

    fn_reference_ht2_index_prefix = fn_out_dir / 'reference'
    cmd = ["hisat2-build", "-p", str(threads), "--ss", str(fn_junctions_fname), "--exon", str(fn_exons_fname), str(fn_reference_fasta_fname), str(fn_reference_ht2_index_prefix)]
    # NOTE(review): the build is not launched here, only printed - presumably so it
    # can be run separately; confirm before enabling subprocess.call(cmd).
    print(' '.join(cmd))

    # generate transcript to gene id mapping
    t2g = definitions.get_attribute(fn_reference_gtf_fname, "gene_id")
    t2g = t2g.rename(columns={'tid': "tx_id"})
    t2g.to_csv(fn_out_dir / 'reference.t2g.tsv', sep='\t', index=False)

    # build salmon index
    # 1. extract transcript sequences
    fn_transcript_fasta_fname = fn_out_dir / 'reference.transcripts.fasta'
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed through verbatim.
    cmd = ["gffread", "-w", str(fn_transcript_fasta_fname), "-g", str(fn_reference_fasta_fname), str(fn_reference_gtf_fname)]
    print(' '.join(cmd))
    subprocess.run(cmd, check=True)

In [None]:
# Joint hg38 + KT284371.1 reference.
# Inputs come from the atlas-creation sequence/annotation directories defined in the config cell.
KT284371_fasta_fname = sequence_dir / 'KT284371.fasta'
KT284371_gtf_fname = annotation_dir / 'KT284371' / 'KT284371.vira.gtf'

generate_genome_indices(
    data_dir / 'KT284371.1',
    host_fasta_fname=hg38_fasta_fname,
    host_gtf_fname=chess_gtf_fname,
    pathogen_fasta_fname=KT284371_fasta_fname,
    pathogen_gtf_fname=KT284371_gtf_fname,
)

In [None]:
# Joint hg38 + K03455.1 reference.
# Both pathogen files sit side by side in the atlas-creation reference directory.
k03455_fasta_fname = prj_base_dir / 'HIV_Atlas_Creation' / 'data' / 'reference' / 'K03455.1.fasta'
k03455_gtf_fname = k03455_fasta_fname.with_suffix('.gtf')

generate_genome_indices(
    data_dir / 'K03455.1',
    host_fasta_fname=hg38_fasta_fname,
    host_gtf_fname=chess_gtf_fname,
    pathogen_fasta_fname=k03455_fasta_fname,
    pathogen_gtf_fname=k03455_gtf_fname,
)

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference
gffread -w /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.transcripts.fasta -g /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K03455.1/reference.gtf


In [None]:
# Joint hg38 + NL4-3 reference, using the GTF produced by the annotation experiment.
nl43_fasta_fname = data_dir / 'NL4-3.fasta'
nl43_gtf_fname = data_dir.parent / 'results' / 'annotation_nl43' / 'NL4-3.vira.gtf'

generate_genome_indices(
    data_dir / 'NL43',
    host_fasta_fname=hg38_fasta_fname,
    host_gtf_fname=chess_gtf_fname,
    pathogen_fasta_fname=nl43_fasta_fname,
    pathogen_gtf_fname=nl43_gtf_fname,
)

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference
gffread -w /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.transcripts.fasta -g /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/NL43/reference.gtf


In [None]:
# Joint hg38 + K02013.1 reference, using the GTF produced by the annotation experiment.
K02013_fasta_fname = data_dir / 'K02013.1.fasta'
K02013_gtf_fname = data_dir.parent / 'results' / 'annotation_K02013.1' / 'K02013.1.vira.gtf'

generate_genome_indices(
    data_dir / 'K02013.1',
    host_fasta_fname=hg38_fasta_fname,
    host_gtf_fname=chess_gtf_fname,
    pathogen_fasta_fname=K02013_fasta_fname,
    pathogen_gtf_fname=K02013_gtf_fname,
)

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference
gffread -w /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.transcripts.fasta -g /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/K02013.1/reference.gtf


In [9]:
# run for the PRJNA392230 custom genome
# Pathogen FASTA lives under the experiments data directory; the GTF comes from
# the annotation_PRJNA392230 results directory.
PRJNA392230_fasta_fname = prj_base_dir / 'HIV_Atlas_Experiments/data/PRJNA392230.fasta'
PRJNA392230_gtf_fname = prj_base_dir / 'HIV_Atlas_Experiments/results/annotation_PRJNA392230/PRJNA392230.vira.gtf'

# NOTE(review): the joint host + pathogen build below is disabled; only the
# pathogen-only build further down runs. Reason not recorded here - confirm
# whether the joint reference is still needed before deleting this.
# generate_genome_indices(
#     host_fasta_fname=hg38_fasta_fname,
#     host_gtf_fname=chess_gtf_fname,
#     pathogen_fasta_fname=PRJNA392230_fasta_fname,
#     pathogen_gtf_fname=PRJNA392230_gtf_fname,
#     fn_out_dir=data_dir / 'PRJNA392230',
# )

# Pathogen-only reference: no host FASTA/GTF is passed, so the outputs in
# 'PRJNA392230_hivonly' contain only the PRJNA392230 sequence and annotation.
generate_genome_indices(
    host_fasta_fname=None,
    host_gtf_fname=None,
    pathogen_fasta_fname=PRJNA392230_fasta_fname,
    pathogen_gtf_fname=PRJNA392230_gtf_fname,
    fn_out_dir=data_dir / 'PRJNA392230_hivonly',
)

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference
gffread -w /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.transcripts.fasta -g /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/PRJNA392230_hivonly/reference.gtf


In [None]:
# run for M33262.1

# extract from gz and convert with gffread from gff to gtf
siv_gz_fname = "/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Data/data/M33262.1/M33262.1.gff.gz"
siv239_fasta_fname = Path("/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Data/data/M33262.1/M33262.1.fasta").expanduser().resolve()

assert os.path.exists(siv_gz_fname), f"siv_gz_fname does not exist: {siv_gz_fname}"

outdir = data_dir / "SIV239"
os.makedirs(outdir, exist_ok=True)

siv239_gtf_fname = outdir / "M33262.1.gtf"
# Pipe `zcat` into `gffread` directly instead of via a shell pipeline, and check
# both exit codes: with `subprocess.call(..., shell=True)` a failure in either
# stage was ignored and the index build below ran on a missing or truncated GTF.
cmd = ["gffread", "-T", "-F", "-o", str(siv239_gtf_fname), "-"]
with subprocess.Popen(["zcat", siv_gz_fname], stdout=subprocess.PIPE) as zcat_proc:
    subprocess.run(cmd, stdin=zcat_proc.stdout, check=True)
# Leaving the `with` block waits for zcat, so returncode is set at this point.
if zcat_proc.returncode != 0:
    raise subprocess.CalledProcessError(zcat_proc.returncode, zcat_proc.args)

# Host (Macaca mulatta Mmul_10) genome and annotation.
mmul_fasta_fname = Path('/ccb/salz4-4/avaraby/Kristen.ATAC_RNA/data/Macaca_mulatta.Mmul_10.dna.toplevel.upper.fasta').expanduser().resolve()
mmul_gtf_fname = Path('/ccb/salz4-4/avaraby/siv_chimera_protocol/data/host.merged.clean.gtf').expanduser().resolve()

generate_genome_indices(
    host_fasta_fname=mmul_fasta_fname,
    host_gtf_fname=mmul_gtf_fname,
    pathogen_fasta_fname=siv239_fasta_fname,
    pathogen_gtf_fname=siv239_gtf_fname,
    fn_out_dir=outdir,
)

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference
gffread -w /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.transcripts.fasta -g /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/SIV239/reference.gtf
