Prepare genome files (FASTA, GTF, etc.) and build indices for use in other tests.

In [2]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

  from tqdm.autonotebook import tqdm


In [3]:
# Project root: two directory levels above the current working directory.
prj_base_dir = Path.cwd().parent.parent

# Input directories located under HIV_Atlas_Creation/data; fail early with a
# descriptive message if either one is missing.
sequence_dir = prj_base_dir.joinpath('HIV_Atlas_Creation', 'data', 'sequences')
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = prj_base_dir.joinpath('HIV_Atlas_Creation', 'data', 'annotation')
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

# K03454 sequence (fasta) and annotation (gtf) inside those directories.
hiv_fasta_fname = sequence_dir.joinpath('K03454.fasta')
hiv_gtf_fname = annotation_dir.joinpath('K03454/K03454.vira.gtf')

# Experiment data directory, plus the hg38 genome and CHESS annotation paths.
# The '~' is left unexpanded here; call .expanduser() before opening them.
data_dir = prj_base_dir.joinpath('HIV_Atlas_Experiments', 'data')
hg38_fasta_fname = Path('~/genomes/human/hg38/hg38_p12_ucsc.no_alts.no_fixs.fa')
chess_gtf_fname = Path('~/genomes/human/hg38/annotations/chess3.1.3.GRCh38.gtf')

In [4]:
# Paths inside data_dir for the combined hg38 + K03454 reference (fasta and
# gtf) and the prefix under which the HISAT2 index files are named.
reference_fasta_fname = data_dir.joinpath('hg38_K03454.fasta')
reference_gtf_fname = data_dir.joinpath('hg38_K03454.gtf')

reference_ht2_index_prefix = data_dir.joinpath('hg38_K03454')

In [5]:
# create joint reference fasta
# The joint reference is the hg38 genome followed by the HIV genome (K03454).
# The second input must be the HIV fasta (not the CHESS annotation GTF);
# otherwise the HIV sequence is missing from the joint reference.
with open(reference_fasta_fname, 'w') as outFP:
    for fasta_fname in (hg38_fasta_fname, hiv_fasta_fname):
        # expanduser() resolves the '~' used in the hg38 path and is a no-op
        # for the already-absolute HIV path
        for record in SeqIO.parse(fasta_fname.expanduser(), 'fasta'):
            SeqIO.write(record, outFP, 'fasta')

# create joint reference gtf
# Concatenate the CHESS (human) annotation and the HIV annotation into one
# file, dropping every line that starts with '#' (comment/header lines).
with open(reference_gtf_fname, 'w') as outFP:
    for gtf_fname in (chess_gtf_fname, hiv_gtf_fname):
        with open(gtf_fname.expanduser(), 'r') as inFP:
            for line in inFP:
                if line.startswith('#'):
                    continue
                # A final line without a trailing newline would otherwise be
                # fused with the first record of the next file.
                if not line.endswith('\n'):
                    line += '\n'
                outFP.write(line)

In [6]:
# build hisat2 index

# extract junctions and exons files for the index construction
junctions_fname = data_dir / 'hg38_K03454.junctions'
exons_fname = data_dir / 'hg38_K03454.exons'

# Each extraction script writes to stdout, which is redirected into the
# target file. The `with` block closes (and flushes) the file as soon as the
# script finishes; check=True raises CalledProcessError on a non-zero exit.
with open(junctions_fname, 'w') as junctionsFP:
    subprocess.run(["hisat2_extract_splice_sites.py", str(reference_gtf_fname)], stdout=junctionsFP, check=True)
print(f"Splice sites extracted to {junctions_fname}")

with open(exons_fname, 'w') as exonsFP:
    subprocess.run(["hisat2_extract_exons.py", str(reference_gtf_fname)], stdout=exonsFP, check=True)
print(f"Exons extracted to {exons_fname}")

# NOTE(review): the hisat2-build command is only assembled and printed here,
# not executed -- presumably it is run manually outside the notebook; confirm.
cmd = ["hisat2-build", "-p", "64", "--ss", str(junctions_fname), "--exon", str(exons_fname), str(reference_fasta_fname), str(reference_ht2_index_prefix)]
print(' '.join(cmd))

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454
