Prepare genome files (fasta, gtf, etc) and build indices for use in other tests

In [2]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

  from tqdm.autonotebook import tqdm


In [3]:
prj_base_dir = Path.cwd().parent.parent

soft_dir = prj_base_dir / 'HIV_Atlas_Experiments' / 'soft'

sequence_dir = prj_base_dir / 'HIV_Atlas_Creation' / 'data' / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = prj_base_dir / 'HIV_Atlas_Creation' / 'data' / 'annotation'
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

hiv_fasta_fname = sequence_dir / 'K03454.fasta'
hiv_gtf_fname = annotation_dir / 'K03454/K03454.vira.gtf'

data_dir = prj_base_dir / 'HIV_Atlas_Experiments' / 'data'
hg38_fasta_fname = Path('~/genomes/human/hg38/hg38_p12_ucsc.no_alts.no_fixs.fa')
chess_gtf_fname = Path('~/genomes/human/hg38/annotations/chess3.1.3.GRCh38.gtf')

In [4]:
%load_ext autoreload
%autoreload 1

sys.path.insert(0, str(soft_dir / "genomic_scripts"))
%aimport definitions

In [5]:
reference_fasta_fname = data_dir / 'hg38_K03454.fasta'
reference_gtf_fname = data_dir / 'hg38_K03454.gtf'

reference_ht2_index_prefix = data_dir / 'hg38_K03454'

In [5]:
# create joint reference fasta
with open(reference_fasta_fname, 'w') as outFP:
    for record in SeqIO.parse(hg38_fasta_fname.expanduser(), 'fasta'):
        SeqIO.write(record, outFP, 'fasta')
    for record in SeqIO.parse(hiv_fasta_fname.expanduser(), 'fasta'):
        SeqIO.write(record, outFP, 'fasta')

# create joint reference gtf
with open(reference_gtf_fname, 'w') as outFP:
    with open(chess_gtf_fname.expanduser(), 'r') as inFP:
        for line in inFP:
            if line.startswith('#'):
                continue
            else:
                outFP.write(line)
    with open(hiv_gtf_fname.expanduser(), 'r') as inFP:
        for line in inFP:
            if line.startswith('#'):
                continue
            else:
                outFP.write(line)

In [6]:
# build hisat2 index

# extract junctions and exons files for the index construction
junctions_fname = data_dir / 'hg38_K03454.junctions'
exons_fname = data_dir / 'hg38_K03454.exons'

subprocess.run(["hisat2_extract_splice_sites.py", reference_gtf_fname], stdout=open(junctions_fname, 'w'), check=True)
print(f"Splice sites extracted to {junctions_fname}")

subprocess.run(["hisat2_extract_exons.py", reference_gtf_fname], stdout=open(exons_fname, 'w'), check=True)
print(f"Exons extracted to {exons_fname}")

cmd = ["hisat2-build", "-p", "64", "--ss", str(junctions_fname), "--exon", str(exons_fname), str(reference_fasta_fname), str(reference_ht2_index_prefix)]
print(' '.join(cmd))

Splice sites extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.junctions
Exons extracted to /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.exons
hisat2-build -p 64 --ss /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.junctions --exon /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.exons /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454


In [12]:
# generate transcript to gene id mapping
t2g = definitions.get_attribute(reference_gtf_fname,"gene_id")
t2g.rename(columns={'tid':"tx_id"}, inplace=True)
t2g.to_csv(data_dir / 'hg38_K03454.t2g.tsv', sep='\t', index=False)
t2g.head()

Unnamed: 0,tx_id,gene_id
0,CHS.1.1,CHS.1
1,CHS.2.1,CHS.2
2,CHS.166734.2,CHS.166734
3,CHS.166734.1,CHS.166734
4,CHS.166734.3,CHS.166734


In [10]:
# prep mudskipper index
cmd = f"{soft_dir}/mudskipper/target/release/mudskipper index --gtf {reference_gtf_fname} --dir-index {data_dir / 'hg38_K03454.mudskipper'}"
print(cmd)
subprocess.run(cmd, shell=True, check=True)

/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/soft/mudskipper/target/release/mudskipper index --gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf --dir-index /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.mudskipper


[2024-12-19T19:12:11Z INFO  mudskipper] Mudskipper started...
[2024-12-19T19:12:11Z INFO  mudskipper::annotation] reading the gtf file and building the tree.
[2024-12-19T19:12:20Z INFO  mudskipper::annotation] building the tree
[2024-12-19T19:12:20Z INFO  mudskipper::annotation] Time to build the tree: 8.577206464s
[2024-12-19T19:12:20Z INFO  mudskipper::annotation] saving the GTF index
[2024-12-19T20:07:19Z INFO  mudskipper::annotation] Done with saving the GTF index
[2024-12-19T20:07:19Z INFO  mudskipper] Mudskipper finished.


CompletedProcess(args='/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/soft/mudskipper/target/release/mudskipper index --gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf --dir-index /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.mudskipper', returncode=0)

In [None]:
# build salmon index
# 1. extract transcript sequences
transcript_fasta_fname = data_dir / 'hg38_K03454.transcripts.fasta'
cmd = f"gffread -w {transcript_fasta_fname} -g {reference_fasta_fname} {reference_gtf_fname}"
print(cmd)
subprocess.run(cmd, shell=True, check=True)

In [14]:
# rsem prep
cmd = f"rsem-prepare-reference --gtf {reference_gtf_fname} {reference_fasta_fname} {data_dir / 'hg38_K03454_rsem'}"
print(cmd)
subprocess.run(cmd, shell=True, check=True)

rsem-prepare-reference --gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem
rsem-extract-reference-transcripts /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem 0 /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf None 0 /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta




Parsed 200000 lines
Parsed 400000 lines
Parsed 600000 lines
Parsed 800000 lines
Parsed 1000000 lines
Parsed 1200000 lines
Parsed 1400000 lines
Parsed 1600000 lines
Parsed 1800000 lines
Parsed 2000000 lines
Parsed 2200000 lines
Parsed 2400000 lines
Parsed 2600000 lines
Parsing gtf File is done!
/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta is processed!
158404 transcripts are extracted.
Extracting sequences is done!
Group File is generated!
Transcript Information File is generated!
Chromosome List File is generated!
Extracted Sequences File is generated!

rsem-preref /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem.transcripts.fa 1 /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem
Refs.makeRefs finished!
Refs.saveRefs finished!
/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem.idx.fa is generated!
/ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem.n2g.id

CompletedProcess(args='rsem-prepare-reference --gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454_rsem', returncode=0)

In [7]:
# prepare for dexseq
cmd = f"dexseq_prepare_annotation.py {reference_gtf_fname} {data_dir / 'hg38_K03454.dexseq.gtf'}"
print(cmd)
subprocess.run(cmd, shell=True, check=True)

dexseq_prepare_annotation.py /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.dexseq.gtf


/bin/sh: line 1: dexseq_prepare_annotation.py: command not found


CalledProcessError: Command 'dexseq_prepare_annotation.py /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.gtf /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/data/hg38_K03454.dexseq.gtf' returned non-zero exit status 127.