project page: https://www.ncbi.nlm.nih.gov/bioproject/PRJNA287649
study: https://retrovirology.biomedcentral.com/articles/10.1186/s12977-015-0204-2
biosample_results: https://www.ncbi.nlm.nih.gov/biosample?Db=biosample&DbFrom=bioproject&Cmd=Link&LinkName=bioproject_biosample&LinkReadableName=BioSample&ordinalpos=1&IdsFromResult=287649

In [1]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

# Must start at column 0: a top-level statement with leading whitespace
# raises IndentationError.
from tqdm.autonotebook import tqdm


In [17]:
# ---------------------------------------------------------------------------
# Path configuration for this project. All locations are derived from the
# current working directory of the notebook.
# ---------------------------------------------------------------------------

# Inputs located under HIV_Atlas_Creation/data, three levels above the cwd.
data_dir = Path.cwd().parent.parent.parent / 'HIV_Atlas_Creation' / 'data'

sequence_dir = data_dir / 'sequences'
annotation_dir = data_dir / 'annotation'
# Raise explicitly rather than `assert`, which is skipped under `python -O`.
for required_name, required_dir in (
    ('sequence_dir', sequence_dir),
    ('annotation_dir', annotation_dir),
):
    if not required_dir.exists():
        raise FileNotFoundError(f"{required_name} does not exist: {required_dir}")

# K03454-only sequence and annotation. These used to be stored in
# `reference_fasta_fname` / `reference_gtf_fname`, which are reassigned to the
# hg38_K03454 files further down, so the values were silently lost. They now
# have their own names.
k03454_fasta_fname = sequence_dir / 'K03454.fasta'
k03454_gtf_fname = annotation_dir / 'K03454/K03454.vira.gtf'

base_dir = Path.cwd().parent.parent

soft_dir = base_dir / 'soft'

prj_id = 'PRJNA287649'

# Output tree: results/<prj_id>/{data/fastq, alignment, assembly, quantification}
outdir = base_dir / 'results' / prj_id
prj_data_dir = outdir / 'data'
prj_fastq_dir = prj_data_dir / 'fastq'
prj_aln_dir = outdir / 'alignment'
prj_assembly_dir = outdir / 'assembly'
prj_quant_dir = outdir / 'quantification'
for out_subdir in (
    outdir,
    prj_data_dir,
    prj_fastq_dir,
    prj_aln_dir,
    prj_assembly_dir,
    prj_quant_dir,
):
    out_subdir.mkdir(parents=True, exist_ok=True)

# Index basenames and reference files for hg38_K03454, all under base_dir/data.
rsem_idx_basename = base_dir / 'data' / 'hg38_K03454_rsem'
mudskipper_idx_basename = base_dir / 'data' / 'hg38_K03454.mudskipper'
hisat_idx_basename = base_dir / 'data' / 'hg38_K03454'
reference_fasta_fname = base_dir / 'data' / 'hg38_K03454.fasta'
reference_gtf_fname = base_dir / 'data' / 'hg38_K03454.gtf'
t2g_fname = base_dir / 'data' / 'hg38_K03454.t2g.tsv'

# BioSample table read by the metadata-loading cell.
biosample_results_fname = prj_data_dir / 'metadata/biosample_results.tsv'

In [3]:
# Load IPython's autoreload extension. Mode 1 reloads only the modules that
# were registered with %aimport, before each cell execution.
%load_ext autoreload
%autoreload 1

# Put soft_dir/genomic_scripts at the front of sys.path so that the
# `definitions` module can be found, then import it and register it for
# autoreload.
# NOTE(review): re-running this cell inserts the same path again.
sys.path.insert(0, str(soft_dir / "genomic_scripts"))
%aimport definitions

In [6]:
# load metadata
# Read the BioSample table, then look up the SRA records for the accessions
# in its "sra" column and attach each record's run accession.
metadata_df = pd.read_csv(biosample_results_fname, sep='\t')

db = SRAweb()
sample_ids = metadata_df["sra"].tolist()
batch_results = db.sra_metadata(sample_ids, detailed=True)

# Left join keeps every row of the BioSample table. The join key coming from
# the SRA side is dropped afterwards, leaving only "run_accession" as the
# added column.
run_lookup = batch_results[["run_accession","sample_accession"]]
metadata_df = metadata_df.merge(
    run_lookup,
    left_on="sra",
    right_on="sample_accession",
    how="left",
).drop(columns=["sample_accession"])
metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession
0,Donor 36 CD14+ mono,SAMN03785028,SRS967589,GSM1717162,SRR2072610
1,Donor 36 SLAN DC,SAMN03785027,SRS967590,GSM1717161,SRR2072609
2,Donor 36 CD1c+ mDC,SAMN03785026,SRS967591,GSM1717160,SRR2072608
3,Donor 36 pDC,SAMN03785025,SRS967593,GSM1717159,SRR2072607
4,Donor 28 CD14+ Mono,SAMN03785024,SRS967592,GSM1717158,SRR2072606


In [None]:
# extract column with experimental information
# The sample name is split on spaces: the first two tokens form the donor
# label, everything after the second space is the cell-type label.
sample_names = metadata_df["sample"]
metadata_df["cell_cat"] = sample_names.str.split(" ",n=2,expand=True)[2]
metadata_df["donor_cat"] = sample_names.str.split(' ').str[:2].str.join(' ')
metadata_df.to_csv(prj_data_dir / 'metadata/metadata.tsv',sep="\t",index=False)

# Phenotype table: the run accession (renamed to "id") plus the two labels.
pheno_df = metadata_df[["run_accession","donor_cat","cell_cat"]].rename(
    columns={"run_accession": "id"}
)
pheno_df.to_csv(prj_data_dir / 'metadata/pheno.tsv',sep="\t",index=False)

metadata_df.head()

In [4]:
# Record the phenotype table path and reload the metadata table from disk.
pheno_fname = prj_data_dir / 'metadata/pheno.tsv'
metadata_fname = prj_data_dir / 'metadata/metadata.tsv'
metadata_df = pd.read_csv(metadata_fname, sep='\t')
metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession,cell_cat,donor_cat
0,Donor 36 CD14+ mono,SAMN03785028,SRS967589,GSM1717162,SRR2072610,CD14+ mono,Donor 36
1,Donor 36 SLAN DC,SAMN03785027,SRS967590,GSM1717161,SRR2072609,SLAN DC,Donor 36
2,Donor 36 CD1c+ mDC,SAMN03785026,SRS967591,GSM1717160,SRR2072608,CD1c+ mDC,Donor 36
3,Donor 36 pDC,SAMN03785025,SRS967593,GSM1717159,SRR2072607,pDC,Donor 36
4,Donor 28 CD14+ Mono,SAMN03785024,SRS967592,GSM1717158,SRR2072606,CD14+ Mono,Donor 28


In [5]:
# Write one fasterq-dump command per run accession to a command list file.
# Nothing is downloaded here; the commands are only written out.
#
# The output directory is ensured once, *before* the list file is opened
# inside it. Previously this happened on every loop iteration, after the file
# in that directory was already open, where it could never have any effect.
prj_fastq_dir.mkdir(parents=True, exist_ok=True)
cmd_lst_fname = prj_fastq_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        cmd_dump = f"fasterq-dump {run_id} --outdir {prj_fastq_dir} --split-3"
        outFP.write(f"{cmd_dump}\n")

In [12]:
# align samples with hisat
# Build one hisat2 command line per run (reads passed with -U, SAM output in
# the alignment directory) and write them all to the command list file.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.txt'
hisat_cmd_lines = []
for run_id in metadata_df["run_accession"].tolist():
    cmd = f"hisat2 -p 25 --score-min L,0,-2 --mp 2,2 -x {hisat_idx_basename} -U {prj_fastq_dir}/{run_id}.fastq -S {prj_aln_dir}/{run_id}.sam"
    hisat_cmd_lines.append(f"{cmd}\n")
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(hisat_cmd_lines)

In [13]:
# sort reads by position
# One `samtools sort` command per run: <run>.sam -> <run>.sorted.bam, both in
# the alignment directory. Commands are written out, not executed.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.sort.txt'
run_ids = metadata_df["run_accession"].tolist()
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        cmd = f"samtools sort -@ 25 -o {prj_aln_dir}/{run_id}.sorted.bam {prj_aln_dir}/{run_id}.sam"
        # print appends the trailing newline
        print(cmd, file=outFP)

In [18]:
# convert to transcriptomic
# One `mudskipper bulk` command per run, reading <run>.sorted.bam and writing
# <run>.transcriptome.bam in the alignment directory. Commands are only
# written to the list file.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.mudskipper.txt'
run_ids = metadata_df["run_accession"].tolist()
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        cmd = f"{soft_dir / 'mudskipper/target/release/mudskipper'} bulk --shuffle --threads 25 --index {mudskipper_idx_basename} --alignment {prj_aln_dir}/{run_id}.sorted.bam --out {prj_aln_dir}/{run_id}.transcriptome.bam"
        outFP.write(cmd + "\n")

In [4]:
# assemble with stringtie
# One stringtie command per run, guided by the reference GTF (-G), reading the
# sorted BAM and writing <run>.gtf into the assembly directory.
cmd_lst_fname = prj_assembly_dir / 'cmd_lst.txt'
stringtie_cmd_lines = []
for run_id in metadata_df["run_accession"].tolist():
    cmd = f"stringtie -p 25 -G {reference_gtf_fname} -o {prj_assembly_dir}/{run_id}.gtf {prj_aln_dir}/{run_id}.sorted.bam"
    stringtie_cmd_lines.append(f"{cmd}\n")
with open(cmd_lst_fname, 'w') as outFP:
    outFP.write("".join(stringtie_cmd_lines))

In [None]:
# quantify with salmon
# Writes one `salmon quant` command per run to a command list file; nothing is
# executed here.

# `salmon_idx_basename` was referenced here without being defined in the
# configuration cell, so this cell failed with NameError. Define it only if it
# is not already set elsewhere.
# NOTE(review): the path follows the naming of the other hg38_K03454 indexes
# under base_dir/data and is an assumption -- confirm the real index location.
if 'salmon_idx_basename' not in globals():
    salmon_idx_basename = base_dir / 'data' / 'hg38_K03454.salmon'

cmd_lst_fname = prj_quant_dir / 'cmd_lst.salmon.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        mate1_fname = prj_fastq_dir / f"{run_id}_1.fastq"
        mate2_fname = prj_fastq_dir / f"{run_id}_2.fastq"
        if mate1_fname.exists() and mate2_fname.exists():
            # Paired-end layout: both mate files are present.
            reads_args = f"-1 {mate1_fname} -2 {mate2_fname}"
        else:
            # Otherwise use the single <run>.fastq file, the same input the
            # hisat2 cell passes with -U.
            reads_args = f"-r {prj_fastq_dir}/{run_id}.fastq"
        cmd = f"salmon quant -i {salmon_idx_basename} -l A {reads_args} -p 25 -o {prj_quant_dir}/{run_id}"
        outFP.write(f"{cmd}\n")

In [None]:
# generate counts matrix by merging counts files for all samples
# Each <run>.counts file is read as a headerless two-column TSV
# (transcript_id, count) and outer-joined on transcript_id, giving one column
# per run. Runs without a counts file, or with an empty one, are skipped.
counts_fname = prj_quant_dir / 'counts_matrix.tsv'
counts_df = pd.DataFrame(columns=['transcript_id'])
for run_id in metadata_df["run_accession"].tolist():
    sample_counts_fname = prj_quant_dir / f"{run_id}.counts"
    if not sample_counts_fname.exists():
        continue
    try:
        tmp_df = pd.read_csv(sample_counts_fname, sep='\t', header=None, names=['transcript_id', run_id])
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise instead of returning an empty
        # frame, so the emptiness check below would never be reached.
        continue
    if tmp_df.empty:
        continue
    counts_df = pd.merge(counts_df, tmp_df, on='transcript_id', how='outer')

# The outer join leaves NaN where a transcript is absent from a run's file.
# Those cells are set to 0 so the matrix has no holes.
sample_cols = [col for col in counts_df.columns if col != 'transcript_id']
if sample_cols:
    counts_df[sample_cols] = counts_df[sample_cols].fillna(0)
counts_df.to_csv(counts_fname, sep='\t', index=False)

# create normalized matrix with cpm values
# Every run column is divided by its own column total and scaled by 1e6.
# A run whose total is 0 yields NaN for that column (0/0).
cpm_fname = prj_quant_dir / 'cpm_matrix.tsv'
cpm_df = counts_df.set_index('transcript_id')  # set_index already returns a new frame
cpm_df = cpm_df.div(cpm_df.sum(axis=0), axis=1) * 1e6
cpm_df.reset_index(inplace=True)
cpm_df.to_csv(cpm_fname, sep='\t', index=False)
cpm_df.head()

Unnamed: 0,transcript_id,SRR2072610,SRR2072609,SRR2072608,SRR2072607,SRR2072606,SRR2072605
0,CHS.1.1,0,0,0,0,0,0
1,CHS.100.1,0,0,0,0,0,0
2,CHS.100.2,0,0,0,0,0,0
3,CHS.100.3,0,0,0,0,0,0
4,CHS.100.8,0,0,0,0,0,0


In [None]:
# run spit
# Runs `spit preprocess` followed by `spit dtu` on the CPM matrix, the
# phenotype table and the t2g mapping, writing into the spit output directory.
spit_dir = prj_quant_dir / 'spit'
spit_dir.mkdir(parents=True, exist_ok=True)

# Arguments shared by both invocations. Passing an argument list (no shell)
# keeps paths containing spaces or shell metacharacters intact.
spit_io_args = [
    "-i", str(cpm_fname),
    "-l", str(pheno_fname),
    "-m", str(t2g_fname),
    "-O", str(spit_dir),
]

# check=True raises CalledProcessError on a non-zero exit status, so `dtu` is
# not run after a failed `preprocess` (the exit status used to be ignored).
cmd = ["spit", "preprocess", *spit_io_args, "--write"]
print(" ".join(cmd))
subprocess.run(cmd, check=True)

cmd = ["spit", "dtu", *spit_io_args, "--n_iter", "10", "--plot"]
print(" ".join(cmd))
subprocess.run(cmd, check=True)