Selection of epigenetically privileged HIV-1 proviruses during treatment with panobinostat and interferon-α2a [RNA-seq] (human)

project page: https://www.ncbi.nlm.nih.gov/bioproject/982154
study: https://www.cell.com/cell/fulltext/S0092-8674(24)00105-3?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867424001053%3Fshowall%3Dtrue
biosample_results: https://www.ncbi.nlm.nih.gov/biosample?Db=biosample&DbFrom=bioproject&Cmd=Link&LinkName=bioproject_biosample&LinkReadableName=BioSample&ordinalpos=1&IdsFromResult=706759

In [2]:
import csv
import os
import random
import re
import shlex
import subprocess
import sys
import time
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
import requests

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

from tqdm.autonotebook import tqdm


In [3]:
# --- Reference inputs -------------------------------------------------------
# The reference data is expected under HIV_Atlas_Creation/data, three levels
# above the current working directory. Both sub-directories must already exist.
data_dir = Path.cwd().parent.parent.parent.joinpath('HIV_Atlas_Creation', 'data')

sequence_dir = data_dir.joinpath('sequences')
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = data_dir.joinpath('annotation')
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

# K03454 reference sequence and its annotation (existence is not checked here).
reference_fasta_fname = sequence_dir.joinpath('K03454.fasta')
reference_gtf_fname = annotation_dir.joinpath('K03454/K03454.vira.gtf')

# --- Project layout ---------------------------------------------------------
# Everything produced for this BioProject lives under <base_dir>/results/<prj_id>.
base_dir = Path.cwd().parent.parent

prj_id = 'PRJNA982154'

outdir = base_dir.joinpath('results', prj_id)
prj_data_dir = outdir.joinpath('data')
prj_fastq_dir = prj_data_dir.joinpath('fastq')
prj_aln_dir = outdir.joinpath('alignment')
prj_assembly_dir = outdir.joinpath('assembly')

# Create the output tree; each call is a no-op when the directory is present.
outdir.mkdir(parents=True, exist_ok=True)
prj_data_dir.mkdir(parents=True, exist_ok=True)
prj_fastq_dir.mkdir(parents=True, exist_ok=True)
prj_aln_dir.mkdir(parents=True, exist_ok=True)
prj_assembly_dir.mkdir(parents=True, exist_ok=True)

# Basename passed to hisat2 via -x when building the alignment commands.
hisat_idx_basename = base_dir.joinpath('data', 'hg38_K03454')

# BioSample table that the metadata cell reads (the metadata/ directory is not
# created by this cell).
biosample_results_fname = prj_data_dir.joinpath('metadata/biosample_results.tsv')

In [4]:
# Load the BioSample table and attach a run accession to every sample.
# The "sra" column holds the sample accessions that are looked up through
# pysradb's SRAweb client.
metadata_df = pd.read_csv(biosample_results_fname, sep='\t')

db = SRAweb()
batch_results = db.sra_metadata(metadata_df["sra"].tolist(), detailed=True)

# Left join keeps every row of the BioSample table; the join key coming from
# the SRA side is dropped again because it duplicates the "sra" column.
metadata_df = metadata_df.merge(
    batch_results[["run_accession", "sample_accession"]],
    left_on="sra",
    right_on="sample_accession",
    how="left",
)
metadata_df = metadata_df.drop(columns=["sample_accession"])

# A left join leaves NaN in run_accession for samples that were not resolved.
# Fail here rather than let later cells build commands around a "nan" run id.
unresolved_samples = metadata_df.loc[metadata_df["run_accession"].isna(), "sra"].tolist()
assert not unresolved_samples, f"no run_accession found for samples: {unresolved_samples}"

metadata_df.to_csv(prj_data_dir / 'metadata/metadata.tsv',sep="\t",index=False)

In [4]:
# Load the sample/run metadata table from metadata/metadata.tsv and preview
# its first rows.
metadata_df = pd.read_csv(
    prj_data_dir.joinpath('metadata/metadata.tsv'),
    sep='\t',
)
metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession
0,Pt04 - Day 0 - Arm A,SAMN35688046,SRS17947352,GSM7472118,SRR24886914
1,Pt02 - Day 0 - Arm A,SAMN35688045,SRS17947353,GSM7472119,SRR24886913
2,Pt13 - Day 0 - Arm B,SAMN35688044,SRS17947372,GSM7472120,SRR24886895
3,Pt11 - Day 0 - Arm B,SAMN35688043,SRS17947370,GSM7472121,SRR24886894
4,Pt09 - Day 0 - Arm B,SAMN35688042,SRS17947373,GSM7472122,SRR24886893


In [5]:
# Write one fasterq-dump command per run accession to fastq/cmd_lst.txt.
# The commands are only written to the list here; nothing is executed.

# Make sure the output directory exists *before* opening a file inside it
# (done once, not once per run).
os.makedirs(prj_fastq_dir, exist_ok=True)

# Shell-quote interpolated values so the commands stay valid when the project
# path contains spaces or shell metacharacters; plain paths are left unchanged.
fastq_dir_arg = shlex.quote(str(prj_fastq_dir))

cmd_lst_fname = prj_fastq_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        cmd_dump = f"fasterq-dump {shlex.quote(str(run_id))} --outdir {fastq_dir_arg} --split-3"
        outFP.write(f"{cmd_dump}\n")

In [12]:
# Write one hisat2 alignment command per run accession to alignment/cmd_lst.txt.
# Each command reads <run>_1.fastq / <run>_2.fastq from the fastq directory and
# writes <run>.sam into the alignment directory. Nothing is executed here.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.txt'

# Shell-quote every interpolated path so the commands stay valid when the
# project path contains spaces or shell metacharacters; plain paths are left
# unchanged.
hisat_idx_arg = shlex.quote(str(hisat_idx_basename))

with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        mate1_arg = shlex.quote(f"{prj_fastq_dir}/{run_id}_1.fastq")
        mate2_arg = shlex.quote(f"{prj_fastq_dir}/{run_id}_2.fastq")
        sam_arg = shlex.quote(f"{prj_aln_dir}/{run_id}.sam")
        cmd = f"hisat2 -p 25 --score-min L,0,-2 --mp 2,2 -x {hisat_idx_arg} -1 {mate1_arg} -2 {mate2_arg} -S {sam_arg}"
        outFP.write(f"{cmd}\n")

In [13]:
# Write one `samtools sort` command per run accession to
# alignment/cmd_lst.sort.txt. Each command turns <run>.sam in the alignment
# directory into <run>.sorted.bam next to it. Nothing is executed here.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.sort.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        # Shell-quote the paths so the command stays valid when the project
        # path contains spaces or shell metacharacters.
        sorted_bam_arg = shlex.quote(f"{prj_aln_dir}/{run_id}.sorted.bam")
        sam_arg = shlex.quote(f"{prj_aln_dir}/{run_id}.sam")
        cmd = f"samtools sort -@ 25 -o {sorted_bam_arg} {sam_arg}"
        outFP.write(f"{cmd}\n")

In [None]:
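# TODO: decide whether to extract only the HXB2 reads or to keep everything for assembly and differential analysis.
# NOTE(review): the reference configured above is K03454 — confirm that "HXB2" here refers to that sequence.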