Functional impairment of HIV-specific CD8+ T cells precedes aborted spontaneous control of viremia

project page: https://www.ncbi.nlm.nih.gov/bioproject/?term=HIV+longitudinal+RNA-Seq
study: https://www.cell.com/immunity/fulltext/S1074-7613(21)00337-X?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS107476132100337X%3Fshowall%3Dtrue
biosample_results: https://www.ncbi.nlm.nih.gov/biosample?Db=biosample&DbFrom=bioproject&Cmd=Link&LinkName=bioproject_biosample&LinkReadableName=BioSample&ordinalpos=1&IdsFromResult=706759

In [3]:
import csv
import os
import random
import re
import shlex
import subprocess
import sys
import time
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
import requests

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

from tqdm.autonotebook import tqdm


In [4]:
# ---------------------------------------------------------------------------
# Path configuration for the whole notebook. All locations are derived from
# the current working directory, so the notebook must be started from its own
# folder for these relative hops to resolve.
# ---------------------------------------------------------------------------

# Inputs that live in the sibling HIV_Atlas_Creation checkout (three levels up).
data_dir = Path.cwd().parent.parent.parent / 'HIV_Atlas_Creation' / 'data'

sequence_dir = data_dir / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = data_dir / 'annotation'
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

# Standalone K03454 reference files. These used to be stored in
# reference_fasta_fname / reference_gtf_fname and were then silently
# overwritten further down; they now have their own names so both sets of
# paths stay reachable.
k03454_fasta_fname = sequence_dir / 'K03454.fasta'
k03454_gtf_fname = annotation_dir / 'K03454/K03454.vira.gtf'

# Root of this analysis (two levels above the working directory).
base_dir = Path.cwd().parent.parent

prj_id = 'PRJNA706759'

# Per-project output tree: results/<prj_id>/{data/{fastq,metadata},alignment,assembly,quantification}
outdir = base_dir / 'results' / prj_id
prj_data_dir = outdir / 'data'
prj_fastq_dir = prj_data_dir / 'fastq'
prj_metadata_dir = prj_data_dir / 'metadata'
prj_aln_dir = outdir / 'alignment'
prj_assembly_dir = outdir / 'assembly'
prj_quant_dir = outdir / 'quantification'

for out_subdir in (
    outdir,
    prj_data_dir,
    prj_fastq_dir,
    prj_metadata_dir,
    prj_aln_dir,
    prj_assembly_dir,
    prj_quant_dir,
):
    out_subdir.mkdir(parents=True, exist_ok=True)

# Reference used by every downstream command in this notebook.
# NOTE(review): presumably the hg38 genome combined with K03454 — confirm.
hisat_idx_basename = base_dir / 'data' / 'hg38_K03454'
reference_fasta_fname = base_dir / 'data' / 'hg38_K03454.fasta'
reference_gtf_fname = base_dir / 'data' / 'hg38_K03454.gtf'

# Metadata tables read by the metadata-loading cell.
biosample_results_fname = prj_data_dir / 'metadata/biosample_results.tsv'
sample_metadata_fname = prj_data_dir / 'metadata/sample_metadata.tsv'

In [3]:
# load metadata
# Join the BioSample listing with the per-sample metadata table; the sample
# name is called 'sample' on the left and 'title' on the right. This is an
# inner join, so samples present in only one of the two tables are dropped.
biosample_results_df = pd.read_csv(biosample_results_fname, sep='\t')
sample_metadata_df = pd.read_csv(sample_metadata_fname, sep='\t')
metadata_df = biosample_results_df.merge(sample_metadata_df, left_on='sample', right_on="title")
metadata_df = metadata_df.drop(columns=['title'])

# Look up the run accession for every sample accession in the 'sra' column.
# This queries the SRA web service, so it needs network access.
db = SRAweb()
batch_results = db.sra_metadata(metadata_df["sra"].tolist(), detailed=True)

# De-duplicate the (run, sample) lookup first so that a repeated pair cannot
# multiply rows in the merge below.
run_lookup_df = batch_results[["run_accession", "sample_accession"]].drop_duplicates()
metadata_df = metadata_df.merge(run_lookup_df, left_on="sra", right_on="sample_accession", how="left")
metadata_df = metadata_df.drop(columns=["sample_accession"])

# The left join keeps samples without a matching run (run_accession = NaN).
# Report them here instead of letting them surface later as bogus commands.
samples_without_run = metadata_df.loc[metadata_df["run_accession"].isna(), "sample"].tolist()
if samples_without_run:
    print(f"WARNING: no run accession found for {len(samples_without_run)} sample(s): {samples_without_run}")

metadata_df.to_csv(prj_data_dir / 'metadata/metadata.tsv',sep="\t",index=False)

In [5]:
# Reload the merged metadata table written by the previous cell, then preview
# the first rows.
metadata_tsv_path = prj_data_dir / 'metadata/metadata.tsv'
metadata_df = pd.read_csv(filepath_or_buffer=metadata_tsv_path, sep='\t')
metadata_df.head()

Unnamed: 0.1,sample,biosample,sra,geo,Unnamed: 0,Sample name,source name,organism,characteristics: PatientID,characteristics: Patient.Category,...,characteristics: Total.Reads,characteristics: Total.Align,processed data file,FileName.R1,FileName.R2,UnalignableReads,UniqueAlignReads,MultipleMapReads,MultiLocReads,run_accession
0,DC04 at T3 2,SAMN18143472,SRS8382647,GSM5136118,30,A0173.S0039,Sorted tetramer positive CD8+ lymphocytes,Homo sapiens,DC04,DC,...,56654264,179668814,,A0173.S0039_R1_001.fastq.gz,A0173.S0039_R2_001.fastq.gz,25862430,17946316,12845342,24312056,SRR13860301
1,DC03 at T1 1,SAMN18143471,SRS8382648,GSM5136119,31,A0173.S0040,Sorted tetramer positive CD8+ lymphocytes,Homo sapiens,DC03,DC,...,27435242,29278363,,A0173.S0040_R1_001.fastq.gz,A0173.S0040_R2_001.fastq.gz,21991807,2787066,2656360,4476084,SRR13860302
2,DC03 at T1 2,SAMN18143470,SRS8382649,GSM5136120,32,A0173.S0041,Sorted tetramer positive CD8+ lymphocytes,Homo sapiens,DC03,DC,...,36528743,46938417,,A0173.S0041_R1_001.fastq.gz,A0173.S0041_R2_001.fastq.gz,27570111,4985675,3972951,7225382,SRR13860303
3,DC03 at T1 3,SAMN18143469,SRS8382650,GSM5136121,33,A0173.S0042,Sorted tetramer positive CD8+ lymphocytes,Homo sapiens,DC03,DC,...,34838174,47823066,,A0173.S0042_R1_001.fastq.gz,A0173.S0042_R2_001.fastq.gz,25543489,5303330,3991353,7451740,SRR13860304
4,DC03 at T3 3,SAMN18143468,SRS8382656,GSM5136127,39,A0173.S0048,Sorted tetramer positive CD8+ lymphocytes,Homo sapiens,DC03,DC,...,27339863,27651869,,A0173.S0048_R1_001.fastq.gz,A0173.S0048_R2_001.fastq.gz,21872951,3198246,2268663,4377243,SRR13860310


In [5]:
# Write one fasterq-dump command per run into <fastq dir>/cmd_lst.txt.
# The commands are only written here, not executed.

# The command list lives inside the fastq directory, so the directory has to
# exist before the file is opened (previously this ran once per run, after
# the open).
os.makedirs(prj_fastq_dir, exist_ok=True)

# Skip samples whose run accession is missing (NaN after the left join on the
# SRA lookup) and avoid emitting the same download twice.
run_ids = metadata_df["run_accession"].dropna().drop_duplicates().tolist()

cmd_lst_fname = prj_fastq_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        # shlex.quote leaves ordinary paths untouched and protects paths that
        # contain spaces or shell metacharacters.
        cmd_dump = (
            f"fasterq-dump {shlex.quote(str(run_id))} "
            f"--outdir {shlex.quote(str(prj_fastq_dir))} --split-3"
        )
        outFP.write(f"{cmd_dump}\n")

In [12]:
# align samples with hisat
# Write one paired-end hisat2 command per run into <alignment dir>/cmd_lst.txt.
# The commands are only written here, not executed.

# Skip samples without a run accession and avoid duplicate commands.
run_ids = metadata_df["run_accession"].dropna().drop_duplicates().tolist()

cmd_lst_fname = prj_aln_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        # Quote every path so a working directory containing spaces does not
        # break the generated command line.
        index_arg = shlex.quote(str(hisat_idx_basename))
        mate1_arg = shlex.quote(str(prj_fastq_dir / f"{run_id}_1.fastq"))
        mate2_arg = shlex.quote(str(prj_fastq_dir / f"{run_id}_2.fastq"))
        sam_arg = shlex.quote(str(prj_aln_dir / f"{run_id}.sam"))
        # --dta: these alignments are assembled with StringTie in a later
        # cell, and the HISAT2/StringTie manuals call for --dta in that case.
        cmd = (
            f"hisat2 -p 25 --dta --score-min L,0,-2 --mp 2,2 -x {index_arg} "
            f"-1 {mate1_arg} -2 {mate2_arg} -S {sam_arg}"
        )
        outFP.write(f"{cmd}\n")

In [None]:
# sort alignments with samtools
# Write one `samtools sort` command per run into <alignment dir>/cmd_lst.sort.txt,
# turning <run>.sam into <run>.sorted.bam. No -n flag is passed, so the output
# uses samtools' default sort order. Commands are only written, not executed.

# Skip samples without a run accession and avoid duplicate commands.
run_ids = metadata_df["run_accession"].dropna().drop_duplicates().tolist()

cmd_lst_fname = prj_aln_dir / 'cmd_lst.sort.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        # Quote paths so directories containing spaces stay one argument.
        sorted_bam_arg = shlex.quote(str(prj_aln_dir / f"{run_id}.sorted.bam"))
        sam_arg = shlex.quote(str(prj_aln_dir / f"{run_id}.sam"))
        cmd = f"samtools sort -@ 25 -o {sorted_bam_arg} {sam_arg}"
        outFP.write(f"{cmd}\n")

In [6]:
# assemble with stringtie
# Write one reference-guided (-G) stringtie command per run into
# <assembly dir>/cmd_lst.txt. Each command reads <run>.sorted.bam and writes
# <run>.gtf. Commands are only written here, not executed.

# Skip samples without a run accession and avoid duplicate commands.
run_ids = metadata_df["run_accession"].dropna().drop_duplicates().tolist()

cmd_lst_fname = prj_assembly_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        # Quote paths so directories containing spaces stay one argument.
        guide_gtf_arg = shlex.quote(str(reference_gtf_fname))
        out_gtf_arg = shlex.quote(str(prj_assembly_dir / f"{run_id}.gtf"))
        sorted_bam_arg = shlex.quote(str(prj_aln_dir / f"{run_id}.sorted.bam"))
        cmd = f"stringtie -p 25 -G {guide_gtf_arg} -o {out_gtf_arg} {sorted_bam_arg}"
        outFP.write(f"{cmd}\n")

In [7]:
# run htseq-count to quantify transcript expression
# Write one htseq-count command per run into <quantification dir>/cmd_lst.txt.
# Each command counts reads over 'exon' features grouped by 'transcript_id'
# and redirects stdout to <run>.counts, so the list must be run through a
# shell. Commands are only written here, not executed.

# Skip samples without a run accession and avoid duplicate commands.
run_ids = metadata_df["run_accession"].dropna().drop_duplicates().tolist()

cmd_lst_fname = prj_quant_dir / 'cmd_lst.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in run_ids:
        # Quote paths so directories containing spaces stay one argument.
        sorted_bam_arg = shlex.quote(str(prj_aln_dir / f"{run_id}.sorted.bam"))
        gtf_arg = shlex.quote(str(reference_gtf_fname))
        counts_arg = shlex.quote(str(prj_quant_dir / f"{run_id}.counts"))
        # -r pos: the BAMs come from a plain `samtools sort` (no -n) of
        # paired-end alignments, while htseq-count assumes name-sorted input
        # unless told otherwise.
        # NOTE(review): -n parallelises across input files; with a single BAM
        # per command it likely has no effect — confirm.
        cmd = (
            f"htseq-count -n 20 -f bam -r pos -s no -t exon -i transcript_id "
            f"{sorted_bam_arg} {gtf_arg} > {counts_arg}"
        )
        outFP.write(f"{cmd}\n")