GEO series: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE244468
project page: https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1023231
study: https://www.nature.com/articles/s41594-024-01468-3#Sec2
biosample_results: https://www.ncbi.nlm.nih.gov/biosample?LinkName=bioproject_biosample_all&from_uid=1023231

In [None]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import f_oneway

import seaborn as sns
import matplotlib.pyplot as plt

from pysradb import SRAweb

from Bio import SeqIO

In [2]:
# --- inputs shared with the HIV_Atlas_Creation checkout (three levels above the working directory) ---
data_dir = Path.cwd().parent.parent.parent / 'HIV_Atlas_Creation' / 'data'
sequence_dir = data_dir / 'sequences'
annotation_dir = data_dir / 'annotation'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

# --- this repository (two levels above the working directory) ---
base_dir = Path.cwd().parent.parent
soft_dir = base_dir / 'soft'

prj_id = 'PRJNA1023231'

# --- per-project output tree: results/<prj_id>/{data/fastq,alignment,assembly,quantification} ---
outdir = base_dir / 'results' / prj_id
prj_data_dir = outdir / 'data'
prj_fastq_dir = prj_data_dir / 'fastq'
prj_aln_dir = outdir / 'alignment'
prj_assembly_dir = outdir / 'assembly'
prj_quant_dir = outdir / 'quantification'
for _out_path in (outdir, prj_data_dir, prj_fastq_dir, prj_aln_dir, prj_assembly_dir, prj_quant_dir):
    _out_path.mkdir(parents=True, exist_ok=True)

# --- NL43 reference files; all of them share the 'reference' stem ---
_nl43_dir = base_dir / 'data' / 'NL43'
hisat_idx_basename = _nl43_dir / 'reference'
reference_fasta_fname = _nl43_dir / 'reference.fasta'
reference_gtf_fname = _nl43_dir / 'reference.gtf'
t2g_fname = _nl43_dir / 'reference.t2g.tsv'
# sequence name used to pull viral alignments/transcripts out of the results
hiv_accid = "AF324493.2"

# BioSample table read by the metadata cell (must already exist under data/metadata)
biosample_results_fname = prj_data_dir / 'metadata' / 'biosample_results.tsv'

# executable name of the sashimi plotting script, resolved via PATH when invoked
sashimi_bin = "sashimi.py"

In [3]:
# Enable IPython's autoreload extension in mode 1: only modules registered
# with %aimport are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 1

# Put the project's genomic_scripts directory first on sys.path so that the
# `definitions` helper module is importable, then register it for autoreload.
sys.path.insert(0, str(soft_dir / "genomic_scripts"))
%aimport definitions

In [4]:
# load metadata
# Read the BioSample table, then ask SRA for the run accession of every sample.
metadata_df = pd.read_csv(biosample_results_fname, sep='\t')

db = SRAweb()
batch_results = db.sra_metadata(metadata_df["sra"].tolist(), detailed=True)

# Left join keeps every BioSample row; the SRA-side join key is dropped once merged.
metadata_df = (
    metadata_df
    .merge(
        batch_results[["run_accession", "sample_accession"]],
        left_on="sra",
        right_on="sample_accession",
        how="left",
    )
    .drop(columns=["sample_accession"])
)
metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession
0,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep1",SAMN37652043,SRS19047042,GSM7816472,SRR26261550
1,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep2",SAMN37652042,SRS19047046,GSM7816473,SRR26261541
2,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep1",SAMN37652041,SRS19047047,GSM7816474,SRR26261540
3,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep2",SAMN37652040,SRS19047048,GSM7816475,SRR26261539
4,"SupT1 infected, RiboSeq, cyclo, 16hpi, rep1",SAMN37652039,SRS19048965,GSM7816476,SRR26261538


In [5]:
# extract column with experimental information
# The sample title is a comma-separated list of fields. Field 1 is the assay;
# for RNASeq rows the code reads timepoint/replicate one position earlier than
# for the other rows (which carry an extra treatment field at position 2).
title_parts = metadata_df["sample"].str.split(",", expand=True)
metadata_df["sequencing_cat"] = title_parts[1].str.strip()
is_rnaseq = metadata_df["sequencing_cat"] == "RNASeq"

metadata_df["treatment_cat"] = np.where(is_rnaseq, "-", title_parts[2].str.strip())
metadata_df["timepoint_cat"] = np.where(is_rnaseq, title_parts[2].str.strip(), title_parts[3].str.strip())
metadata_df["rep_cat"] = np.where(is_rnaseq, title_parts[3].str.strip(), title_parts[4].str.strip())
metadata_df.to_csv(prj_data_dir / 'metadata' / 'metadata.tsv', sep="\t", index=False)

# Phenotype table: the same design columns keyed by run accession (renamed to "id").
pheno_df = (
    metadata_df[["run_accession", "sequencing_cat", "treatment_cat", "timepoint_cat", "rep_cat"]]
    .rename(columns={"run_accession": "id"})
)
pheno_df.to_csv(prj_data_dir / 'metadata' / 'pheno.tsv', sep="\t", index=False)

metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession,sequencing_cat,treatment_cat,timepoint_cat,rep_cat
0,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep1",SAMN37652043,SRS19047042,GSM7816472,SRR26261550,RiboSeq,cyclo,0hpi,rep1
1,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep2",SAMN37652042,SRS19047046,GSM7816473,SRR26261541,RiboSeq,cyclo,0hpi,rep2
2,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep1",SAMN37652041,SRS19047047,GSM7816474,SRR26261540,RiboSeq,cyclo,8hpi,rep1
3,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep2",SAMN37652040,SRS19047048,GSM7816475,SRR26261539,RiboSeq,cyclo,8hpi,rep2
4,"SupT1 infected, RiboSeq, cyclo, 16hpi, rep1",SAMN37652039,SRS19048965,GSM7816476,SRR26261538,RiboSeq,cyclo,16hpi,rep1


In [4]:
# Reload the annotated metadata table from its TSV so the cells below can run
# without repeating the SRA lookup.
pheno_fname = prj_data_dir / 'metadata' / 'pheno.tsv'
metadata_df = pd.read_csv(prj_data_dir / 'metadata' / 'metadata.tsv', sep="\t")
metadata_df.head()

Unnamed: 0,sample,biosample,sra,geo,run_accession,sequencing_cat,treatment_cat,timepoint_cat,rep_cat
0,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep1",SAMN37652043,SRS19047042,GSM7816472,SRR26261550,RiboSeq,cyclo,0hpi,rep1
1,"SupT1 mock, RiboSeq, cyclo, 0hpi, rep2",SAMN37652042,SRS19047046,GSM7816473,SRR26261541,RiboSeq,cyclo,0hpi,rep2
2,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep1",SAMN37652041,SRS19047047,GSM7816474,SRR26261540,RiboSeq,cyclo,8hpi,rep1
3,"SupT1 infected, RiboSeq, cyclo, 8hpi, rep2",SAMN37652040,SRS19047048,GSM7816475,SRR26261539,RiboSeq,cyclo,8hpi,rep2
4,"SupT1 infected, RiboSeq, cyclo, 16hpi, rep1",SAMN37652039,SRS19048965,GSM7816476,SRR26261538,RiboSeq,cyclo,16hpi,rep1


In [5]:
# we have the following features: sequencing_cat	treatment_cat	timepoint_cat
# need to setup groups of samples to compare
# instead of hierarchies - we'll just slice the dataframe and give it names and store in dict
def _runs_matching(**criteria):
    """Return the run accessions of metadata_df rows whose columns equal all given values."""
    selected = pd.Series(True, index=metadata_df.index)
    for column, wanted in criteria.items():
        selected &= metadata_df[column] == wanted
    return metadata_df.loc[selected, "run_accession"].tolist()


def _timecourse(prefix="", **fixed):
    """Map '<prefix><hours>' to the runs at 0/8/16/24 hpi that also satisfy ``fixed``."""
    return {
        f"{prefix}{hours}": _runs_matching(timepoint_cat=f"{hours}hpi", **fixed)
        for hours in ("0", "8", "16", "24")
    }


_group_members = [
    {
        "RiboSeq": _runs_matching(sequencing_cat="RiboSeq"),
        "RNASeq": _runs_matching(sequencing_cat="RNASeq"),
    },
    {
        "cyclo": _runs_matching(treatment_cat="cyclo"),
        "harr": _runs_matching(treatment_cat="harr"),
    },
    _timecourse(),
    _timecourse("RNASeq_", sequencing_cat="RNASeq"),
    _timecourse("RiboSeq_", sequencing_cat="RiboSeq"),
    _timecourse("RiboSeq_cyclo_", sequencing_cat="RiboSeq", treatment_cat="cyclo"),
    _timecourse("RiboSeq_harr_", sequencing_cat="RiboSeq", treatment_cat="harr"),
]

# Each comparison is keyed by the tuple of its member names (in order) and maps
# every member name to its list of run accessions.
cmp_groups = {tuple(members): members for members in _group_members}

In [None]:
# Write one fasterq-dump command per run accession to a command list
# (nothing in this cell executes the commands).
cmd_lst_fname = prj_fastq_dir / 'cmd_lst.txt'
# The directory has to exist before the command list can be opened inside it,
# so it is created once here rather than once per run inside the loop.
prj_fastq_dir.mkdir(parents=True, exist_ok=True)
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["run_accession"].tolist():
        cmd_dump = f"fasterq-dump {run_id} --outdir {prj_fastq_dir} --split-3"
        outFP.write(f"{cmd_dump}\n")

In [7]:
# align samples with hisat
# One hisat2 command per run, reading <run>.fastq as unpaired input (-U) and
# writing <run>.sam; the commands are only written to the list file here.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.txt'
hisat_cmds = [
    f"hisat2 -p 25 --score-min L,0,-2 --mp 2,2 -x {hisat_idx_basename} -U {prj_fastq_dir}/{run_id}.fastq -S {prj_aln_dir}/{run_id}.sam"
    for run_id in metadata_df["run_accession"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{cmd}\n" for cmd in hisat_cmds)

In [12]:
# sort reads by position
# One "samtools sort && samtools index" command per run: <run>.sam -> <run>.sorted.bam (+ index).
cmd_lst_fname = prj_aln_dir / 'cmd_lst.sort.txt'
sort_cmds = [
    f"samtools sort -@ 25 -o {prj_aln_dir}/{run_id}.sorted.bam {prj_aln_dir}/{run_id}.sam && samtools index {prj_aln_dir}/{run_id}.sorted.bam"
    for run_id in metadata_df["run_accession"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{cmd}\n" for cmd in sort_cmds)

In [None]:
# extract viral reads
# One pipeline per run: select the alignments on hiv_accid from the sorted BAM,
# re-sort them into <run>.<hiv_accid>.sorted.bam and index the result.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.extract.txt'
extract_cmds = [
    f"samtools view -h {prj_aln_dir}/{run_id}.sorted.bam {hiv_accid} | samtools sort -o {prj_aln_dir}/{run_id}.{hiv_accid}.sorted.bam - && samtools index {prj_aln_dir}/{run_id}.{hiv_accid}.sorted.bam"
    for run_id in metadata_df["run_accession"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{cmd}\n" for cmd in extract_cmds)

In [16]:
# assemble with stringtie
# One stringtie command per run, guided by the reference GTF (-G), assembling
# the full sorted BAM into <run>.gtf in the assembly directory.
cmd_lst_fname = prj_assembly_dir / 'cmd_lst.txt'
stringtie_cmds = [
    f"stringtie -p 64 -G {reference_gtf_fname} -o {prj_assembly_dir}/{run_id}.gtf {prj_aln_dir}/{run_id}.sorted.bam"
    for run_id in metadata_df["run_accession"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{cmd}\n" for cmd in stringtie_cmds)

In [None]:
# prepare tiebrush, tiecov and gffcompare aggregations
# For every comparison group this writes:
#   * <name>.lst files (BAM lists in the alignment dir, GTF lists in the assembly dir)
#     for each experiment and for the group as a whole, and
#   * three command lists (tiebrush, tiecov, gffcompare) with one command per
#     experiment followed by one command for the whole group.
# Nothing is executed here; the order of the commands matters because the
# group-level commands consume the outputs of the experiment-level ones.
tb_cmd_lst_fname = prj_aln_dir / 'tiebrush_cmd_lst.txt'
tiecov_cmd_lst_fname = prj_aln_dir / 'tiecov_cmd_lst.txt'
gffcmp_cmd_lst_fname = prj_assembly_dir / 'gffcmp_cmd_lst.txt'


def _write_aggregation_cmds(name, bam_lst_fname, gtf_lst_fname, tb_fp, tiecov_fp, gffcmp_fp):
    """Append the tiebrush, tiecov and gffcompare commands for one experiment or group.

    ``name`` is the experiment/group name used as the stem of every output file;
    ``bam_lst_fname``/``gtf_lst_fname`` are the list files passed to tiebrush and
    gffcompare; the three ``*_fp`` arguments are the open command-list files.
    """
    merged_bam_fname = prj_aln_dir / f'{name}.{hiv_accid}.bam'
    cov_fname = prj_aln_dir / f'{name}.{hiv_accid}.coverage.bedgraph'
    junctions_fname = prj_aln_dir / f'{name}.{hiv_accid}.junctions.bed'
    tb_fp.write(f"tiebrush -o {merged_bam_fname} {bam_lst_fname}\n")
    tiecov_fp.write(f"tiecov -c {cov_fname} -j {junctions_fname} {merged_bam_fname}\n")
    # gffcompare treats -o as an output *prefix* and writes <prefix>.combined.gtf /
    # <prefix>.tracking, so the prefix is the bare name (same -p/-o convention as
    # the cell that runs gffcompare directly).
    gffcmp_fp.write(
        f"gffcompare -r {reference_gtf_fname} -p {name} -o {prj_assembly_dir}/{name} -i {gtf_lst_fname}\n"
    )


with open(tb_cmd_lst_fname, 'w') as tb_cmd_outFP, \
        open(gffcmp_cmd_lst_fname, 'w') as gffcmp_cmd_outFP, \
        open(tiecov_cmd_lst_fname, 'w') as tiecov_cmd_outFP:
    for grp_name, grp_data in cmp_groups.items():
        all_name = ".".join(grp_name)
        all_bam_lst_fname = prj_aln_dir / f'{all_name}.lst'
        all_gtf_lst_fname = prj_assembly_dir / f'{all_name}.lst'
        with open(all_bam_lst_fname, 'w') as all_outFP, open(all_gtf_lst_fname, 'w') as all_gtf_outFP:
            for exp_name, run_ids in grp_data.items():
                exp_bam_lst_fname = prj_aln_dir / f'{exp_name}.lst'
                exp_gtf_lst_fname = prj_assembly_dir / f'{exp_name}.lst'

                # The group-level lists reference the per-experiment aggregates:
                # the tiebrush BAM and the gffcompare <exp>.combined.gtf.
                exp_bam_fname = prj_aln_dir / f'{exp_name}.{hiv_accid}.bam'
                exp_combined_gtf_fname = prj_assembly_dir / f'{exp_name}.combined.gtf'
                all_outFP.write(f"{exp_bam_fname}\n")
                all_gtf_outFP.write(f"{exp_combined_gtf_fname}\n")

                # The experiment-level lists reference the per-run files.
                with open(exp_bam_lst_fname, 'w') as exp_outFP, open(exp_gtf_lst_fname, 'w') as exp_gtf_outFP:
                    for run_id in run_ids:
                        exp_outFP.write(f"{prj_aln_dir}/{run_id}.{hiv_accid}.sorted.bam\n")
                        exp_gtf_outFP.write(f"{prj_assembly_dir}/{run_id}.gtf\n")

                _write_aggregation_cmds(exp_name, exp_bam_lst_fname, exp_gtf_lst_fname,
                                        tb_cmd_outFP, tiecov_cmd_outFP, gffcmp_cmd_outFP)

        _write_aggregation_cmds(all_name, all_bam_lst_fname, all_gtf_lst_fname,
                                tb_cmd_outFP, tiecov_cmd_outFP, gffcmp_cmd_outFP)


In [None]:
# run gffcompare
def _gffcompare_and_subset(name, gtf_lst_fname):
    """Run gffcompare on the GTFs listed in ``gtf_lst_fname`` and keep only hiv_accid records.

    Outputs use ``name`` as prefix inside prj_assembly_dir; the viral subset is
    written to <name>.<hiv_accid>.gtf and <name>.<hiv_accid>.tracking.
    """
    definitions.run_gffcompare({"-r": reference_gtf_fname, "-p": f"{name}", "-o": f"{prj_assembly_dir}/{name}", "-i": gtf_lst_fname})

    combined_gtf_fname = prj_assembly_dir / f'{name}.combined.gtf'
    tracking_fname = prj_assembly_dir / f'{name}.tracking'

    # transcript ids whose column 0 equals hiv_accid (column 0 is presumably the seqid - confirm in definitions)
    attr_df = definitions.get_attribute(combined_gtf_fname, ["transcript_id"], [0])
    viral_tids = attr_df[attr_df[0] == hiv_accid]["transcript_id"].tolist()

    definitions.subset_gtf(combined_gtf_fname, prj_assembly_dir / f'{name}.{hiv_accid}.gtf', [], viral_tids)
    definitions.subset_tracking(tracking_fname, prj_assembly_dir / f'{name}.{hiv_accid}.tracking', viral_tids)


for grp_name, grp_data in cmp_groups.items():
    all_name = ".".join(grp_name)
    all_gtf_lst_fname = prj_assembly_dir / f'{all_name}.lst'
    with open(all_gtf_lst_fname, 'w') as all_gtf_outFP:
        for exp_name, run_ids in grp_data.items():
            # the group-level run consumes each experiment's combined GTF
            all_gtf_outFP.write(f"{prj_assembly_dir / (exp_name + '.combined.gtf')}\n")

            # the experiment-level run consumes the per-run stringtie GTFs
            exp_gtf_lst_fname = prj_assembly_dir / f'{exp_name}.lst'
            with open(exp_gtf_lst_fname, 'w') as exp_gtf_outFP:
                exp_gtf_outFP.writelines(f"{prj_assembly_dir}/{run_id}.gtf\n" for run_id in run_ids)

            _gffcompare_and_subset(exp_name, exp_gtf_lst_fname)

    # the group list is complete and closed only now, so the group-level run comes last
    _gffcompare_and_subset(all_name, all_gtf_lst_fname)

In [None]:
# run orfanage on novel transcripts
def _run_orfanage(name):
    """Run orfanage on <name>.<hiv_accid>.gtf, writing <name>.<hiv_accid>.orfanage.gtf.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled verbatim. A failing or missing
    executable is reported but does not stop the remaining runs.
    """
    query_gtf_fname = prj_assembly_dir / f'{name}.{hiv_accid}.gtf'
    orf_gtf_fname = prj_assembly_dir / f'{name}.{hiv_accid}.orfanage.gtf'
    cmd = [
        "orfanage",
        "--query", str(query_gtf_fname),
        "--reference", str(reference_fasta_fname),
        "--output", str(orf_gtf_fname),
        str(reference_gtf_fname),
    ]
    print(" ".join(cmd))
    try:
        completed = subprocess.run(cmd)
    except FileNotFoundError as err:
        print(f"orfanage could not be started for {name}: {err}")
        return
    if completed.returncode != 0:
        print(f"orfanage failed for {name} (exit code {completed.returncode})")


for grp_name, grp_data in cmp_groups.items():
    # every experiment of the group first, then the group-level GTF
    for exp_name in grp_data:
        _run_orfanage(exp_name)
    _run_orfanage(".".join(grp_name))

In [25]:
# for each experiment - propagate expressions via tracking into the gtf
for grp_name, grp_data in cmp_groups.items():
    all_name = ".".join(grp_name)

    # Two-level hierarchy keyed by (name, tracking file): the group at the root,
    # one (empty) child entry per experiment.
    root_key = (all_name, prj_assembly_dir / f'{all_name}.{hiv_accid}.tracking')
    hierarchy = {
        root_key: {
            (exp_name, prj_assembly_dir / f'{exp_name}.{hiv_accid}.tracking'): {}
            for exp_name in grp_data
        }
    }

    definitions.combine_tracking_gtf(
        prj_assembly_dir / f'{all_name}.{hiv_accid}.orfanage.gtf',
        hierarchy,
        prj_assembly_dir / f'{all_name}.{hiv_accid}.orfanage.tracking.gtf',
    )

In [None]:
# load and process stringtie results
strg_tdf = pd.DataFrame()


def _write_lines(fname, lines):
    """Write every item of ``lines`` to ``fname``, one per line."""
    with open(fname, 'w') as outFP:
        outFP.writelines(f"{line}\n" for line in lines)


def _run_sashimi(title, gtf_fname, svg_fname, tn_lst_fname, cov_lst_fname, sj_lst_fname, normalize):
    """Invoke sashimi_bin once and return its exit code (a non-zero code is reported)."""
    sashimi_cmd = [sashimi_bin,
                   "--title", str(title),
                   "--gtf", str(gtf_fname),
                   "-o", str(svg_fname)]
    if normalize:
        sashimi_cmd.append("--normalize")
    sashimi_cmd += ["--subtract", "0",
                    "--intron_scale", "1",
                    "--exon_scale", "1",
                    "--tn", str(tn_lst_fname),
                    "--cov", str(cov_lst_fname),
                    "--sj", str(sj_lst_fname)]
    returncode = subprocess.call(sashimi_cmd)
    if returncode != 0:
        print(f"{sashimi_bin} exited with code {returncode} while writing {svg_fname}")
    return returncode


def _anova_across_experiments(pivot_df, experiments):
    """One-way ANOVA of TPM across experiments for every row of ``pivot_df``.

    ``pivot_df`` has (sample, run_accession) columns. Values are assigned to an
    experiment by exact match on the ``sample`` level; substring matching would
    mix groups such as "0"/"8" with digits occurring in run accessions or with
    longer names. Rows where any experiment has fewer than two values are skipped.
    Returns a frame with columns ``reference_id`` and ``pval``.
    """
    anova_results = []
    if not pivot_df.empty:
        sample_level = pivot_df.columns.get_level_values("sample")
        for tid, row in pivot_df.iterrows():
            groups = [row[sample_level == exp_name].dropna() for exp_name in experiments]
            # Ensure each group has more than one value (replicates)
            if all(len(group) > 1 for group in groups):
                stat, pval = f_oneway(*groups)
                anova_results.append((tid, pval))
    return pd.DataFrame(anova_results, columns=['reference_id', 'pval'])


def load_and_process_stringtie(gtf_fname, all_name, experiments, num_significant_transcripts, out_basename):
    """Summarise one comparison group and write tables and plots under ``out_basename``.

    Parameters
    ----------
    gtf_fname : path of the group's tracking-augmented GTF.
    all_name : name of the whole group (prefix of its ``*_tpm_mean`` / ``*_num_samples`` attributes).
    experiments : dict mapping experiment name -> list of run accessions.
    num_significant_transcripts : maximum number of significant transcripts shown as violin plots.
    out_basename : string prefix for every output file.

    Outputs (all prefixed with ``out_basename``): ``.top_novel_num_samples.gtf``,
    ``.exp_df.tsv``, ``.transcript_tpm.tsv``, ``.anova.tsv``, ``.violin_plot.png``
    (only when at least one transcript is significant), the sashimi list files
    ``.cov_lst.txt`` / ``.sj_lst.txt`` / ``.tn_lst.txt``, ``.viral.gtf`` and the
    sashimi SVGs. The per-run TPM table is also stored in the global ``strg_tdf``.
    """
    global strg_tdf
    # load a table for each assembled transcript with the data
    columns = ["class_code", f"{all_name}_tpm_mean", f"{all_name}_num_samples"] + \
              [f"{exp_name}_tpm_mean" for exp_name in experiments] + \
              [f"{exp_name}_num_samples" for exp_name in experiments]
    exp_df = definitions.get_attribute(gtf_fname, columns)

    # Replace "-" with 0, then convert everything but the id/class columns to numeric
    exp_df = exp_df.replace("-", 0)
    num_cols = exp_df.columns.difference(['tid', 'class_code'])
    exp_df[num_cols] = exp_df[num_cols].astype(float)
    exp_df = exp_df.sort_values(by=f"{all_name}_num_samples", ascending=False)

    # Compute percent samples
    # NOTE(review): the group-level denominator is every row of metadata_df, not only
    # the runs belonging to this group - confirm that is intended.
    exp_df[f"{all_name}_percent_samples"] = exp_df[f"{all_name}_num_samples"] / metadata_df.shape[0]
    for exp_name, run_ids in experiments.items():
        exp_df[f"{exp_name}_percent_samples"] = exp_df[f"{exp_name}_num_samples"] / len(run_ids)

    # Compute percentage sample difference for each experiment
    for exp_name in experiments.keys():
        # Sum percent_samples of all other experiments
        other_exps_percent_samples = sum(
            exp_df[f"{other_exp}_percent_samples"]
            for other_exp in experiments.keys() if other_exp != exp_name
        )

        # Absolute difference, then binned into 10 equal-width bins
        exp_df[f"{exp_name}_perc_samples_diff"] = abs(
            exp_df[f"{exp_name}_percent_samples"] - other_exps_percent_samples
        )
        exp_df[f"{exp_name}_perc_samples_diff_bin"] = pd.cut(
            exp_df[f"{exp_name}_perc_samples_diff"], bins=10, labels=False
        )

    # isolate novel (anything whose class code is not "=")
    novel_exp_df = exp_df[~(exp_df["class_code"]=="=")].reset_index(drop=True)
    print(f"{novel_exp_df.shape[0]} novel transcripts detected in the dataset: {all_name}")

    # create subsets of most promising novel transcripts
    tids = novel_exp_df[novel_exp_df[f"{all_name}_percent_samples"]>0.1]["tid"].tolist()
    print(f"number of novel transcripts with >10% of samples in {all_name}: {len(tids)}")
    novel_gtf_fname = out_basename+".top_novel_num_samples.gtf"
    definitions.subset_gtf(gtf_fname,novel_gtf_fname,[],tids)

    # save exp_df
    exp_df.to_csv(out_basename+".exp_df.tsv",sep="\t",index=False)

    # use stringtie results to load up transcript quantifications
    # (frames are collected and concatenated once instead of growing a frame per run)
    per_run_frames = []
    for exp_name, run_ids in experiments.items():
        for run_id in run_ids:
            run_gtf_fname = prj_assembly_dir / f"{run_id}.gtf"
            if not run_gtf_fname.exists():
                continue

            tdf = definitions.get_chains(run_gtf_fname,"exon",True)
            tmp_df = definitions.get_attribute(run_gtf_fname,["TPM","reference_id"])
            tdf = tdf.merge(tmp_df,on="tid")
            tdf["run_accession"] = run_id
            tdf["sample"] = exp_name
            per_run_frames.append(tdf)

    if not per_run_frames:
        # Without any per-run assembly there is nothing to quantify or plot.
        strg_tdf = pd.DataFrame()
        print(f"no stringtie GTFs found in {prj_assembly_dir} for {all_name}; skipping quantification and plots")
        return

    strg_tdf = pd.concat(per_run_frames)
    strg_tdf = strg_tdf[(strg_tdf["seqid"]==hiv_accid)].reset_index(drop=True)
    strg_tdf["TPM"] = pd.to_numeric(strg_tdf["TPM"], errors='coerce')
    strg_tdf.to_csv(out_basename + ".transcript_tpm.tsv",sep="\t",index=False)

    # Pivot table for statistical analysis, then ANOVA to identify significant transcripts
    # NOTE(review): rows with reference_id "-" (skipped by the sashimi loops below) are
    # still part of the pivot and therefore of the ANOVA - confirm that is intended.
    pivot_df = strg_tdf.pivot_table(index='reference_id', columns=['sample',"run_accession"], values='TPM')
    anova_df = _anova_across_experiments(pivot_df, experiments)
    anova_df['significant'] = anova_df['pval'] < 0.05

    print(f"Number of significant transcripts: {anova_df['significant'].sum()}")
    anova_df.to_csv(out_basename + ".anova.tsv",sep="\t",index=False)

    significant_tids = anova_df[anova_df['significant']]['reference_id'].tolist()
    top_transcripts = significant_tids[:num_significant_transcripts]

    # split violin plots, one panel per significant transcript actually available
    if top_transcripts:
        num_panels = len(top_transcripts)
        fig, axes = plt.subplots(1, num_panels, figsize=(4*num_panels, 6), sharey=False, squeeze=False)
        for i, (tid, ax) in enumerate(zip(top_transcripts, axes[0])):
            transcript_data = strg_tdf[strg_tdf['reference_id'] == tid]
            sns.violinplot(
                data=transcript_data,
                x='reference_id',
                y='TPM',
                hue='sample',
                split=True,
                inner='quart',
                fill=False,
                ax=ax
            )

            ax.set_title(f"{tid}")
            ax.set_xlabel("")
            if i > 0:
                ax.set_ylabel("")
            # keep the legend on the last panel only; a panel may have none at all
            legend = ax.get_legend()
            if i < num_panels - 1 and legend is not None:
                legend.remove()
        fig.tight_layout()
        fig.savefig(out_basename + ".violin_plot.png")
        plt.close(fig)
    else:
        print(f"no significant transcripts to plot for {all_name}")

    # write the lists of tiecov coverage files, junction files and track names
    # (one entry per experiment) for the cov/sj/tn params of sashimi
    cov_lst_fname = out_basename + '.cov_lst.txt'
    _write_lines(cov_lst_fname, (f"{prj_aln_dir}/{exp_name}.{hiv_accid}.coverage.bedgraph" for exp_name in experiments))
    sj_lst_fname = out_basename + '.sj_lst.txt'
    _write_lines(sj_lst_fname, (f"{prj_aln_dir}/{exp_name}.{hiv_accid}.junctions.bed" for exp_name in experiments))
    tn_lst_fname = out_basename + '.tn_lst.txt'
    _write_lines(tn_lst_fname, experiments)

    # sashimi plots comparing the experiments of this group over all reference
    # transcripts that were observed in the per-run assemblies
    viral_tids = [x for x in strg_tdf['reference_id'].tolist() if x != "-"]
    viral_gtf_fname = out_basename + '.viral.gtf'
    definitions.subset_gtf(reference_gtf_fname,viral_gtf_fname,[],viral_tids)
    # The group name is the title here: no single transcript id applies to this plot.
    _run_sashimi(all_name, viral_gtf_fname, out_basename + '.svg',
                 tn_lst_fname, cov_lst_fname, sj_lst_fname, normalize=True)
    # SASHIMI WITHOUT NORMALIZATION
    _run_sashimi(all_name, viral_gtf_fname, out_basename + '.non_normalized.svg',
                 tn_lst_fname, cov_lst_fname, sj_lst_fname, normalize=False)

    # now build transcript-specific plots
    for tid in set(strg_tdf['reference_id'].tolist()):
        if tid == "-": # skip non-reference transcripts
            continue
        try:
            tx_gtf_fname = out_basename + f'.{tid}.gtf'

            # extract the transcript from the gtf
            # NOTE(review): every other subset_gtf call passes [] as third argument;
            # False is kept here as-is - confirm against definitions.subset_gtf.
            definitions.subset_gtf(reference_gtf_fname,tx_gtf_fname,False,[tid])

            _run_sashimi(tid, tx_gtf_fname, out_basename + f'.{tid}.svg',
                         tn_lst_fname, cov_lst_fname, sj_lst_fname, normalize=True)
            # SASHIMI WITHOUT NORMALIZATION ("." separator matches the normalized file name)
            _run_sashimi(tid, tx_gtf_fname, out_basename + f'.{tid}.non_normalized.svg',
                         tn_lst_fname, cov_lst_fname, sj_lst_fname, normalize=False)
        except Exception as e:
            print(f"Error processing {tid}: {e}")
            continue

    
# Run the per-group analysis; each group writes into its own sub-directory of prj_quant_dir.
for grp_name, grp_data in cmp_groups.items():
    all_name = ".".join(grp_name)
    print(f"Processing group: {all_name}")

    # shared stem of the input GTF and of every output file of this group
    tracking_stem = f'{all_name}.{hiv_accid}.orfanage.tracking'

    out_all_name_dir = prj_quant_dir / all_name
    out_all_name_dir.mkdir(parents=True, exist_ok=True)

    load_and_process_stringtie(
        prj_assembly_dir / f'{tracking_stem}.gtf',
        all_name,
        dict(grp_data),  # fresh experiment-name -> run-accession mapping
        5,
        str(out_all_name_dir / tracking_stem),
    )

Processing group: RiboSeq.RNASeq
68 novel transcripts detected in the dataset: RiboSeq.RNASeq
number of novel transcripts with >10% of samples in RiboSeq.RNASeq: 5
