In [None]:
import os
import re
import sys
import csv
import time
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

from pysradb import SRAweb

from Bio import SeqIO

from IPython.display import SVG
from IPython.display import display

In [None]:
# --- locations resolved relative to the current working directory ---
# base_dir is two directory levels above the working directory; the atlas data
# lives in a sibling checkout one level above base_dir.
base_dir = Path.cwd().parent.parent
data_dir = base_dir.parent / 'HIV_Atlas_Creation' / 'data'
soft_dir = base_dir / 'soft'

prj_id = 'BoltonSIV'

# --- per-project output tree ---
outdir = base_dir / 'results' / prj_id
prj_aln_dir = outdir / 'alignment'
prj_assembly_dir = outdir / 'assembly'
prj_quant_dir = outdir / 'quantification'
prj_aux_dir = outdir / 'auxiliary'

# create every output directory up front (no-op when it already exists)
for _prj_dir in (outdir, prj_aln_dir, prj_assembly_dir, prj_quant_dir, prj_aux_dir):
    _prj_dir.mkdir(parents=True, exist_ok=True)

# --- SIV239 reference: hisat2 index basename, genome sequence and annotation ---
hisat_idx_basename = base_dir / 'data' / 'SIV239' / 'reference'
reference_fasta_fname = base_dir / 'data' / 'SIV239' / 'reference.fasta'
reference_gtf_fname = base_dir / 'data' / 'SIV239' / 'reference.gtf'

# per-run metadata table for the SIV-positive samples
sivPlus_info_fname = data_dir / "info_SIVplus.tsv"

# sashimi plotting executable (resolved through PATH when invoked)
sashimi_bin = "sashimi.py"

In [3]:
# enable IPython's autoreload extension; mode 1 reloads only the modules
# registered with %aimport before each cell is executed
%load_ext autoreload
%autoreload 1

# make the helper scripts under soft_dir/genomic_scripts importable, then import
# `definitions` and register it for automatic reloading
sys.path.insert(0, str(soft_dir / "genomic_scripts"))
%aimport definitions

In [4]:
# load the per-run metadata table (first column is read as the index, then restored as a column)
metadata_df = pd.read_csv(sivPlus_info_fname, sep='\t', index_col=0).reset_index()

# list the distinct experiment descriptions: take the second ':'-separated field of the
# title, then cut it at the first ';' and at the first ','
title_field = metadata_df["Experiment Title"].str.split(":", expand=True)[1]
title_field = title_field.str.split(";", expand=True)[0]
title_field = title_field.str.split(",", expand=True)[0]
print(set(title_field.unique()))
metadata_df.head()

{' Memory CD4 T cell gene expression from macaque AY69 lymph node', ' Memory CD4 T cell gene expression from macaque T034 PBMC'}


Unnamed: 0,animal,sample,Cell number,SIV reads,SIV prop,SIV prop >0.01,in final filtered seurat,D1-A5 splice,tatrev splice,SIV pos,number of splice varieties seen,passed initial sc QC,Experiment Accession,Experiment Title,Sample Accession,Total Spots,Total Bases,Run
0,AY69,A1p1,1,42031,0.020604,1,1,127,0,1,1,1,SRX20459058,GSM7399872: Memory CD4 T cell gene expression ...,SRS17766270,2585301,661808315,SRR24679672
1,AY69,A2p1,1,43091,0.022958,1,1,264,841,1,2,1,SRX20458082,GSM7399879: Memory CD4 T cell gene expression ...,SRS17765295,2683173,660469952,SRR24678661
2,AY69,A6p1,1,157834,0.072799,1,1,2250,3347,1,2,1,SRX20458792,GSM7399907: Memory CD4 T cell gene expression ...,SRS17766006,2702252,664149793,SRR24679087
3,AY69,A8p1,1,123108,0.104491,1,1,1159,1089,1,2,1,SRX20458806,GSM7399921: Memory CD4 T cell gene expression ...,SRS17766018,1741331,447735007,SRR24679073
4,AY69,B12p1,1,111957,0.073472,1,1,961,1613,1,2,1,SRX20458844,GSM7399949: Memory CD4 T cell gene expression ...,SRS17766056,2032111,518745652,SRR24679461


In [None]:
# align samples with hisat
# Writes one hisat2 command per run to a command-list file; the commands are not
# executed by this cell.
cmd_lst_fname = prj_aln_dir / 'cmd_lst.txt'
# NOTE(review): data_dir already ends in 'data', so this resolves to .../data/data/fastq — confirm
reads_dir = data_dir / "data/fastq/"
with open(cmd_lst_fname, 'w') as outFP:
    for run_id in metadata_df["Run"].tolist():
        # reads_dir is a pathlib.Path: join with "/" (Path + str raises TypeError)
        r1_fname = reads_dir / f"{run_id}_1.fastq"
        r2_fname = reads_dir / f"{run_id}_2.fastq"
        cmd = f"hisat2 -p 64 --score-min L,0,-2 --mp 2,2 -x {hisat_idx_basename} -1 {r1_fname} -2 {r2_fname} -S {prj_aln_dir}/{run_id}.sam"
        outFP.write(f"{cmd}\n")

In [6]:
# sort reads by position
# one `samtools sort` command per run: <run>.sam -> <run>.sorted.bam
cmd_lst_fname = prj_aln_dir / 'cmd_lst.sort.txt'
sort_cmds = [
    f"samtools sort -@ 64 -o {prj_aln_dir}/{run_id}.sorted.bam {prj_aln_dir}/{run_id}.sam"
    for run_id in metadata_df["Run"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{sort_cmd}\n" for sort_cmd in sort_cmds)

In [5]:
# run tiebrush to generate joint representations of each monkey
animals = metadata_df["animal"].unique()

# step 1: for each monkey, list its sorted alignments in <monkey>.lst
for monkey in animals:
    monkey_runs = metadata_df.loc[metadata_df["animal"] == monkey, "Run"].tolist()
    lst_fname = prj_aln_dir / f'{monkey}.lst'
    with open(lst_fname, 'w') as outFP:
        outFP.writelines(f"{prj_aln_dir}/{run_id}.sorted.bam\n" for run_id in monkey_runs)

# step 2: one tiebrush command per monkey, consuming the .lst written above
cmd_lst_fname = prj_aln_dir / 'cmd_lst.tiebrush.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for monkey in animals:
        lst_fname = prj_aln_dir / f'{monkey}.lst'
        outFP.write(f"tiebrush -o {prj_aln_dir}/{monkey}.tb.bam {lst_fname}\n")

In [6]:
# extract SIV reads only (M33262.1)
def _siv_extract_cmd(in_bam, out_bam):
    """Build the index -> region view -> sort pipeline that keeps M33262.1 reads only."""
    return f"samtools index {in_bam} && samtools view -h {in_bam} M33262.1 | samtools sort -o {out_bam} -"

cmd_lst_fname = prj_aln_dir / 'cmd_lst.extract.txt'
with open(cmd_lst_fname, 'w') as outFP:
    for monkey in metadata_df["animal"].unique():
        # merged (tiebrush) alignment of the monkey first ...
        tb_cmd = _siv_extract_cmd(f"{prj_aln_dir}/{monkey}.tb.bam",
                                  f"{prj_aln_dir}/{monkey}.tb.M33262.1.sorted.bam")
        outFP.write(f"{tb_cmd}\n")
        # ... followed by each individual run of that monkey
        monkey_runs = metadata_df.loc[metadata_df["animal"] == monkey, "Run"].tolist()
        for run_id in monkey_runs:
            run_cmd = _siv_extract_cmd(f"{prj_aln_dir}/{run_id}.sorted.bam",
                                       f"{prj_aln_dir}/{run_id}.M33262.1.sorted.bam")
            outFP.write(f"{run_cmd}\n")

In [7]:
# run tiecov
# one command per monkey: coverage bedgraph + junction bed from the SIV-only tiebrush bam
cmd_lst_fname = prj_aln_dir / 'cmd_lst.tiecov.txt'
tiecov_cmds = []
for monkey in metadata_df["animal"].unique():
    siv_prefix = f"{prj_aln_dir}/{monkey}.tb.M33262.1"
    tiecov_cmds.append(
        f"tiecov -c {siv_prefix}.coverage.bedgraph -j {siv_prefix}.junctions.bed {siv_prefix}.sorted.bam"
    )
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{tiecov_cmd}\n" for tiecov_cmd in tiecov_cmds)

In [8]:
# assemble with stringtie
# one reference-guided assembly command per run, reading the position-sorted bam
cmd_lst_fname = prj_assembly_dir / 'cmd_lst.txt'
stringtie_cmds = [
    f"stringtie -p 64 -G {reference_gtf_fname} -o {prj_assembly_dir}/{run_id}.gtf {prj_aln_dir}/{run_id}.sorted.bam"
    for run_id in metadata_df["Run"].tolist()
]
with open(cmd_lst_fname, 'w') as outFP:
    outFP.writelines(f"{stringtie_cmd}\n" for stringtie_cmd in stringtie_cmds)

In [None]:
# merge stringtie results to identify and quantify novel transcripts
# Per monkey: list that monkey's stringtie assemblies in <monkey>.lst and run gffcompare
# on them; the per-monkey combined gtfs are then merged once more into the "ALL" set.
monkey_cmp_lst = [] # results of gffcompare for each monkey to aggregate into a single gtf for downstream analysis as well
for monkey in metadata_df["animal"].unique():
    lst_fname = prj_assembly_dir / f'{monkey}.lst'
    with open(lst_fname, 'w') as out_lstFP:
        for run_id in metadata_df[metadata_df["animal"]==monkey]["Run"].tolist():
            out_lstFP.write(f"{prj_assembly_dir}/{run_id}.gtf\n")
    definitions.run_gffcompare({"-r": reference_gtf_fname, "-p": f"{monkey}", "-o": f"{prj_assembly_dir}/{monkey}", "-i": lst_fname})
    monkey_cmp_lst.append(f"{prj_assembly_dir}/{monkey}.combined.gtf")

# merge all gtf files
# Pass an argument list (no shell) so paths are never re-parsed by a shell, and use
# check=True so a failing gffcompare raises here instead of surfacing later as a
# missing ALL.combined.gtf.
cmd = ["gffcompare",
       "-r", str(reference_gtf_fname),
       "-p", "ALL",
       "-o", f"{prj_assembly_dir}/ALL",
       *monkey_cmp_lst]
print(" ".join(cmd))
subprocess.run(cmd, check=True)

In [10]:
# create subsets of the gffcompare output for the M33262.1 genome only
# Every monkey is processed first, then the merged "ALL" set, with identical steps.
for cmp_prefix in [*metadata_df["animal"].unique(), "ALL"]:
    gtf_fname = prj_assembly_dir / f'{cmp_prefix}.combined.gtf'
    tracking_fname = prj_assembly_dir / f'{cmp_prefix}.tracking'

    # load the transcript ids together with the seqid (column 0) and keep the M33262.1 ones
    gtf_df = definitions.get_attribute(gtf_fname, ["transcript_id"],[0])
    on_siv = gtf_df[0]=="M33262.1"
    tids = gtf_df.loc[on_siv, "transcript_id"].tolist()

    # write the M33262.1-only gtf and tracking files
    sub_gtf_fname = prj_assembly_dir / f'{cmp_prefix}.M33262.1.gtf'
    sub_tracking_fname = prj_assembly_dir / f'{cmp_prefix}.M33262.1.tracking'
    definitions.subset_gtf(gtf_fname,sub_gtf_fname,[],tids)
    definitions.subset_tracking(tracking_fname,sub_tracking_fname,tids)

In [None]:
# run orfanage on novel transcripts
# Argument list + check=True: no shell re-parsing of the paths, and a failing orfanage
# run raises immediately instead of leaving a missing/partial output gtf for later cells.
cmd = ["orfanage",
       "--query", f"{prj_assembly_dir}/ALL.M33262.1.gtf",
       "--reference", str(reference_fasta_fname),
       "--output", f"{prj_assembly_dir}/ALL.M33262.1.orfanage.gtf",
       str(reference_gtf_fname)]
print(" ".join(cmd))
subprocess.run(cmd, check=True)

In [12]:
# pull tracking data into the gtf
# The hierarchy is a nested dict keyed by (name, tracking file): the merged "ALL" level
# at the root, with one (initially empty) child entry per monkey.
all_name = "ALL"
all_tracking_fname = prj_assembly_dir / 'ALL.M33262.1.tracking'
hierarchy = {
    (all_name, all_tracking_fname): {
        (monkey, prj_assembly_dir / f'{monkey}.M33262.1.tracking'): {}
        for monkey in metadata_df["animal"].unique()
    }
}

definitions.combine_tracking_gtf(
    prj_assembly_dir / 'ALL.M33262.1.orfanage.gtf',
    hierarchy,
    prj_assembly_dir / 'ALL.M33262.1.orfanage.tracking.gtf',
)

In [13]:
# load a table for each assembled transcript with the data
tracking_attrs = [
    "class_code",
    "ALL_tpm_mean", "ALL_num_samples",
    "AY69_tpm_mean", "AY69_num_samples",
    "T034_tpm_mean", "T034_num_samples",
]
exp_df = definitions.get_attribute(prj_assembly_dir / 'ALL.M33262.1.orfanage.tracking.gtf', tracking_attrs)

# "-" entries become 0, then every column except the id / class code is cast to float
exp_df = exp_df.replace("-", 0)
num_cols = exp_df.columns.difference(['tid','class_code'])
exp_df[num_cols] = exp_df[num_cols].astype(float)
exp_df = exp_df.sort_values(by="ALL_num_samples", ascending=False)

# fraction of runs in which each transcript was seen: overall, then per monkey
exp_df["ALL_percent_samples"] = exp_df["ALL_num_samples"] / len(metadata_df)
for monkey in metadata_df["animal"].unique():
    n_monkey_runs = (metadata_df["animal"] == monkey).sum()
    exp_df[f"{monkey}_percent_samples"] = exp_df[f"{monkey}_num_samples"] / n_monkey_runs

exp_df.head()

Unnamed: 0,tid,class_code,ALL_tpm_mean,ALL_num_samples,AY69_tpm_mean,AY69_num_samples,T034_tpm_mean,T034_num_samples,ALL_percent_samples,AY69_percent_samples,T034_percent_samples
69,ALL_00411239,=,982.157097,149.0,1183.173731,89.0,683.982423,60.0,1.0,1.0,1.0
72,ALL_00411254,=,371.188454,127.0,365.079713,85.0,383.551384,42.0,0.852349,0.955056,0.7
38,ALL_00411255,=,239.708965,126.0,262.067336,85.0,193.356246,41.0,0.845638,0.955056,0.683333
44,ALL_00411242,=,289.746285,125.0,214.829095,86.0,454.948295,39.0,0.838926,0.966292,0.65
53,ALL_00411243,=,138.904371,125.0,125.493695,86.0,168.47663,39.0,0.838926,0.966292,0.65


In [14]:
# summary statistics for the novel transcripts
tracking_gtf_fname = prj_assembly_dir / 'ALL.M33262.1.orfanage.tracking.gtf'

# absolute difference in the fraction of samples between the two monkeys,
# also discretised into 10 equal-width bins (integer bin index)
exp_df["perc_samples_diff"] = (exp_df["AY69_percent_samples"] - exp_df["T034_percent_samples"]).abs()
exp_df["perc_samples_diff_bin"] = pd.cut(exp_df["perc_samples_diff"], bins=10, labels=False)

# novel = every class code other than "="
novel_exp_df = exp_df[exp_df["class_code"] != "="].reset_index(drop=True)
print(f"{novel_exp_df.shape[0]} novel transcripts detected in the dataset")

# subset 1: novel transcripts observed in more than 10% of all samples
in_many_samples = novel_exp_df["ALL_percent_samples"] > 0.1
tids = novel_exp_df.loc[in_many_samples, "tid"].tolist()
print(f"number of novel transcripts with >10% of samples: {len(tids)}")
definitions.subset_gtf(tracking_gtf_fname, prj_assembly_dir / 'ALL.M33262.1.orfanage.tracking.top_novel_num_samples.gtf', [], tids)

# subset 2: top 10 novel transcripts by between-monkey difference bin, ties broken by mean TPM
ranked_novel_df = novel_exp_df.sort_values(by=["perc_samples_diff_bin","ALL_tpm_mean"], ascending=False)
tids = ranked_novel_df.head(10)["tid"].tolist()
definitions.subset_gtf(tracking_gtf_fname, prj_assembly_dir / 'ALL.M33262.1.orfanage.tracking.top_novel_perc_samples_diff.gtf', [], tids)

149 novel transcripts detected in the dataset
number of novel transcripts with >10% of samples: 4


In [15]:
# rank transcripts by exclusivity to either AY69 or T034 (binned difference in the
# fraction of samples), breaking ties by the overall mean TPM

sample_frac_gap = exp_df["AY69_percent_samples"] - exp_df["T034_percent_samples"]
exp_df["perc_samples_diff"] = sample_frac_gap.abs()
exp_df["perc_samples_diff_bin"] = pd.cut(exp_df["perc_samples_diff"], bins=10, labels=False)
exp_df = exp_df.sort_values(by=["perc_samples_diff_bin","ALL_tpm_mean"], ascending=[False,False])

# show the top-ranked novel transcripts (class code other than "=")
exp_df[exp_df["class_code"] != "="].head()

Unnamed: 0,tid,class_code,ALL_tpm_mean,ALL_num_samples,AY69_tpm_mean,AY69_num_samples,T034_tpm_mean,T034_num_samples,ALL_percent_samples,AY69_percent_samples,T034_percent_samples,perc_samples_diff,perc_samples_diff_bin
81,ALL_00411274,o,2310.654641,29.0,2310.654641,29.0,0.0,0.0,0.194631,0.325843,0.0,0.325843,7
51,ALL_00411275,j,507.985809,20.0,426.078315,18.0,1245.153259,2.0,0.134228,0.202247,0.033333,0.168914,3
144,ALL_00411308,j,2555.871491,11.0,2555.871491,11.0,0.0,0.0,0.073826,0.123596,0.0,0.123596,2
141,ALL_00411312,j,2283.966138,5.0,2283.966138,5.0,0.0,0.0,0.033557,0.05618,0.0,0.05618,1
85,ALL_00411289,m,1568.986935,12.0,2385.710388,4.0,1160.625208,8.0,0.080537,0.044944,0.133333,0.08839,1


In [None]:
# use stringtie results to load up transcript quantifications
# For every run with an assembly on disk: exon chains merged with the TPM / reference_id
# attributes on the transcript id, tagged with the run accession and the animal.
# Frames are collected in a list and concatenated once (growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration).
run_frames = []
missing_runs = []
for run_id, animal in zip(metadata_df['Run'], metadata_df['animal']):
    run_gtf_fname = prj_assembly_dir / f"{run_id}.gtf"
    if not run_gtf_fname.exists():
        missing_runs.append(run_id)
        continue

    tdf = definitions.get_chains(run_gtf_fname,"exon",True)
    tmp_df = definitions.get_attribute(run_gtf_fname,["TPM","reference_id"])
    tdf = tdf.merge(tmp_df,on="tid")
    tdf["run_accession"] = run_id
    tdf["sample"] = animal
    run_frames.append(tdf)

if missing_runs:
    print(f"skipped {len(missing_runs)} run(s) without a stringtie assembly")

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was loaded
strg_tdf = pd.concat(run_frames) if run_frames else pd.DataFrame()

strg_tdf.to_csv(prj_quant_dir / "transcript_tpm.tsv",sep="\t",index=False)
strg_tdf.head()

In [11]:
# reload the saved quantifications and keep only transcripts on the SIV genome
strg_tdf = pd.read_csv(prj_quant_dir / "transcript_tpm.tsv", sep='\t')
on_siv = strg_tdf["seqid"] == "M33262.1"
strg_tdf = strg_tdf[on_siv].reset_index(drop=True)

# Pivot table for statistical analysis: one row per reference transcript,
# one column per (animal, run); cells hold the TPM (pivot_table's default mean aggregation)
pivot_df = strg_tdf.pivot_table(
    values='TPM',
    index='reference_id',
    columns=['sample', "run_accession"],
)
pivot_df

sample,AY69,AY69,AY69,AY69,AY69,AY69,AY69,AY69,AY69,AY69,...,T034,T034,T034,T034,T034,T034,T034,T034,T034,T034
run_accession,SRR24678268,SRR24678497,SRR24678507,SRR24678531,SRR24678542,SRR24678543,SRR24678550,SRR24678555,SRR24678556,SRR24678567,...,SRR24679215,SRR24679230,SRR24679238,SRR24679260,SRR24679302,SRR24679320,SRR24679338,SRR24679476,SRR24679489,SRR24679514
reference_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
-,907.635468,194.610512,1182.871826,,1165.915585,2540.382538,926.502271,540.407471,1496.870127,2116.284363,...,1568.548767,1251.324797,2641.727783,1048.969259,2464.014111,1231.950305,276.303986,1112.275513,803.589813,1107.056024
env.1,120.444344,31.467482,,109.548798,136.184998,315.649078,,2.072433,1129.741821,,...,,,,,891.121277,,,,,108.138756
env.2,199.821701,146.941498,46.394848,266.173218,484.943115,512.683228,582.734497,2.795034,1003.000977,,...,278.631531,,,391.050659,1132.005493,597.666931,,,223.062332,
env.3,205.211685,216.750641,47.990814,737.295959,127.69516,426.030273,1469.890991,,1167.700195,508.186615,...,266.988007,,,745.718323,1121.547607,607.274109,,,163.125336,120.181099
env.5,,,49.857624,101.553154,122.229553,,,,347.133789,347.232422,...,266.620636,260.457031,,526.630432,1255.49939,,,,176.776688,
env.9,461.366577,188.791214,47.122849,937.056335,74.217941,68.76638,267.621185,,1128.697388,47.152004,...,160.718948,,,611.398682,900.808533,416.817322,,,1422.511353,94.019981
env/vpu.1,,,,47.29879,,,,,184.234711,,...,,,,579.184021,,,,,363.440216,
env/vpu.2,155.793671,122.814964,,86.068901,235.735687,606.179871,186.301804,,1043.277344,,...,,,,182.682007,,,,,187.249146,
gag.1,92.798599,2744.722168,852.241394,5777.602539,604.631165,397.429993,728.120117,89.112259,2193.041016,225.010208,...,1488.908569,264.517334,167.222153,2949.10791,2268.022949,3706.580322,68.628975,603.426331,279.319244,457.969025
nef.2,176.461441,77.102295,64.800613,495.615173,540.839294,66.814728,404.959351,,1067.094727,91.283966,...,376.320557,,,285.560883,1372.876709,2625.879883,,,392.864044,488.903625


In [12]:
# Perform ANOVA to identify significant transcripts
# For each reference transcript, compare the TPM values of the AY69 runs against the
# T034 runs (missing values dropped) with a one-way ANOVA.
anova_results = []
for tid, row in pivot_df.iterrows():
    ay69_tpm = row.filter(like="AY69").dropna()
    t034_tpm = row.filter(like="T034").dropna()
    # need at least two observations in each animal
    if len(ay69_tpm) <= 1 or len(t034_tpm) <= 1:
        continue
    stat, pval = f_oneway(ay69_tpm, t034_tpm)
    anova_results.append((tid, pval))

anova_df = pd.DataFrame(anova_results, columns=['reference_id', 'pval'])
anova_df['significant'] = anova_df['pval'].lt(0.05)
anova_df

Unnamed: 0,reference_id,pval,significant
0,-,0.744492,False
1,env.1,0.477565,False
2,env.2,0.518538,False
3,env.3,0.677672,False
4,env.5,0.062076,False
5,env.9,0.181223,False
6,env/vpu.1,0.01715,True
7,env/vpu.2,0.913453,False
8,gag.1,0.03711,True
9,nef.2,0.13762,False


In [None]:
# Violin plots of TPM per animal for the most significant transcripts.

# `significant_tids` was not defined anywhere in the notebook; derive it from the ANOVA
# table: transcripts flagged significant, most significant (lowest p-value) first.
significant_tids = (
    anova_df.loc[anova_df['significant']]
    .sort_values('pval')['reference_id']
    .tolist()
)

# Number of transcripts to plot (upper bound; fewer are plotted when fewer are significant)
max_transcripts = 5  # Adjust as needed
num_transcripts = min(max_transcripts, len(significant_tids))
top_significant_tids = significant_tids[:num_transcripts]
top_transcripts = top_significant_tids

# Filter data for these transcripts and label each row with its animal group.
# .assign on the filtered frame returns a new DataFrame, avoiding assignment on a slice.
top_significant_data = strg_tdf[strg_tdf['reference_id'].isin(top_significant_tids)].assign(
    group=lambda df: df['sample'].apply(lambda x: 'AY69' if 'AY69' in x else 'T034')
)

if num_transcripts == 0:
    print("no significant transcripts to plot")
else:
    # squeeze=False always yields a 2-D axes array, so a single subplot needs no special case
    fig, axes = plt.subplots(1, num_transcripts, figsize=(4*num_transcripts, 6), sharey=False, squeeze=False)

    # Plot each transcript as a separate split violin plot (one half per animal)
    for i, (tid, ax) in enumerate(zip(top_transcripts, axes[0])):
        transcript_data = strg_tdf[strg_tdf['reference_id'] == tid]

        sns.violinplot(
            data=transcript_data,
            x='reference_id',  # This will be constant within each subplot
            y='TPM',
            hue='sample',
            split=True,
            inner='quart',
            fill=False,
            ax=ax
        )

        # Customize the subplot
        ax.set_title(f"{tid}")
        ax.set_xlabel("")  # Remove x label as it's redundant

        # Only keep y-label for the first plot
        if i > 0:
            ax.set_ylabel("")

        # Remove legend from all but the last plot (get_legend() is None when none was drawn)
        legend = ax.get_legend()
        if i < num_transcripts - 1 and legend is not None:
            legend.remove()

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [21]:
# write a list of tiecov coverage and junction files and names of monkeys for the cov/sj/tn params of sashimi
# All three files have one line per monkey, in the same monkey order.
animals = metadata_df["animal"].unique()

cov_lst_fname = prj_aux_dir / 'cov_lst.txt'
cov_lst_fname.write_text(
    "".join(f"{prj_aln_dir}/{monkey}.tb.M33262.1.coverage.bedgraph\n" for monkey in animals)
)

sj_lst_fname = prj_aux_dir / 'sj_lst.txt'
sj_lst_fname.write_text(
    "".join(f"{prj_aln_dir}/{monkey}.tb.M33262.1.junctions.bed\n" for monkey in animals)
)

tn_lst_fname = prj_aux_dir / 'tn_lst.txt'
tn_lst_fname.write_text("".join(f"{monkey}\n" for monkey in animals));

In [None]:
# build sashimi plots for every reference transcript present in strg_tdf
# we want to compare tiebrush between T034 and AY69 for a given transcript only

# (output filename suffix, extra sashimi flags) for the two renderings of each transcript
sashimi_variants = [
    ("", ["--normalize", "--subtract", "0"]),  # normalized coverage
    (".non_normalized", []),                   # SASHIMI WITHOUT NORMALIZATION
]

# sorted unique ids give a reproducible plot order; missing ids are dropped
# NOTE(review): the "-" id (seen in the pivot table, presumably transcripts without a
# reference match) is still processed as before — confirm whether it should be skipped
for tid in sorted(strg_tdf['reference_id'].dropna().astype(str).unique()):
    # ids such as "env/vpu.1" contain "/", which would be read as a directory separator
    # when building the output paths; flatten it so the files land directly in prj_aux_dir
    safe_tid = tid.replace("/", "_")
    try:
        tx_gtf_fname = prj_aux_dir / f'{safe_tid}.gtf'

        # extract the transcript from the gtf
        definitions.subset_gtf(reference_gtf_fname,tx_gtf_fname,False,[tid])

        for suffix, extra_args in sashimi_variants:
            tx_svg_fname = prj_aux_dir / f'{safe_tid}{suffix}.svg'

            # build sashimi plot
            sashimi_cmd = [sashimi_bin,
                            "--title",tid,
                            "--gtf",str(tx_gtf_fname),
                            "-o",str(tx_svg_fname),
                            *extra_args,
                            "--intron_scale","1",
                            "--exon_scale","1",
                            "--tn",str(tn_lst_fname),
                            "--cov",str(cov_lst_fname),
                            "--sj",str(sj_lst_fname)]
            print(" ".join(sashimi_cmd))
            # check=True: a failing sashimi run is reported by the handler below instead of
            # showing up as a missing-SVG error
            subprocess.run(sashimi_cmd, check=True)

            # display the svg within notebook
            display(SVG(filename=str(tx_svg_fname)))
    except Exception as e:
        # best effort: report the failing transcript and move on to the next one
        print(f"Error processing {tid}: {e}")
        continue


In [None]:
# load transcriptomes for each monkey and investigate novel transcripts

In [None]:
# examine tiebrush outputs for each monkey