In [None]:
import os
import re
import sys
import csv
import time
import pysam
import random
import requests
import subprocess
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

from pysradb import SRAweb

from Bio import SeqIO

from IPython.display import SVG
from IPython.display import display

In [None]:
# ---------------------------------------------------------------------------
# Project layout. Every location is derived from the notebook's working
# directory: `base_dir` sits two levels above it, and the sibling
# HIV_Atlas_* checkouts sit one level above `base_dir`.
# ---------------------------------------------------------------------------
base_dir = Path.cwd().parent.parent
data_dir = base_dir.parent / 'HIV_Atlas_Creation' / 'data'
soft_dir = base_dir / 'soft'

prj_id = 'BoltonSIV'

# Per-project result directories.
outdir = base_dir / 'results' / prj_id
prj_aln_dir = outdir / 'alignment'
prj_assembly_dir = outdir / 'assembly'
prj_quant_dir = outdir / 'quantification'
prj_aux_dir = outdir / 'auxiliary'

# Create the result tree up front (no-op for directories that already exist).
for _prj_dir in (outdir, prj_aln_dir, prj_assembly_dir, prj_quant_dir, prj_aux_dir):
    _prj_dir.mkdir(parents=True, exist_ok=True)

# Reference files used for alignment (HISAT index basename, FASTA, GTF).
hisat_idx_basename = base_dir / 'data/SIV239/reference'
reference_fasta_fname = base_dir / 'data/SIV239/reference.fasta'
reference_gtf_fname = base_dir / 'data/SIV239/reference.gtf'

# M33262.1 genome sequence (resolved to an absolute path) and annotations.
siv239_fasta_fname = (
    base_dir.parent / "HIV_Atlas_Data/data/M33262.1/M33262.1.fasta"
).expanduser().resolve()
siv239_gtf_fname = base_dir / 'data/SIV239/M33262.1.gtf'
siv239_gff_fname = base_dir / 'data/SIV239/M33262.1.gff'

# License file handed to the majiq/voila command lines.
majiq_license_fname = soft_dir / 'majiq_license_academic_official.lic'

# Tab-separated per-sample metadata table.
sivPlus_info_fname = data_dir / "info_SIVplus.tsv"

In [4]:
# Enable IPython's autoreload extension. Mode 1 reloads, before each cell
# execution, only those modules that were registered through %aimport.
%load_ext autoreload
%autoreload 1

# Put <soft_dir>/genomic_scripts at the front of the module search path so it
# takes precedence over any same-named installed module.
# NOTE(review): presumably this is where definitions.py lives — confirm.
sys.path.insert(0, str(soft_dir / "genomic_scripts"))
# Import `definitions` and register it for automatic reloading.
%aimport definitions

In [5]:
# Load the per-sample metadata table. The first column is read as the index
# and then turned back into a regular column by reset_index().
metadata_df = pd.read_csv(sivPlus_info_fname, sep='\t', index_col=0).reset_index()

# Reduce each "Experiment Title" to the text between the first ':' and the
# first ';' / ',' that follows it, then show the distinct values.
title_after_colon = metadata_df["Experiment Title"].str.split(":", expand=True)[1]
title_before_semicolon = title_after_colon.str.split(";", expand=True)[0]
title_before_comma = title_before_semicolon.str.split(",", expand=True)[0]
print(set(title_before_comma.unique()))

metadata_df.head()

{' Memory CD4 T cell gene expression from macaque AY69 lymph node', ' Memory CD4 T cell gene expression from macaque T034 PBMC'}


Unnamed: 0,animal,sample,Cell number,SIV reads,SIV prop,SIV prop >0.01,in final filtered seurat,D1-A5 splice,tatrev splice,SIV pos,number of splice varieties seen,passed initial sc QC,Experiment Accession,Experiment Title,Sample Accession,Total Spots,Total Bases,Run
0,AY69,A1p1,1,42031,0.020604,1,1,127,0,1,1,1,SRX20459058,GSM7399872: Memory CD4 T cell gene expression ...,SRS17766270,2585301,661808315,SRR24679672
1,AY69,A2p1,1,43091,0.022958,1,1,264,841,1,2,1,SRX20458082,GSM7399879: Memory CD4 T cell gene expression ...,SRS17765295,2683173,660469952,SRR24678661
2,AY69,A6p1,1,157834,0.072799,1,1,2250,3347,1,2,1,SRX20458792,GSM7399907: Memory CD4 T cell gene expression ...,SRS17766006,2702252,664149793,SRR24679087
3,AY69,A8p1,1,123108,0.104491,1,1,1159,1089,1,2,1,SRX20458806,GSM7399921: Memory CD4 T cell gene expression ...,SRS17766018,1741331,447735007,SRR24679073
4,AY69,B12p1,1,111957,0.073472,1,1,961,1613,1,2,1,SRX20458844,GSM7399949: Memory CD4 T cell gene expression ...,SRS17766056,2032111,518745652,SRR24679461


In [None]:
# get read length stats
#
# Collect the query-sequence length of every primary alignment in the
# M33262.1 BAM files found anywhere under `outdir`, then plot the
# distribution as a histogram.

# Sorted so that files are processed in a deterministic order.
bams = sorted(outdir.glob("**/SRR*M33262.1.sorted.bam"))

# Accumulates one length per primary alignment.
read_lengths = []

for bam_file in bams:
    print(f"Processing {bam_file}...")
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            # Count each read once: ignore secondary/supplementary records.
            if read.is_secondary or read.is_supplementary:
                continue
            # query_sequence is None when the record stores no sequence;
            # len(None) would raise, so such records are skipped.
            if read.query_sequence is None:
                continue
            read_lengths.append(len(read.query_sequence))

# Explicit integer dtype: an empty list would otherwise become a float array,
# which np.bincount (used for the mode in the summary cell) rejects.
read_lengths = np.array(read_lengths, dtype=np.int64)

# Plot histogram
plt.figure(figsize=(10, 6))
plt.hist(read_lengths, bins=50, color='skyblue', edgecolor='black')
plt.title("Read Length Distribution", fontsize=16)
plt.xlabel("Read Length", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [8]:
# print distribution
#
# Compute the summary statistics of `read_lengths` first, then report them.
mean_len = np.mean(read_lengths)
median_len = np.median(read_lengths)
std_len = np.std(read_lengths)
min_len = np.min(read_lengths)
max_len = np.max(read_lengths)
pct25_len = np.percentile(read_lengths, 25)
pct75_len = np.percentile(read_lengths, 75)
# Most frequent length: index of the largest per-value count.
mode_len = np.argmax(np.bincount(read_lengths))

print("Read Length Distribution")
print(f"Mean: {mean_len}")
print(f"Median: {median_len}")
print(f"Standard Deviation: {std_len}")
print(f"Minimum: {min_len}")
print(f"Maximum: {max_len}")
print(f"25th Percentile: {pct25_len}")
print(f"75th Percentile: {pct75_len}")
print(f"mode: {mode_len}")

Read Length Distribution
Mean: 121.45015327098109
Median: 141.0
Standard Deviation: 35.18652243652309
Minimum: 35
Maximum: 151
25th Percentile: 95.0
75th Percentile: 151.0
mode: 151


In [6]:
# setup MAJIQ

# write config
#
# The file consists of an [info] header followed by an [experiments] section
# holding one "<animal>=<bam stem>,<bam stem>,..." line per animal, where a
# bam stem is "<Run>.M33262.1.sorted".
config_str = (
    "[info]\n"
    "readlen=151\n"
    f"bamdirs={prj_aln_dir}\n"
    "genome=mmul10_siv239\n"
    f"genome_path={siv239_fasta_fname}\n"
    "[experiments]"
)
for monkey in metadata_df["animal"].unique():
    is_monkey = metadata_df["animal"] == monkey
    experiment_line = f"\n{monkey}="
    for run_id in metadata_df.loc[is_monkey, "Run"].tolist():
        bam_fname = f"{run_id}.M33262.1.sorted"
        experiment_line += f"{bam_fname},"
    # Drop the final character of the line (the trailing comma).
    config_str += experiment_line[:-1]

config_fname = prj_aux_dir / "majiq_config.txt"
with open(config_fname, "w") as fh:
    fh.write(config_str)

In [None]:
# Run `majiq build` on the GFF annotation using the config written above;
# results go to <prj_aux_dir>/majiq_build.
#
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handed to majiq verbatim.
cmd = [
    "majiq",
    "--license", str(majiq_license_fname),
    "build", str(siv239_gff_fname),
    "--conf", str(config_fname),
    "--nproc", "80",
    "--output", str(prj_aux_dir / 'majiq_build'),
]
print(" ".join(cmd))
# check=True raises CalledProcessError on a non-zero exit, so a failed build
# stops the notebook instead of letting later cells run on missing output.
subprocess.run(cmd, check=True).returncode

In [None]:
# Run `majiq deltapsi` comparing the two animals; results go to
# <prj_aux_dir>/majiq_deltapsi.

# Map every animal to its expected per-run .majiq files under majiq_build.
majiq_files = {
    monkey: [
        f"{prj_aux_dir}/majiq_build/{run_id}.M33262.1.sorted.majiq"
        for run_id in metadata_df.loc[metadata_df["animal"] == monkey, "Run"].tolist()
    ]
    for monkey in metadata_df["animal"].unique()
}

# The two groups being compared (also passed to majiq as the group names).
grp1_name, grp2_name = "AY69", "T034"

# Fail early with a clear message rather than a bare KeyError or a majiq
# error deep inside the run.
for grp_name in (grp1_name, grp2_name):
    if not majiq_files.get(grp_name):
        raise ValueError(f"No runs found in metadata for animal {grp_name!r}")
    missing = [fname for fname in majiq_files[grp_name] if not os.path.exists(fname)]
    if missing:
        raise FileNotFoundError(
            f"Missing majiq build output for {grp_name}: {', '.join(missing)}"
        )

# Argument list (no shell): file names are passed to majiq verbatim.
cmd = [
    "majiq",
    "--license", str(majiq_license_fname),
    "deltapsi",
    "--nproc", "80",
    "-n", grp1_name, grp2_name,
    "-grp1", *majiq_files[grp1_name],
    "-grp2", *majiq_files[grp2_name],
    "--output", str(prj_aux_dir / 'majiq_deltapsi'),
]
print(" ".join(cmd))
# check=True raises CalledProcessError when majiq exits with an error.
subprocess.run(cmd, check=True).returncode

In [None]:
# Launch the `voila view` web viewer on port 5234 for the splicegraph and the
# AY69-vs-T034 deltapsi result. The call blocks until the viewer exits.
#
# Argument list (no shell): paths are passed to voila verbatim, even when
# they contain spaces or shell metacharacters.
cmd = [
    "voila",
    "--license", str(majiq_license_fname),
    "view",
    "-p", "5234",
    str(prj_aux_dir / 'majiq_build/splicegraph.sql'),
    str(prj_aux_dir / 'majiq_deltapsi/AY69-T034.deltapsi.voila'),
]
print(" ".join(cmd))
# The exit status is returned rather than enforced: the viewer is normally
# stopped by interrupting it.
subprocess.call(cmd)