In [4]:
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
# Notebook-wide matplotlib defaults: a large 20x10 canvas and a serif font
# for every figure created after this cell runs.
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [10]:
# declarations
# Every tunable of the pipeline lives in this cell: the simulation layout,
# the data locations and the external executables used by the cells below.

# --- simulation layout -------------------------------------------------
# Samples are addressed as t<tissue>_s<sample> in all file names below.
num_tissues, num_samples = 3, 10
readlen = 101

# --- resources ---------------------------------------------------------
num_threads = 24

# --- data locations (all directory paths end with a trailing slash) ----
annotation_path = "/ccb/salz3/avaraby/chess/versions/curRelease/chess2.2_assembly.gtf"
idx_dir = "/ccb/salz8-1/avaraby/tx_noise/indices/"
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/sim_samples/"

# --- external tools ----------------------------------------------------
# stringtie2_path is a bare command name, so it is resolved through PATH;
# the other tools are pinned to absolute locations.
sim2sam_path = "/ccb/salz8-1/avaraby/tx_noise/sim2sam/sim2sam"
hisat_path = "/ccb/salz8-1/avaraby/agar_manuscript/soft/AGAR2/external/hisat2/hisat2"
stringtie2_path = "stringtie"
salmon_path = "/home/avaraby1/soft/salmon-latest_linux_x86_64/bin/salmon"
kallisto_path = "/home/avaraby1/soft/kallisto/kallisto"

In [9]:
# convert with sim2sam
# For every tissue/sample pair and each simulated read set, run sim2sam on
# the shuffled FASTA reads plus the matching GTF, write the result to
# <out_dir>strg.<type>.t<T>_s<S>/sim2sam.sam, then coordinate-sort it with
# samtools into sim2sam.sorted.bam in the same directory.
#
# NOTE(review): `hg38_fa` is not defined in the declarations cell visible in
# this notebook; it has to exist in the kernel (path to the genome FASTA whose
# ".fai" index is passed via -i, presumably) before this cell is run.
sample_types = ["real", "real_splicing", "real_intronic", "all"]

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        for sample_type in sample_types:
            sim_base = out_dir+sample_type+sample_tag          # simulated input
            strg_dir = out_dir+"strg."+sample_type+sample_tag  # per-sample output dir
            if not os.path.exists(strg_dir):
                os.makedirs(strg_dir)

            # Always join with "/" so the files land inside strg_dir; the
            # original omitted the separator for the "real_intronic" and
            # "all" sets and wrote next to the directory instead.
            sam_path = strg_dir+"/sim2sam.sam"
            bam_path = strg_dir+"/sim2sam.sorted.bam"

            plst_cmd = [sim2sam_path,
                        "-i",hg38_fa+".fai",
                        "-o",sam_path,
                        "-s",sim_base+"/sample_01.shuffled.fasta",
                        "-g",sim_base+".gtf",
                        "-t","polyester"]
            subprocess.call(plst_cmd)

            sort_cmd = ["samtools","sort",
                        "-@",str(num_threads),
                        "-o",bam_path,
                        sam_path]
            subprocess.call(sort_cmd)


Tissue #0

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #1

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #2

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++



In [None]:
# align with hisat2
# For every tissue/sample pair and each simulated read set, align the
# shuffled FASTA reads (-f, unpaired via -U) against the annotation index in
# idx_dir, write <out_dir>strg.<type>.t<T>_s<S>/hisat2.sam, then sort it with
# samtools into hisat2.sorted.bam in the same directory.
#
# Uses `hisat_path` and `num_threads` from the declarations cell; the
# previous `hisat` / `threads` names are not defined anywhere in the notebook
# and fail with NameError on a fresh kernel.
sample_types = ["real", "real_splicing", "real_intronic", "all"]

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        for sample_type in sample_types:
            reads_path = out_dir+sample_type+sample_tag+"/sample_01.shuffled.fasta"
            strg_dir = out_dir+"strg."+sample_type+sample_tag
            # Do not rely on the sim2sam cell having created the directory.
            if not os.path.exists(strg_dir):
                os.makedirs(strg_dir)

            sam_path = strg_dir+"/hisat2.sam"
            hisat_cmd = [hisat_path,
                         "-x",idx_dir+"annotation.hisat",
                         "-p",str(num_threads),
                         "--rna-sensitive","-f",
                         "-S",sam_path,
                         "-U",reads_path]
            subprocess.call(hisat_cmd)

            sort_cmd = ["samtools","sort",
                        "-@",str(num_threads),
                        "-o",strg_dir+"/hisat2.sorted.bam",
                        sam_path]
            subprocess.call(sort_cmd)


Tissue #0

++++++
>Sample #0
++++++



In [None]:
# assemble with stringtie
# For every tissue/sample pair and each simulated read set, run StringTie on
# <out_dir>strg.<type>.t<T>_s<S>/hisat2.sorted.bam, guided by the reference
# annotation (-G), and write the assembly to strg2.gtf in the same directory.
#
# The original cell only built the command lists and never executed them, so
# no assembly was produced; each command is now actually run. It also uses
# `stringtie2_path` and `num_threads` from the declarations cell instead of a
# hard-coded executable name and the undefined `threads`.
sample_types = ["real", "real_splicing", "real_intronic", "all"]

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        for sample_type in sample_types:
            strg_dir = out_dir+"strg."+sample_type+sample_tag
            strg_cmd = [stringtie2_path,
                        strg_dir+"/hisat2.sorted.bam",
                        "-p",str(num_threads),
                        "-G",annotation_path,
                        "-o",strg_dir+"/strg2.gtf"]
            subprocess.call(strg_cmd)

In [None]:
# quantify with salmon
# For every tissue/sample pair and each simulated read set, run `salmon quant`
# on the shuffled FASTA reads (-r) against the index at
# idx_dir+"annotation.salmon" and write the results into
# <out_dir>slmn.<type>.t<T>_s<S>.
#
# Uses `salmon_path` and `num_threads` from the declarations cell; the
# previous `threads` name is not defined anywhere in the notebook, and the
# bare "salmon" command ignored the executable configured there.
sample_types = ["real", "real_splicing", "real_intronic", "all"]

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        for sample_type in sample_types:
            salmon_cmd = [salmon_path,"quant","--validateMappings","-l","A",
                          "-i",idx_dir+"annotation.salmon",
                          "-p",str(num_threads),
                          "-o",out_dir+"slmn."+sample_type+sample_tag,
                          "-r",out_dir+sample_type+sample_tag+"/sample_01.shuffled.fasta"]
            subprocess.call(salmon_cmd)

In [None]:
# quantify with kallisto
# For every tissue/sample pair and each simulated read set, run
# `kallisto quant` in single-end mode on the shuffled FASTA reads against the
# index at idx_dir+"annotation.kallisto" and write the results into
# <out_dir>klst.<type>.t<T>_s<S>.
#
# Uses `kallisto_path` and `num_threads` from the declarations cell; the
# previous `threads` name is not defined anywhere in the notebook, and the
# bare "kallisto" command ignored the executable configured there.
sample_types = ["real", "real_splicing", "real_intronic", "all"]

# Values passed to -l / -s, which kallisto requires together with --single.
# NOTE(review): 250 / 25 are carried over unchanged from the original cell;
# confirm they match the fragment-length distribution used by the simulation.
fragment_len_mean = 250
fragment_len_sd = 25

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        for sample_type in sample_types:
            klst_cmd = [kallisto_path,"quant",
                        "-i",idx_dir+"annotation.kallisto",
                        "-o",out_dir+"klst."+sample_type+sample_tag,
                        "-t",str(num_threads),
                        "-l",str(fragment_len_mean),"-s",str(fragment_len_sd),
                        "--single",out_dir+sample_type+sample_tag+"/sample_01.shuffled.fasta"]
            subprocess.call(klst_cmd)