In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [19]:
# declarations
# --- filesystem locations -------------------------------------------------
# The two directory strings keep their trailing "/" on purpose: the
# quantification cells build paths by plain string concatenation.
idx_dir = "/ccb/salz8-1/avaraby/tx_noise/indices/"
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/sim_samples/"
# Reference files; not referenced by the cells visible in this section.
hg38_fa = "/home/avaraby1/genomes/human/hg38/hg38_p8.fa"
annotation_path = "/ccb/salz3/avaraby/chess/versions/curRelease/chess2.2_assembly.gtf"

# --- external quantifier binaries -----------------------------------------
# NOTE(review): the quantification functions in this section invoke the bare
# `salmon` / `kallisto` names rather than these paths -- confirm intent.
salmon_path = "/home/avaraby1/soft/salmon-latest_linux_x86_64/bin/salmon"
kallisto_path = "/home/avaraby1/soft/kallisto/kallisto"

# --- simulation layout ----------------------------------------------------
# Every (tissue, sample) pair in range(num_tissues) x range(num_samples)
# is quantified.
num_tissues, num_samples = 3, 10

# Forwarded to kallisto as its `-l` value.
readlen = 101

# --- parallelism ----------------------------------------------------------
# num_threads   : threads handed to each salmon / kallisto invocation
# num_processes : size of the multiprocessing pool that launches them
num_threads, num_processes = 8, 5

In [10]:
# quantify with salmon
def run_salmon(ts):
    """Quantify every simulated read set of one tissue/sample pair with salmon.

    ``salmon quant`` is run once for each of the five read sets (``real``,
    ``real_splicing``, ``real_intronic``, ``real_intergenic``, ``all``), in
    that order.  For a read set ``<set>`` the reads are taken from
    ``<out_dir><set>.t<tn>_s<sn>/sample_01.shuffled.fasta`` and the results
    are written to ``<out_dir>slmn.<set>.t<tn>_s<sn>``.

    Uses the notebook-level ``idx_dir``, ``out_dir`` and ``num_threads``.

    Parameters
    ----------
    ts : sequence
        ``(tissue_number, sample_number)``; only items 0 and 1 are read.

    Returns
    -------
    None
        A non-zero salmon exit status is reported on stderr; the remaining
        read sets are still processed.
    """
    tn = ts[0]
    sn = ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")

    # Suffix shared by every input and output directory of this pair.
    pair_tag = ".t"+str(tn)+"_s"+str(sn)

    # NOTE(review): the configuration cell declares `salmon_path`, but the bare
    # `salmon` executable (resolved through PATH) is what gets invoked here.
    # Left unchanged so the same binary keeps running -- confirm which is meant.
    for read_set in ("real", "real_splicing", "real_intronic", "real_intergenic", "all"):
        quant_dir = out_dir+"slmn."+read_set+pair_tag
        reads_fa = out_dir+read_set+pair_tag+"/sample_01.shuffled.fasta"
        # `-l A` lets salmon infer the library type; `-r` passes unpaired reads.
        salmon_cmd = ["salmon","quant","--validateMappings","-l","A",
                      "-i",idx_dir+"annotation.salmon",
                      "-p",str(num_threads),
                      "-o",quant_dir,
                      "-r",reads_fa]
        status = subprocess.call(salmon_cmd)
        if status != 0:
            # Previously the exit status was dropped, so a failed run looked
            # identical to a successful one.  Report it, but keep going.
            print("WARNING: salmon exited with status "+str(status)+" for "+quant_dir,
                  file=sys.stderr)

In [11]:
# Every (tissue, sample) index pair, tissue-major: (0,0), (0,1), ..., then
# tissue 1, and so on up to (num_tissues-1, num_samples-1).
samples = [
    (tissue_idx, sample_idx)
    for tissue_idx in range(num_tissues)
    for sample_idx in range(num_samples)
]

In [16]:
# Run salmon sequentially, starting at tissue 2 / sample 3: every pair that
# sorts before (2, 3) is skipped.
# NOTE(review): presumably this resumes an interrupted run whose earlier
# samples were already quantified -- on a fresh kernel those pairs will NOT
# be processed by this cell. Confirm before relying on a full re-run.
for sample in samples:
    # Lexicographic comparison: tissue index first, then sample index.
    if tuple(sample) >= (2, 3):
        run_salmon(sample)

# Parallel alternative, currently disabled:
# pool = multiprocessing.Pool(processes=num_processes)
# pool_outputs = pool.map(run_salmon, samples)
# pool.close()
# pool.join()
# print('Pool:', pool_outputs)


Tissue #2 - Sample #3


Tissue #2 - Sample #4


Tissue #2 - Sample #5


Tissue #2 - Sample #6


Tissue #2 - Sample #7


Tissue #2 - Sample #8


Tissue #2 - Sample #9



In [17]:
# quantify with kallisto
def run_kallisto(ts):
    """Quantify every simulated read set of one tissue/sample pair with kallisto.

    ``kallisto quant`` is run once for each of the five read sets (``real``,
    ``real_splicing``, ``real_intronic``, ``real_intergenic``, ``all``), in
    that order.  For a read set ``<set>`` the reads are taken from
    ``<out_dir><set>.t<tn>_s<sn>/sample_01.shuffled.fasta`` and the results
    are written to ``<out_dir>klst.<set>.t<tn>_s<sn>``.

    Uses the notebook-level ``idx_dir``, ``out_dir``, ``num_threads`` and
    ``readlen``.

    Parameters
    ----------
    ts : sequence
        ``(tissue_number, sample_number)``; only items 0 and 1 are read.

    Returns
    -------
    None
        A non-zero kallisto exit status is reported on stderr; the remaining
        read sets are still processed.
    """
    tn = ts[0]
    sn = ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")

    # Suffix shared by every input and output directory of this pair.
    pair_tag = ".t"+str(tn)+"_s"+str(sn)

    # NOTE(review): the configuration cell declares `kallisto_path`, but the
    # bare `kallisto` executable (resolved through PATH) is what gets invoked
    # here.  Left unchanged so the same binary keeps running -- confirm intent.
    for read_set in ("real", "real_splicing", "real_intronic", "real_intergenic", "all"):
        quant_dir = out_dir+"klst."+read_set+pair_tag
        reads_fa = out_dir+read_set+pair_tag+"/sample_01.shuffled.fasta"
        # Single-end mode needs a fragment-length mean (`-l`) and standard
        # deviation (`-s`); the read length and 0.0001 are supplied.
        klst_cmd = ["kallisto","quant",
                    "-i",idx_dir+"annotation.kallisto",
                    "-o",quant_dir,
                    "-t",str(num_threads),
                    "-l",str(readlen),"-s",str(0.0001),
                    "--single",reads_fa]
        status = subprocess.call(klst_cmd)
        if status != 0:
            # Previously the exit status was dropped, so a failed run looked
            # identical to a successful one.  Report it, but keep going.
            print("WARNING: kallisto exited with status "+str(status)+" for "+quant_dir,
                  file=sys.stderr)

In [20]:
# Quantify all (tissue, sample) pairs with kallisto, `num_processes` pairs at a
# time; each worker launches kallisto with `num_threads` threads of its own.
pool = multiprocessing.Pool(processes=num_processes)
try:
    # map() blocks until every pair has been processed.
    pool_outputs = pool.map(run_kallisto, samples)
finally:
    # Always release the workers, even when map() raises; otherwise the worker
    # processes stay alive in the kernel.
    pool.close()
    pool.join()
print('Pool:', pool_outputs)


Tissue #0 - Sample #6

Tissue #0 - Sample #8



Tissue #0 - Sample #0

Tissue #0 - Sample #2



Tissue #0 - Sample #4


Tissue #0 - Sample #1


Tissue #0 - Sample #9


Tissue #0 - Sample #5


Tissue #0 - Sample #7


Tissue #0 - Sample #3


Tissue #1 - Sample #0


Tissue #1 - Sample #2


Tissue #1 - Sample #4


Tissue #1 - Sample #6


Tissue #1 - Sample #8


Tissue #1 - Sample #5


Tissue #1 - Sample #1


Tissue #1 - Sample #3


Tissue #1 - Sample #7


Tissue #1 - Sample #9


Tissue #2 - Sample #0


Tissue #2 - Sample #2


Tissue #2 - Sample #4


Tissue #2 - Sample #6


Tissue #2 - Sample #8


Tissue #2 - Sample #1


Tissue #2 - Sample #3


Tissue #2 - Sample #5


Tissue #2 - Sample #7


Tissue #2 - Sample #9

Pool: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
