In [3]:
import multiprocessing
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [4]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable value for this notebook lives in this cell.
# ---------------------------------------------------------------------------

# Data locations. gen_sam reads its per-sample inputs from, and writes its
# "strg.*" output directories under, out_dir; it passes hg38_fa + ".fai" to
# sim2sam as the "-i" argument.
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/sim_samples/"
idx_dir = "/ccb/salz8-1/avaraby/tx_noise/indices/"
hg38_fa = "/home/avaraby1/genomes/human/hg38/hg38_p8.fa"

# Experiment layout: tissues x samples-per-tissue define the work items.
num_tissues, num_samples = 3, 10

# Read length -- presumably of the simulated reads; TODO confirm against the
# simulation step, which is not part of the cells shown here.
readlen = 101

# External executables. stringtie2_path is a bare command name (no directory).
hisat_path = "/ccb/salz8-1/avaraby/agar_manuscript/soft/AGAR2/external/hisat2/hisat2"
kallisto_path = "/home/avaraby1/soft/kallisto/kallisto"
salmon_path = "/home/avaraby1/soft/salmon-latest_linux_x86_64/bin/salmon"
sim2sam_path = "/ccb/salz8-1/avaraby/tx_noise/sim2sam/sim2sam"
stringtie2_path = "stringtie"

# Used both as the worker-pool size and as the "-@" thread count handed to
# each `samtools sort` invocation.
num_threads = 5

In [5]:
# Dataset prefixes converted for every (tissue, sample) pair, in processing order.
_SIM_DATASETS = ("real", "real_splicing", "real_intronic", "real_intergenic", "all")


def _sim2sam_to_sorted_bam(dataset, sample_tag):
    """Run sim2sam and `samtools sort` for one dataset of one sample.

    Inputs are read from ``out_dir + dataset + sample_tag`` (the
    ``/sample_01.shuffled.fasta`` file and the sibling ``.sorted.gtf``);
    outputs go to the directory ``out_dir + "strg." + dataset + sample_tag``,
    which is created if missing.

    Args:
        dataset: Dataset prefix, e.g. ``"real"`` or ``"real_intronic"``.
        sample_tag: Sample suffix of the form ``".t<tissue>_s<sample>"``.

    Returns:
        True if both tools exited with status 0 (the intermediate SAM has then
        been deleted); False otherwise (the SAM, if any, is left on disk).
    """
    src_base = out_dir + dataset + sample_tag
    dst_dir = out_dir + "strg." + dataset + sample_tag
    sam_path = dst_dir + "/sim2sam.sam"
    bam_path = dst_dir + "/sim2sam.sorted.bam"

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dst_dir, exist_ok=True)

    sim2sam_cmd = [sim2sam_path,
                   "-i", hg38_fa + ".fai",
                   "-o", sam_path,
                   "-s", src_base + "/sample_01.shuffled.fasta",
                   "-g", src_base + ".sorted.gtf",
                   "-t", "polyester"]
    status = subprocess.call(sim2sam_cmd)
    if status != 0:
        # Sorting a missing or partial SAM would only produce a misleading BAM.
        print("[gen_sam] sim2sam exited with status " + str(status) + " for "
              + dst_dir + "; skipping samtools sort", file=sys.stderr)
        return False

    sort_cmd = ["samtools", "sort",
                "-@", str(num_threads),
                "-o", bam_path,
                sam_path]
    status = subprocess.call(sort_cmd)
    if status != 0:
        # Keep the SAM: it is the only usable output when sorting failed.
        print("[gen_sam] samtools sort exited with status " + str(status) + " for "
              + sam_path + "; keeping the unsorted SAM", file=sys.stderr)
        return False

    # The sorted BAM supersedes the intermediate SAM.
    os.remove(sam_path)
    return True


def gen_sam(ts):
    """Build sorted BAM files from the simulated reads of one tissue/sample pair.

    For each dataset prefix in ``_SIM_DATASETS`` this runs sim2sam on the
    sample's shuffled FASTA and sorted GTF, sorts the resulting SAM with
    ``samtools sort`` and removes the SAM once the sort has succeeded.  A
    failing dataset is reported on stderr and does not stop the remaining
    datasets from being processed.

    Args:
        ts: Indexable whose first two items are the tissue number and the
            sample number.

    Returns:
        None.
    """
    tn, sn = ts[0], ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")

    sample_tag = ".t" + str(tn) + "_s" + str(sn)
    for dataset in _SIM_DATASETS:
        _sim2sam_to_sorted_bam(dataset, sample_tag)

In [6]:
# Work items for gen_sam: one (tissue, sample) index pair per simulated sample,
# ordered tissue-major (all samples of tissue 0 first, then tissue 1, ...).
samples = [(tn, sn) for tn in range(num_tissues) for sn in range(num_samples)]

In [7]:
# Run gen_sam over all (tissue, sample) pairs with num_threads worker processes.
# The context manager terminates the workers on exit even when pool.map raises
# or the cell is interrupted, so no worker processes are left behind in the
# kernel; pool.map itself only returns after every work item has completed.
# NOTE(review): gen_sam also passes num_threads as "-@" to each samtools sort,
# so up to num_threads * num_threads sort threads may be requested at once.
with multiprocessing.Pool(processes=num_threads) as pool:
    pool_outputs = pool.map(gen_sam, samples)
print('Pool:', pool_outputs)


Tissue #0 - Sample #0

Tissue #0 - Sample #4

Tissue #0 - Sample #6

Tissue #0 - Sample #2

Tissue #0 - Sample #8






Tissue #0 - Sample #5


Tissue #0 - Sample #1


Tissue #0 - Sample #9


Tissue #0 - Sample #7


Tissue #0 - Sample #3


Tissue #1 - Sample #0


Tissue #1 - Sample #2


Tissue #1 - Sample #4


Tissue #1 - Sample #6


Tissue #1 - Sample #8


Tissue #1 - Sample #5


Tissue #1 - Sample #1


Tissue #1 - Sample #3


Tissue #1 - Sample #7


Tissue #1 - Sample #9


Tissue #2 - Sample #0


Tissue #2 - Sample #2


Tissue #2 - Sample #4


Tissue #2 - Sample #6


Tissue #2 - Sample #8


Tissue #2 - Sample #1


Tissue #2 - Sample #3


Tissue #2 - Sample #5


Tissue #2 - Sample #9


Tissue #2 - Sample #7

Pool: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
