In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
# Notebook-wide plotting defaults: 20.0 x 10.0 figures rendered in a serif font.
plt.rcParams.update({
    'figure.figsize': (20.0, 10.0),
    'font.family': "serif",
})
%matplotlib inline

In [11]:
# ---------------------------------------------------------------------------
# Run configuration: everything below is read as a module-level global by the
# later cells of this notebook.
# ---------------------------------------------------------------------------

# Simulation layout: samples are addressed as (tissue index, sample index).
num_tissues, num_samples = 3, 10
readlen = 101

# Parallelism: threads handed to each external tool, and size of the worker pool.
num_threads, num_processes = 40, 1

# Input / output locations. Directory values end with "/" because paths are
# built from them by plain string concatenation.
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/sim_samples/"
idx_dir = "/ccb/salz8-1/avaraby/tx_noise/indices/"
annotation_path = "/ccb/salz3/avaraby/chess/versions/curRelease/chess2.2_assembly.gtf"
hg38_fa = "/home/avaraby1/genomes/human/hg38/hg38_p8.fa"

# External executables (absolute paths, except stringtie which is taken from PATH).
hisat_path = "/ccb/salz8-1/avaraby/agar_manuscript/soft/AGAR2/external/hisat2/hisat2"
kallisto_path = "/home/avaraby1/soft/kallisto/kallisto"
salmon_path = "/home/avaraby1/soft/salmon-latest_linux_x86_64/bin/salmon"
sim2sam_path = "/ccb/salz8-1/avaraby/tx_noise/sim2sam/sim2sam"
stringtie2_path = "stringtie"

In [12]:
def hisat2_align(ts, sample_types=("real",)):
    """Align, sort and assemble the reads of one simulated (tissue, sample) pair.

    For each entry of ``sample_types`` the reads in
    ``out_dir + "<type>.t<tn>_s<sn>/sample_01.shuffled.fasta"`` are aligned with
    HISAT2 against ``idx_dir + "annotation.hisat"``, the SAM output is converted
    with ``samtools sort`` and the sorted BAM is passed to StringTie together
    with ``annotation_path``. Results are written to
    ``out_dir + "strg.<type>.t<tn>_s<sn>/"`` as ``hisat2.sorted.bam``,
    ``strg2.genes`` and ``strg2.gtf``; the intermediate ``hisat2.sam`` is deleted.

    Parameters
    ----------
    ts : sequence
        ``ts[0]`` is the tissue index and ``ts[1]`` the sample index.
    sample_types : str or iterable of str, optional
        Directory prefixes of the simulated read sets to process. The default
        ``("real",)`` reproduces the single pipeline that used to be enabled.
        The prefixes ``"real_splicing"``, ``"real_intronic"``,
        ``"real_intergenic"`` and ``"all"`` follow the same directory layout and
        were previously present here as disabled copies of this pipeline.

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If HISAT2, samtools or StringTie exits with a non-zero status.
    """
    tn = ts[0]
    sn = ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")

    # NOTE(review): only tissue 0 / samples 4 and 6 are processed; every other
    # pair is announced above and then skipped. This looks like a targeted
    # re-run of two samples -- confirm before widening or removing the filter.
    if not (tn == 0 and (sn == 4 or sn == 6)):
        return

    # A bare string would otherwise be iterated character by character.
    if isinstance(sample_types, str):
        sample_types = (sample_types,)

    sample_tag = "t"+str(tn)+"_s"+str(sn)
    for sample_type in sample_types:
        _hisat2_stringtie_pipeline(sample_type, sample_tag)


def _hisat2_stringtie_pipeline(sample_type, sample_tag):
    """Run HISAT2 -> samtools sort -> StringTie for one read set of one sample."""
    reads_fasta = out_dir+sample_type+"."+sample_tag+"/sample_01.shuffled.fasta"
    strg_dir = out_dir+"strg."+sample_type+"."+sample_tag
    sam_path = strg_dir+"/hisat2.sam"
    bam_path = strg_dir+"/hisat2.sorted.bam"

    # exist_ok avoids the check-then-create race when several workers start up.
    os.makedirs(strg_dir, exist_ok=True)

    # check_call (instead of call) stops at the first failing tool, so a later
    # step is never run on output that was not produced.
    hisat_cmd = [hisat_path,
                 "-x",idx_dir+"annotation.hisat",
                 "-p",str(num_threads),
                 "--rna-sensitive","-f",
                 "-S",sam_path,
                 "-U",reads_fasta]
    subprocess.check_call(hisat_cmd)

    sort_cmd = ["samtools","sort",
                "-@",str(num_threads),
                "-o",bam_path,
                sam_path]
    subprocess.check_call(sort_cmd)

    # The SAM is only an intermediate: drop it once the sorted BAM exists.
    # If sorting failed, the line above raised and the SAM is kept for inspection.
    os.remove(sam_path)

    # Use the configured StringTie executable rather than a hard-coded name.
    strg_cmd = [stringtie2_path,
                bam_path,
                "-p",str(num_threads),
                "-G",annotation_path,
                "-A",strg_dir+"/strg2.genes",
                "-o",strg_dir+"/strg2.gtf"]
    subprocess.check_call(strg_cmd)

In [13]:
# Every (tissue index, sample index) pair, tissue-major, as consumed by hisat2_align.
# A comprehension keeps the loop variables out of the notebook's global namespace.
samples = [(tissue_idx, sample_idx)
           for tissue_idx in range(num_tissues)
           for sample_idx in range(num_samples)]

In [14]:
# Run hisat2_align over all (tissue, sample) pairs with `num_processes` workers.
# The context manager terminates the worker processes even when pool.map raises
# (for example when an exception is propagated from a worker), so a failed run
# does not leave orphaned workers behind in the kernel.
with multiprocessing.Pool(processes=num_processes) as pool:
    pool_outputs = pool.map(hisat2_align, samples)
    # On success, shut down gracefully before the context manager exits.
    pool.close()
    pool.join()
print('Pool:', pool_outputs)


Tissue #0 - Sample #0


Tissue #0 - Sample #1


Tissue #0 - Sample #2


Tissue #0 - Sample #3


Tissue #0 - Sample #4


Tissue #0 - Sample #5


Tissue #0 - Sample #6


Tissue #0 - Sample #7


Tissue #0 - Sample #8


Tissue #0 - Sample #9


Tissue #1 - Sample #0


Tissue #1 - Sample #1


Tissue #1 - Sample #2


Tissue #1 - Sample #3


Tissue #1 - Sample #4


Tissue #1 - Sample #5


Tissue #1 - Sample #6


Tissue #1 - Sample #7


Tissue #1 - Sample #8


Tissue #1 - Sample #9


Tissue #2 - Sample #0


Tissue #2 - Sample #1


Tissue #2 - Sample #2


Tissue #2 - Sample #3


Tissue #2 - Sample #4


Tissue #2 - Sample #5


Tissue #2 - Sample #6


Tissue #2 - Sample #7


Tissue #2 - Sample #8


Tissue #2 - Sample #9

Pool: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
