In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
# default size (width, height) applied to every matplotlib figure created in this notebook
plt.rcParams['figure.figsize'] = (20.0, 10.0)
# draw all figure text with a serif font family
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [5]:
# declarations
# NOTE(review): base_dir_data and base_dir_out are not referenced by any cell visible here -
# presumably used elsewhere in the notebook; confirm before removing.
base_dir_data = "data/"
base_dir_out = "data/gtex_aggs/"
# prefix of every file read or written by the cells below; paths are built by plain string
# concatenation, so the trailing "/" is required
out_dir = "analysis_21042020/"
# genome fasta handed to gffread through its -g option
hg38_fa = "hg38_p12_ucsc.no_alts.no_fixs.fa"

# loop bounds: files are named <type>.t<tissue>_s<sample>.* with tissue in range(num_tissues)
# and sample in range(num_samples)
num_tissues = 3
num_samples = 10

# passed (as a string) as the third argument of the genRNAseq script
readlen = 101

# number of worker *processes* given to multiprocessing.Pool
num_threads = 4

# external executables / scripts invoked through subprocess
gffread_path = "gffread"
genRNAseq_path = "soft/genRNAseq.R"
shuffleReads_path = "soft/shuffleReads.sh"

# column names assigned when the tab-separated .gtf files are loaded with pandas
gff3cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [4]:
# first extract fasta sequences for each annotation
# One gffread call per (tissue, sample, read type): reads <stem>.gtf and writes the
# transcript sequences to <stem>.fasta. Exit codes are not inspected.
for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        sample_tag = ".t"+str(tissue_num)+"_s"+str(sample_num)
        # same order as the original stanzas: real, splicing, intronic, intergenic
        for read_type in ("real", "splicing", "intronic", "intergenic"):
            stem = out_dir+read_type+sample_tag
            subprocess.call([gffread_path,
                             "-w", stem+".fasta",
                             "-g", hg38_fa,
                             stem+".gtf"])


Tissue #0

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #1

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #2

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++



In [6]:
# since gffread might have written data in the wrong order - we now need to reorganize the coverages
# such that the order of extracted transcripts is the same as the order in the annotation and the order of coverages

def _read_fasta_ids(fasta_path):
    """Return the sequence identifiers of a FASTA file, in file order.

    Only the first whitespace-delimited token after ">" is kept, so extra
    fields on the header line never leak into the identifier, and a missing
    trailing newline on the last line cannot truncate it.
    """
    ids = []
    with open(fasta_path) as in_fp:
        # iterate lazily instead of readlines(): sequence lines are never needed
        for line in in_fp:
            if line.startswith(">"):
                fields = line[1:].split()
                ids.append(fields[0] if fields else "")
    return ids


def _reorder_to_gffread(stem):
    """Rewrite the per-transcript files of one data set in gffread's output order.

    Reads  <stem>.exp, <stem>.tpm, <stem>.cov (one value per transcript, no
    header), <stem>.gtf and <stem>.fasta.
    Writes <stem>.sorted.gtf, <stem>.ordered.exp, <stem>.ordered.cov and
    <stem>.ordered.tpm, with rows following the order of the FASTA headers.

    Raises AssertionError when the sim_tpm values stored in the GTF disagree
    with <stem>.exp, or when a FASTA identifier has no transcript in the GTF.
    """
    tpm = pd.read_csv(stem+".exp",names=["tpm"])
    new_tpm = pd.read_csv(stem+".tpm",names=["new_tpm"])
    cov = pd.read_csv(stem+".cov",names=["cov"])
    gtf = pd.read_csv(stem+".gtf",names=gff3cols,sep="\t")
    gtf["tid"] = gtf["attributes"].str.split("transcript_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]

    exps = gtf[gtf["type"]=="transcript"].reset_index(drop=True)
    exps["tpm_gtf"] = exps["attributes"].str.split("sim_tpm=",expand=True,n=1)[1].str.split(";",expand=True,n=1)[0].astype(float)
    # first check that tpms in the gtf correspond to tpms in the .exp file
    # (rows are paired by position, so this also catches a length mismatch)
    exps = pd.concat([exps,tpm],axis=1)
    assert len(exps[~(exps["tpm"]==exps["tpm_gtf"])])==0,"incorrect tpms"
    # next attach coverage and final tpms, again by position
    exps = pd.concat([exps,cov,new_tpm],axis=1)
    exps = exps[["tid","tpm_gtf","cov","new_tpm"]]

    # ordering of transcripts as written by gffread
    gffread_order = pd.DataFrame(_read_fasta_ids(stem+".fasta"),columns=["tid"])

    # left-merge on the gffread order: the result follows the FASTA order
    gtf_sorted = gffread_order.merge(gtf,how="left",on="tid")
    exps_sorted = gffread_order.merge(exps,how="left",on="tid")
    # an identifier that matched nothing would otherwise be written out as empty/NaN rows
    assert not exps_sorted["tpm_gtf"].isna().any(),"fasta transcript missing from gtf"

    # lastly need to save new data
    gtf_sorted[gff3cols].to_csv(stem+".sorted.gtf",sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE)
    # save the tpms and coverages now
    exps_sorted[["tpm_gtf"]].to_csv(stem+".ordered.exp",index=False,header=False)
    exps_sorted[["cov"]].to_csv(stem+".ordered.cov",index=False,header=False)
    exps_sorted[["new_tpm"]].to_csv(stem+".ordered.tpm",index=False,header=False)


for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        for read_type in ("real","splicing","intronic","intergenic"):
            _reorder_to_gffread(out_dir+read_type+".t"+str(tissue_num)+"_s"+str(sample_num))


Tissue #0

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #1

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++


Tissue #2

++++++
>Sample #0
++++++

++++++
>Sample #1
++++++

++++++
>Sample #2
++++++

++++++
>Sample #3
++++++

++++++
>Sample #4
++++++

++++++
>Sample #5
++++++

++++++
>Sample #6
++++++

++++++
>Sample #7
++++++

++++++
>Sample #8
++++++

++++++
>Sample #9
++++++



In [7]:
# gen sample
def gen_sample(ts):
    """Run the genRNAseq script for every read type of one sample.

    ts: indexable pair (tissue number, sample number).

    For each of real / splicing / intronic / intergenic (in that order) the
    output directory <out_dir><type>.t<tn>_s<sn> is created when it does not
    exist, then the script is invoked with the type's .fasta, its
    .ordered.cov, the read length, the output directory and a literal "0".
    NOTE(review): the meaning of the trailing "0" is defined by the script,
    not visible here.

    Exit codes of the script are not inspected; the function always returns 1.
    """
    tissue, sample = ts[0], ts[1]
    print("\n=================\nTissue #"+str(tissue)+" - Sample #"+str(sample)+"\n=================\n")
    sample_tag = ".t"+str(tissue)+"_s"+str(sample)

    for read_type in ("real", "splicing", "intronic", "intergenic"):
        stem = out_dir+read_type+sample_tag
        if not os.path.exists(stem):
            os.makedirs(stem)
        subprocess.call([genRNAseq_path,
                         stem+".fasta",
                         stem+".ordered.cov",
                         str(readlen),
                         stem+"/",
                         "0"])

    return 1

In [8]:
# every (tissue, sample) index pair in tissue-major order; this list is what the
# multiprocessing pools below map their worker functions over
samples = [(tn,sn) for tn in range(num_tissues) for sn in range(num_samples)]

In [None]:
# simulate reads for all samples, num_threads worker processes at a time
pool = multiprocessing.Pool(processes=num_threads)
try:
    pool_outputs = pool.map(gen_sample, samples)
finally:
    # always release the workers, even when a task raised; otherwise the
    # processes stay alive in the notebook kernel
    pool.close()
    pool.join()
print('Pool:', pool_outputs)

In [10]:
def shuffle_combine(ts):
    """Shuffle the simulated reads of one sample and build the mixed read sets.

    ts: indexable pair (tissue number, sample number).

    Steps, for the sample tag t<tn>_s<sn> under out_dir:
      1. shuffle real.<tag>/sample_01.fasta into sample_01.shuffled.fasta;
      2. for each mixture (real_splicing, real_intronic, real_intergenic, all)
         create its directory, concatenate the member read files into
         <mixture>.<tag>/sample_01.fasta and shuffle that file;
      3. delete the unshuffled real reads and the unshuffled mixture files.

    Raises:
      FileNotFoundError: a per-type sample_01.fasta is missing. This is checked
        before anything is written, so re-running after step 3 already removed
        the real reads cannot overwrite earlier results with truncated data.
      subprocess.CalledProcessError: the shuffle script exited non-zero; no
        input is deleted in that case.

    Returns None.
    """
    import shutil  # imported locally: shutil is not among the notebook-level imports

    tn = ts[0]
    sn = ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")
    tag = ".t"+str(tn)+"_s"+str(sn)

    def reads(name):
        # unshuffled read file of one read set
        return out_dir+name+tag+"/sample_01.fasta"

    def shuffle(name):
        # check_call so that a failed shuffle stops the function before cleanup
        subprocess.check_call([shuffleReads_path,
                               reads(name),
                               out_dir+name+tag+"/sample_01.shuffled.fasta"])

    read_types = ("real", "splicing", "intronic", "intergenic")
    missing = [reads(t) for t in read_types if not os.path.isfile(reads(t))]
    if missing:
        raise FileNotFoundError("missing simulated reads: "+", ".join(missing))

    shuffle("real")

    # mixture name -> read sets concatenated into it, in this order
    mixtures = [("real_splicing", ("real", "splicing")),
                ("real_intronic", ("real", "intronic")),
                ("real_intergenic", ("real", "intergenic")),
                ("all", read_types)]
    for mix_name, members in mixtures:
        os.makedirs(out_dir+mix_name+tag, exist_ok=True)
        # byte-for-byte concatenation, no shell involved
        with open(reads(mix_name), "wb") as out_fp:
            for member in members:
                with open(reads(member), "rb") as in_fp:
                    shutil.copyfileobj(in_fp, out_fp)
        shuffle(mix_name)

    # Only the shuffled versions of the real and mixed reads are kept. The unshuffled
    # splicing / intronic / intergenic reads are left in place (their removal was
    # disabled in the original code).
    os.remove(reads("real"))
    for mix_name, _ in mixtures:
        os.remove(reads(mix_name))

In [11]:
# shuffle / combine reads for all samples, num_threads worker processes at a time
pool = multiprocessing.Pool(processes=num_threads)
try:
    pool_outputs = pool.map(shuffle_combine, samples)
finally:
    # always release the workers, even when a task raised; otherwise the
    # processes stay alive in the notebook kernel
    pool.close()
    pool.join()
print('Pool:', pool_outputs)


Tissue #0 - Sample #2

Tissue #0 - Sample #4



Tissue #0 - Sample #0

Tissue #0 - Sample #6



Tissue #0 - Sample #3


Tissue #0 - Sample #5


Tissue #0 - Sample #7


Tissue #0 - Sample #1


Tissue #0 - Sample #8


Tissue #1 - Sample #0


Tissue #1 - Sample #2


Tissue #1 - Sample #4


Tissue #1 - Sample #1


Tissue #0 - Sample #9


Tissue #1 - Sample #3


Tissue #1 - Sample #5


Tissue #1 - Sample #6


Tissue #1 - Sample #8


Tissue #2 - Sample #0


Tissue #2 - Sample #2


Tissue #1 - Sample #7


Tissue #1 - Sample #9


Tissue #2 - Sample #3


Tissue #2 - Sample #1


Tissue #2 - Sample #4


Tissue #2 - Sample #6


Tissue #2 - Sample #8


Tissue #2 - Sample #5


Tissue #2 - Sample #9


Tissue #2 - Sample #7

Pool: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [12]:
def combine_gffs(ts, base_dir=None):
    """Concatenate the sorted per-type annotations of one sample into mixed annotations.

    ts: indexable pair (tissue number, sample number).
    base_dir: path prefix of the <type>.t<tn>_s<sn>.sorted.gtf files; it is
        concatenated as-is, so it must end with a path separator. Defaults to
        the notebook-level out_dir.

    Writes real_splicing, real_intronic, real_intergenic and all
    (.t<tn>_s<sn>.sorted.gtf). "all" contains real, splicing, intronic AND
    intergenic, matching the members of the "all" read set built by
    shuffle_combine; the intergenic annotation used to be left out.

    Raises FileNotFoundError, before anything is written, when an input
    annotation is missing. Returns None.
    """
    import shutil  # imported locally: shutil is not among the notebook-level imports

    tn = ts[0]
    sn = ts[1]
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")
    prefix = out_dir if base_dir is None else base_dir
    tag = ".t"+str(tn)+"_s"+str(sn)+".sorted.gtf"

    def gtf_path(name):
        return prefix+name+tag

    read_types = ("real", "splicing", "intronic", "intergenic")
    missing = [gtf_path(t) for t in read_types if not os.path.isfile(gtf_path(t))]
    if missing:
        raise FileNotFoundError("missing sorted annotation: "+", ".join(missing))

    # combined name -> annotations concatenated into it, in this order
    combos = [("real_splicing", ("real", "splicing")),
              ("real_intronic", ("real", "intronic")),
              ("real_intergenic", ("real", "intergenic")),
              ("all", read_types)]
    for combo_name, members in combos:
        # byte-for-byte concatenation, no shell involved
        with open(gtf_path(combo_name), "wb") as out_fp:
            for member in members:
                with open(gtf_path(member), "rb") as in_fp:
                    shutil.copyfileobj(in_fp, out_fp)

In [13]:
# merge annotations for all samples, num_threads worker processes at a time
pool = multiprocessing.Pool(processes=num_threads)
try:
    pool_outputs = pool.map(combine_gffs, samples)
finally:
    # always release the workers, even when a task raised; otherwise the
    # processes stay alive in the notebook kernel
    pool.close()
    pool.join()
print('Pool:', pool_outputs)


Tissue #0 - Sample #6


Tissue #0 - Sample #0

Tissue #0 - Sample #2

Tissue #0 - Sample #4




Tissue #0 - Sample #3


Tissue #0 - Sample #1


Tissue #0 - Sample #7


Tissue #0 - Sample #5


Tissue #0 - Sample #8


Tissue #1 - Sample #0


Tissue #1 - Sample #2


Tissue #1 - Sample #4


Tissue #1 - Sample #1


Tissue #0 - Sample #9


Tissue #1 - Sample #3


Tissue #1 - Sample #5


Tissue #1 - Sample #6


Tissue #1 - Sample #8


Tissue #2 - Sample #0


Tissue #2 - Sample #2


Tissue #1 - Sample #7


Tissue #1 - Sample #9


Tissue #2 - Sample #1


Tissue #2 - Sample #3


Tissue #2 - Sample #4


Tissue #2 - Sample #6


Tissue #2 - Sample #8


Tissue #2 - Sample #5


Tissue #2 - Sample #7


Tissue #2 - Sample #9

Pool: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
