In [2]:
import multiprocessing
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# notebook-wide plotting defaults: 20x10 figure size and a serif font family
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [3]:
# declarations
# Root directory for the per-sample inputs that get_gene_res reads and the CSV
# files it writes. Paths are built by plain string concatenation, so the
# trailing "/" must be kept.
out_dir = "/ccb/salz8-1/avaraby/tx_noise/analysis_21042020/"

# number of tissue and sample indices; every (tissue, sample) pair in
# range(num_tissues) x range(num_samples) is processed
num_tissues = 3
num_samples = 10

# number of worker processes in the multiprocessing pool
num_processes = 4

# column names applied to the header-less, tab-separated GTF files on load
gff3cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

# bcbioRnaseq - both salmon and kallisto can be aggregated to the gene-level
# https://github.com/bcbio/bcbio-nextgen/issues/2077

In [4]:
def _count_reads_per_transcript(fasta_path):
    """Count FASTA records per transcript id.

    Header lines (those starting with ">") are expected to look like
    ``>READ/TID;...``: the transcript id is the text between the first "/"
    and the following ";".

    Returns a DataFrame with columns ``tid`` and ``nr`` (number of reads).
    """
    tx_nr = dict()
    with open(fasta_path,"r") as inFP:
        # Stream the file line by line: readlines() would materialise the
        # whole read file in memory in every worker process.
        for line in inFP:
            if line.startswith(">"):
                tid = line.split("/")[1].split(";")[0]
                tx_nr[tid] = tx_nr.get(tid,0)+1
    return pd.DataFrame(list(tx_nr.items()),columns=["tid","nr"])


def _load_tx2gene(gtf_path,label):
    """Load the transcript records of a GTF file as a gid/tid/type table.

    The file is read as header-less, tab-separated text using the
    module-level ``gff3cols`` column names. ``label`` is stored verbatim in
    the ``type`` column of every returned row.
    """
    gtf = pd.read_csv(gtf_path,sep="\t",names=gff3cols)
    attrs = gtf.loc[gtf["type"]=="transcript","attributes"]
    # str.extract yields NaN when an attribute is missing and an empty result
    # when the file has no transcript rows; the previous split(...)[1] lookup
    # raised KeyError in the latter case.
    return pd.DataFrame({
        "gid": attrs.str.extract(r'gene_id "([^"]*)',expand=False),
        "tid": attrs.str.extract(r'transcript_id "([^"]*)',expand=False),
        "type": label,
    }).reset_index(drop=True)


def _load_gene_level_res(res_path):
    """Load a per-transcript ``.res`` CSV and sum its read counts per gene.

    The gene id is rebuilt as ``"CHS."`` plus the second dot-separated field
    of ``tid``. Returns the columns ``gid``, ``sim_nreads``, ``strg_nreads``,
    ``slmn_nreads`` and ``klst_nreads`` with one row per gene.
    """
    res = pd.read_csv(res_path)
    res["gid"] = "CHS."+res["tid"].str.split(".",expand=True)[1]
    res = res[["gid","sim_nreads","strg_nreads","slmn_nreads","klst_nreads"]]
    return res.groupby(by="gid").sum().reset_index()


def get_gene_res(ts):
    """Build the gene-level comparison tables for one (tissue, sample) pair.

    Parameters
    ----------
    ts : sequence
        ``ts[0]`` is the tissue index and ``ts[1]`` the sample index; both are
        only used to build file names of the form ``<name>.t<tn>_s<sn>``.

    Inputs (all under the module-level ``out_dir``)
    ------
    * ``all.t<tn>_s<sn>/sample_01.shuffled.fasta`` - simulated reads
    * ``{real,splicing,intronic,intergenic}.t<tn>_s<sn>.gtf`` - transcripts;
      "real" transcripts are labelled ``real``, the other three ``noise``
    * ``{real,all}.t<tn>_s<sn>.res`` - per-transcript read counts with the
      columns ``tid``, ``sim_nreads``, ``strg_nreads``, ``slmn_nreads``,
      ``klst_nreads``

    Outputs (CSV files written to ``out_dir``)
    -------
    * ``fp_<tool>_<mode>`` / ``fn_<tool>_<mode>``: genes with zero simulated
      reads but a positive estimate, and genes with simulated reads but a
      zero estimate, for tool in strg/slmn/klst and mode in real/all
    * ``<tool>_<mode>``: ``gid``, ``frac_real`` (rounded to one decimal) and
      ``fold`` = (estimated - simulated) / simulated, for genes where both
      counts are positive

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If reads refer to a transcript missing from the GTFs, the two result
        files disagree on the gene set or on ``sim_nreads``, or an evaluated
        gene has no simulated-read fraction.
    """
    tn = ts[0]
    sn = ts[1]
    tag = ".t"+str(tn)+"_s"+str(sn)
    print("\n=================\nTissue #"+str(tn)+" - Sample #"+str(sn)+"\n=================\n")

    # first we need to get the number of simulated reads per transcript
    counts = _count_reads_per_transcript(out_dir+"all"+tag+"/sample_01.shuffled.fasta")
    print("loaded fasta")

    # now need to load the GTFs to link every transcript to its gene and group
    tx2gene = pd.concat([_load_tx2gene(out_dir+"real"+tag+".gtf","real"),
                         _load_tx2gene(out_dir+"splicing"+tag+".gtf","noise"),
                         _load_tx2gene(out_dir+"intronic"+tag+".gtf","noise"),
                         _load_tx2gene(out_dir+"intergenic"+tag+".gtf","noise")],axis=0)
    print("loaded gtf")

    # now need to add counts to the transcripts
    all_df = tx2gene.merge(counts,how="outer",indicator=True,on="tid")
    assert len(all_df[all_df["_merge"]=="right_only"])==0,"unidentified transcripts found"
    # dropna removes the transcripts that received no simulated reads
    all_df = all_df.dropna(axis=0).drop(["_merge","tid"],axis=1)
    print("added counts")

    # aggregate the read counts by gene and type, then put the real and noise
    # totals of each gene side by side
    all_df = all_df.groupby(by=["gid","type"]).sum().reset_index()
    real = all_df.loc[all_df["type"]=="real",["gid","nr"]].rename(columns={"nr":"nr_real"})
    noise = all_df.loc[all_df["type"]=="noise",["gid","nr"]].rename(columns={"nr":"nr_noise"})
    all_df = real.merge(noise,how="outer",on="gid").fillna(0)
    all_df["nr_total"] = all_df["nr_real"]+all_df["nr_noise"]
    all_df["frac_real"] = all_df["nr_real"]/all_df["nr_total"]
    print("aggregated")

    # now need to load the results, aggregated to the gene level
    res_real = _load_gene_level_res(out_dir+"real"+tag+".res")
    res_real.columns = ["gid","sim_nreads","strg_nreads_real","slmn_nreads_real","klst_nreads_real"]
    res_all = _load_gene_level_res(out_dir+"all"+tag+".res")
    res_all.columns = ["gid","sim_nreads_all","strg_nreads_all","slmn_nreads_all","klst_nreads_all"]

    # now we can combine both runs; they must describe the same genes and reads
    res = res_real.merge(res_all,how="outer",on="gid",indicator=True)
    assert len(res[res["_merge"]=="both"])==len(res),"unidentified genes"
    assert len(res[res["sim_nreads"]==res["sim_nreads_all"]])==len(res),"non-matching number of reads"
    res = res.drop(["sim_nreads_all","_merge"],axis=1)
    print("combined")

    tools = ("strg","slmn","klst")
    modes = ("real","all")
    simulated = res["sim_nreads"]>0
    not_simulated = res["sim_nreads"]==0

    # now we can separate false positives and false negatives
    for mode in modes:
        for tool in tools:
            estimate = res[tool+"_nreads_"+mode]
            fp = res[not_simulated&(estimate>0)].reset_index(drop=True)
            fn = res[simulated&(estimate==0)].reset_index(drop=True)
            fp.to_csv(out_dir+"fp_"+tool+"_"+mode+tag+".csv",index=False)
            fn.to_csv(out_dir+"fn_"+tool+"_"+mode+tag+".csv",index=False)
    print("saved")

    # now we need to add fractions
    gene_fracs = all_df[["gid","frac_real"]]
    tables = dict()
    for mode in modes:
        for tool in tools:
            col = tool+"_nreads_"+mode
            tbl = res.loc[simulated&(res[col]>0),["gid","sim_nreads",col]].reset_index(drop=True)
            # The merge is outer so the output rows stay ordered by gid; genes
            # that only have a fraction get NaN counts and are dropped below.
            tbl = tbl.merge(gene_fracs,how="outer",on="gid",indicator=True)
            assert len(tbl[tbl["_merge"]=="left_only"])==0,"unidentified transcripts"
            tables[(tool,mode)] = tbl.drop("_merge",axis=1)
    print("got fractions")

    # compute fold change
    for tool,mode in list(tables):
        col = tool+"_nreads_"+mode
        tbl = tables[(tool,mode)]
        tbl["fold"] = (tbl[col]-tbl["sim_nreads"])/tbl["sim_nreads"]
        tbl = tbl.drop(["sim_nreads",col],axis=1).dropna(axis=0)
        tables[(tool,mode)] = tbl.round({'frac_real':1})
    print("computed fold and saving")

    # Files are written only after every table passed its assertion.
    for (tool,mode),tbl in tables.items():
        tbl.to_csv(out_dir+tool+"_"+mode+tag+".csv",index=False)

In [5]:
# every (tissue, sample) index pair, enumerated tissue by tissue
samples = [(tn,sn) for tn in range(num_tissues) for sn in range(num_samples)]

In [6]:
# Run get_gene_res for every (tissue, sample) pair on num_processes workers.
# The context manager shuts the workers down even when a worker raises (for
# example a failed assertion), so a failed run does not leave orphaned
# processes attached to the notebook kernel. pool.map only returns once every
# task has finished, so no work is cut short on exit.
with multiprocessing.Pool(processes=num_processes) as pool:
    pool_outputs = pool.map(get_gene_res, samples)
print('Pool:', pool_outputs)


Tissue #0 - Sample #2

Tissue #0 - Sample #4

Tissue #0 - Sample #0

Tissue #0 - Sample #6




loaded fasta
loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #0 - Sample #3

loaded fasta
loaded gtf
added counts
aggregated
loaded fasta
combined
saved
got fractions
computed fold and saving

Tissue #0 - Sample #5

loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #0 - Sample #7

loaded fasta
loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #0 - Sample #1

loaded fasta
loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #0 - Sample #8

loaded fasta
loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #1 - Sample #0

loaded fasta
loaded gtf
added counts
aggregated
combined
saved
got fractions
computed fold and saving

Tissue #1 - Sample #2

loaded fasta
loaded fa