In [1]:
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [2]:
# declarations
# Filesystem locations for this analysis run. Only out_dir is used by the
# coverage-conversion cell below; it holds the per-sample .gtf/.exp inputs
# and receives the .cov outputs.
base_dir_data = "/ccb/salz8-1/avaraby/tx_noise/data/"
base_dir_out = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/GTEx_aggs/"
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10_26022020/sim_samples/"

# Simulation layout: the loops below iterate over range(num_tissues) and,
# inside each tissue, range(num_samples).
num_tissues, num_samples = 3, 10

# Read length; used as a multiplier when turning read counts into coverage.
readlen = 101

# Column names applied to the header-less, tab-separated annotation files.
gff3cols = "seqid source type start end score strand phase attributes".split()

In [3]:
# Polyester is driven by per-transcript coverage, whereas the simulated
# expression files hold TPM values, so a TPM -> coverage conversion is needed.
# The conversion follows the TPM -> number-of-reads relationship of "REM"
# (NOTE(review): presumably RSEM - confirm).

# Load the empirical distribution that per-sample read totals are drawn from.
# Despite its name, the "readlen" column is used below as the number of reads
# per sample (its mean/std parameterise the draw of total_nreads).
readlen_stats = pd.read_csv(
    "/ccb/salz8-1/avaraby/tx_noise/readlen.stats",
    usecols=["readlen"],
)

In [4]:
def _gtf_attribute(attributes, key):
    """Extract the quoted value of one key from a GTF attributes column.

    Parameters:
        attributes: pandas Series of attribute strings (9th GTF column).
        key: attribute name, e.g. "transcript_id" or "gene_id".

    Returns:
        Series with the text between the double quotes that follow
        `key` + space (first occurrence); NaN where the key is absent.
    """
    return attributes.str.extract(key+" \"([^\"]*)\"",expand=False)


def _load_transcript_table(prefix):
    """Load one <prefix>.gtf / <prefix>.exp pair into a per-transcript table.

    Parameters:
        prefix: path without extension; prefix+".gtf" is a header-less,
            tab-separated annotation and prefix+".exp" holds one TPM value
            per line, in the same order as the transcript records of the GTF.

    Returns:
        DataFrame with columns tid, gid, elen, tpm - one row per "transcript"
        record, in the order those records appear in the GTF. elen is the sum
        of the transcript's exon lengths.

    Raises:
        AssertionError: if the transcript records and the exon records do not
            cover the same set of transcript ids, or if the number of TPM
            values differs from the number of transcripts.
    """
    gtf = pd.read_csv(prefix+".gtf",sep="\t",names=gff3cols)
    gtf["tid"] = _gtf_attribute(gtf["attributes"],"transcript_id")
    gtf["gid"] = _gtf_attribute(gtf["attributes"],"gene_id")

    # transcript records define the output order (it must match the .exp file)
    transcripts = gtf.loc[gtf["type"]=="transcript",["tid","gid"]].reset_index(drop=True)

    exons = gtf.loc[gtf["type"]=="exon",["tid","start","end"]]
    # GTF coordinates are 1-based and inclusive at both ends, so an exon spans
    # end-start+1 bases. Without the +1 every exon is one base short and a
    # single-base exon gets length 0 (-> 0/0 = NaN coverage further down).
    exons = exons.assign(elen=exons["end"]-exons["start"]+1)
    lengths = exons.groupby("tid").agg({"elen":"sum"}).reset_index()
    assert set(transcripts["tid"])==set(lengths["tid"]),"number of transcripts is not the same as number of groupped exons"

    table = transcripts.merge(lengths,how="left",on="tid")
    tpms = pd.read_csv(prefix+".exp",names=["tpm"])
    assert len(tpms)==len(table),"number of tpms different from the number of transcripts"
    # both frames carry a fresh 0..n-1 index, so this assignment is positional
    table["tpm"] = tpms["tpm"]
    return table


# the four transcript sources simulated per sample; each has its own
# .gtf/.exp input pair and receives its own .cov output
tx_sources = ["real","splicing","intronic","intergenic"]

# parameters of the per-sample read-count distribution (loop invariant)
nreads_mean = readlen_stats["readlen"].mean()
nreads_std = readlen_stats["readlen"].std()

for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    for sample_num in range(num_samples):
        print("++++++\n>Sample #"+str(sample_num)+"\n++++++\n")
        total_nreads = np.random.normal(nreads_mean,nreads_std)
        # a normal draw is unbounded below; a non-positive read count would
        # yield negative/zero coverage, so redraw until the value is usable
        while total_nreads<=0:
            total_nreads = np.random.normal(nreads_mean,nreads_std)
        print("number of reads in sample is: "+str(total_nreads))

        sample_suffix = ".t"+str(tissue_num)+"_s"+str(sample_num)

        # pool all sources: read fractions are normalised over every
        # transcript of the sample, not within each source separately
        joined = pd.concat([_load_transcript_table(out_dir+src+sample_suffix)[["tid","elen","tpm"]].assign(source=src) for src in tx_sources],axis=0).reset_index(drop=True)

        # theta: length-weighted abundance; cor: fraction of the sample's
        # sequenced bases attributed to each transcript
        joined["theta"] = joined["elen"]*joined["tpm"]
        denom = joined["theta"].sum()
        assert denom>0,"sum of length-weighted TPMs is not positive; cannot derive coverage"
        joined["cor"] = joined["theta"]/denom
        # coverage = bases assigned to the transcript / transcript length
        joined["cov"] = (joined["cor"]*total_nreads*readlen)/joined["elen"]

        # Rows of each source are still in their original GTF transcript
        # order, so select them by source label rather than re-merging on tid:
        # a tid shared between two sources would duplicate rows in a merge and
        # misalign the .cov file against the corresponding .exp/.gtf.
        for src in tx_sources:
            joined.loc[joined["source"]==src,["cov"]].to_csv(out_dir+src+sample_suffix+".cov",index=False,header=False)


Tissue #0

++++++
>Sample #0
++++++

number of reads in sample is: 101738610.05267522
++++++
>Sample #1
++++++

number of reads in sample is: 98569602.98587418
++++++
>Sample #2
++++++

number of reads in sample is: 137750021.2740215
++++++
>Sample #3
++++++

number of reads in sample is: 68749304.48132825
++++++
>Sample #4
++++++

number of reads in sample is: 115624138.52164288
++++++
>Sample #5
++++++

number of reads in sample is: 90354803.03818865
++++++
>Sample #6
++++++

number of reads in sample is: 117620128.12841591
++++++
>Sample #7
++++++

number of reads in sample is: 36547972.27686375
++++++
>Sample #8
++++++

number of reads in sample is: 180693854.62017268
++++++
>Sample #9
++++++

number of reads in sample is: 64676948.234055206

Tissue #1

++++++
>Sample #0
++++++

number of reads in sample is: 98500070.72957925
++++++
>Sample #1
++++++

number of reads in sample is: 76826927.76816139
++++++
>Sample #2
++++++

number of reads in sample is: 139347922.52277154
++++++
>