In [1]:
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [None]:
# 1. begin by assigning TPMs (mean and std) to each transcript
# 2. then select the number of transcripts per sample locus based not on a mean and std but on the overall distribution (loc5?)
# 3. then, for each sample, the TPMs will already be known and we simply need to output np.random.normal()

In [2]:
# ---- notebook-wide configuration ----

# number of tissues; the per-tissue loops iterate over range(num_tissues)
num_tissues = 3

# GFF3 column names, in file order
gff3cols = [
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]

# Directories. Every path keeps its trailing "/" because file names are
# appended by plain string concatenation further down.
base_dir_data = "/ccb/salz8-1/avaraby/tx_noise/data/"
# res.sample_gauss is read from here
base_dir_out = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10/GTEx_aggs/"
# stage1_* inputs are read from here and stage2_* outputs are written here
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_t3_s10/sim_samples/"

In [3]:
# Columns that hold ";"-separated per-transcript values (one entry per transcript).
# They are read as str on purpose: with default type inference read_csv raised a
# DtypeWarning (mixed types), and any entry parsed as a number (e.g. a lone "0")
# is turned into NaN by the later .str.split(";") calls instead of a 1-element list.
tpm_list_cols = [kind+"_"+stat+"_tpm"
                 for kind in ("real","splicing","intronic","polymerase")
                 for stat in ("mean","sd")]
gauss = pd.read_csv(base_dir_out+"res.sample_gauss",
                    dtype={col:str for col in tpm_list_cols})

# sanity check: for every transcript type the "<type>_mean" column must not
# exceed the "<type>_num" column in any row
for kind in ("real","splicing","intronic","polymerase"):
    n_bad = int((gauss[kind+"_mean"]>gauss[kind+"_num"]).sum())
    assert n_bad==0,"incorrect numbers in "+kind

# rows with at least one real transcript: keep the real/splicing/intronic columns
gauss_real = gauss[(gauss["real_num"]>0)][["real_num",
                                            "total_real_num",
                                            "real_mean_tpm",
                                            "real_sd_tpm",
                                            "splicing_num",
                                            "total_splicing_num",
                                            "splicing_mean_tpm",
                                            "splicing_sd_tpm",
                                            "intronic_num",
                                            "total_intronic_num",
                                            "intronic_mean_tpm",
                                            "intronic_sd_tpm"]].reset_index(drop=True)
print("number of gaussians for real loci: "+str(len(gauss_real)))

# rows with at least one polymerase transcript: keep the polymerase columns only
gauss_pol = gauss[gauss["polymerase_num"]>0][["polymerase_num",
                                              "total_polymerase_num",
                                              "polymerase_mean_tpm",
                                              "polymerase_sd_tpm"]].reset_index(drop=True)
print("number of gaussians for noise loci: "+str(len(gauss_pol)))
gauss_pol.head()

  interactivity=interactivity, compiler=compiler, result=result)


number of gaussians for real loci: 754393
number of gaussians for noise loci: 1331711


Unnamed: 0,polymerase_num,total_polymerase_num,polymerase_mean_tpm,polymerase_sd_tpm
0,1,1,0.312725,0
1,2,4,1.59298;0.439877,0;0
2,2,6,1.65269;0.803662,0.679609;0
3,3,23,0.467845;0.234847;0.891467,0;0;0
4,2,11,0.519612;0.212749,0;0


In [4]:
for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    # stage-1 table for this tissue; the "tid" and "lid" columns are used below
    pol_baseDF_sub1 = pd.read_csv(out_dir+"stage1_tissue_locs.pol_t"+str(tissue_num))

    print("starting number of intergenic transcripts: "+str(len(pol_baseDF_sub1["tid"])))
    print("starting number of intergenic genes: "+str(len(set(pol_baseDF_sub1["lid"]))))

    # first thing we need to attach tpm means and standard deviations to each of
    # the tissue loci, based on the number of transcripts at the locus.
    # Collapse to one row per locus: the list of its tids and how many there are.
    # Named aggregation replaces the nested-dict renaming form of .agg, which
    # raised a FutureWarning here and was removed in pandas 1.0.
    pol_g_lid = pol_baseDF_sub1.groupby(by="lid").agg(tids_pol=("tid",list),
                                                      count_pol=("tid","count")).reset_index()
    # columns are now: lid, tids_pol, count_pol

    # now attach expression values based on exact match of the number of transcripts
    num_dropped = 0
    def cond_merge_pol(g,df):
        """Attach one randomly chosen row of `df` to every locus in group `g`.

        g  -- loci that all share the same "count_pol" value
        df -- candidate rows; only those whose "total_polymerase_num" equals
              that count are eligible

        Returns `g` with the sampled rows concatenated column-wise (sampling is
        without replacement, so no candidate row is used twice within a group).
        Returns None when there are fewer eligible rows than loci; in that case
        the whole group is discarded and the global `num_dropped` is increased
        by the number of loci lost.
        """
        global num_dropped
        nt = int(g["count_pol"].iloc[0])
        sub = df[df["total_polymerase_num"]==nt]
        if len(sub)<len(g):
            print(g["lid"].iloc[0],nt,len(sub),len(g))
            # the counter is reported as "loci without a match", and every locus
            # of this group is lost, so count loci rather than groups
            num_dropped+=len(g)
            return None
        sub = sub.sample(n=len(g),replace=False).reset_index(drop=True)
        # both sides carry a fresh 0..n-1 index, so concat pairs them row by row
        g2 = pd.concat([g.reset_index(drop=True),sub],axis=1)
        assert len(g2)==len(g),"uneven length"
        return g2

    # match every group of loci with the same transcript count against gauss_pol;
    # groups for which cond_merge_pol returns nothing do not appear in the result
    pol_by_count = pol_g_lid.groupby('count_pol')
    pol_g_lid = pol_by_count.apply(cond_merge_pol,gauss_pol).reset_index(drop=True)
    n_pol_loci = len(pol_g_lid)
    print("number of polymerase tissue loci: "+str(n_pol_loci))
    print("number of loci without a match: "+str(num_dropped))

    # subsample the transcripts of each locus down to "polymerase_num" of them
    def get_n_tx(row):
        """Draw row["polymerase_num"] transcript ids from row.tids_pol.

        The draw uses np.random.choice without replacement, so the returned
        array never contains the same tid twice.
        """
        candidates = row.tids_pol
        n_keep = row["polymerase_num"]
        return np.random.choice(candidates,n_keep,replace=False)

    # replace the full tid list of each locus with the sampled subset
    pol_g_lid["tids_pol"] = pol_g_lid.apply(get_n_tx,axis=1)

    # ";"-separated strings -> lists, one entry per kept transcript
    pol_g_lid["polymerase_mean_tpm"] = pol_g_lid.polymerase_mean_tpm.str.split(";")
    pol_g_lid["polymerase_sd_tpm"] = pol_g_lid.polymerase_sd_tpm.str.split(";")

    # Explode the list-valued columns into one row per transcript.
    # DataFrame.apply hands over one *column* at a time here: each list is spread
    # over positional columns by pd.Series and stacked back into a Series indexed
    # by (lid, position). The position level comes back from reset_index as
    # "level_1" and is dropped. drop(columns=...) is used because passing the
    # axis positionally is deprecated and no longer accepted by pandas 2.x.
    pol_g_lid = pol_g_lid[["lid",
                           "tids_pol",
                           "polymerase_mean_tpm",
                           "polymerase_sd_tpm"]].set_index('lid').apply(lambda col: col.apply(pd.Series).stack()).reset_index().drop(columns='level_1')
    pol_g_lid.columns= ["lid","tid_polymerase","polymerase_mean_tpm","polymerase_sd_tpm"]
    pol_g_lid.to_csv(out_dir+"stage2_tid_lid_exp.pol_t"+str(tissue_num),index=False)
    print("total number of polymerase transcripts: "+str(len(pol_g_lid)))


Tissue #0

starting number of intergenic transcripts: 148704
starting number of intergenic genes: 29016


in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


number of polymerase tissue loci: 29016
number of loci without a match: 0
total number of polymerase transcripts: 42114

Tissue #1

starting number of intergenic transcripts: 182621
starting number of intergenic genes: 39563
number of polymerase tissue loci: 39563
number of loci without a match: 0
total number of polymerase transcripts: 54678

Tissue #2

starting number of intergenic transcripts: 123172
starting number of intergenic genes: 23659
number of polymerase tissue loci: 23659
number of loci without a match: 0
total number of polymerase transcripts: 33552


In [6]:
for tissue_num in range(num_tissues):
    print("\n=================\nTissue #"+str(tissue_num)+"\n=================\n")
    # stage-1 tables for this tissue; the "tid" and "lid" columns are used below
    real_baseDF_sub1 = pd.read_csv(out_dir+"stage1_tissue_locs.real_t"+str(tissue_num))
    splice_baseDF_sub1 = pd.read_csv(out_dir+"stage1_tissue_locs.splice_t"+str(tissue_num))
    int_baseDF_sub1 = pd.read_csv(out_dir+"stage1_tissue_locs.int_t"+str(tissue_num))

    print("starting number of real transcripts: "+str(len(real_baseDF_sub1["tid"])))
    print("starting number of real genes: "+str(len(set(real_baseDF_sub1["lid"]))))

    print("starting number of splicing transcripts: "+str(len(splice_baseDF_sub1["tid"])))
    print("starting number of splicing genes: "+str(len(set(splice_baseDF_sub1["lid"]))))

    print("starting number of intronic transcripts: "+str(len(int_baseDF_sub1["tid"])))
    print("starting number of intronic genes: "+str(len(set(int_baseDF_sub1["lid"]))))

    # and we should group them not separately, but by performing a groupby method jointly between real,splicing and intronic
    # One row per locus for each transcript type: the list of tids and their count.
    # Named aggregation replaces the nested-dict renaming form of .agg, which is
    # deprecated and was removed in pandas 1.0; it also names the columns directly.
    real_g_lid = real_baseDF_sub1.groupby(by="lid").agg(tids_real=("tid",list),
                                                        count_real=("tid","count")).reset_index()

    splice_g_lid = splice_baseDF_sub1.groupby(by="lid").agg(tids_splice=("tid",list),
                                                            count_splice=("tid","count")).reset_index()

    int_g_lid = int_baseDF_sub1.groupby(by="lid").agg(tids_int=("tid",list),
                                                      count_int=("tid","count")).reset_index()

    # left joins: every real locus is kept; loci without splicing/intronic
    # transcripts get NaN tid lists and a count of 0
    all_g_lid = real_g_lid.merge(splice_g_lid,how="left",on="lid")
    all_g_lid = all_g_lid.merge(int_g_lid,how="left",on="lid")
    all_g_lid["count_splice"] = all_g_lid["count_splice"].fillna(0)
    all_g_lid["count_int"] = all_g_lid["count_int"].fillna(0)

    print("total number of real transcripts: "+str(all_g_lid["count_real"].sum()))
    print("total number of splicing transcripts: "+str(all_g_lid["count_splice"].sum()))
    print("total number of intronic transcripts: "+str(all_g_lid["count_int"].sum()))

    # now attach expression values based on exact match of the number of transcripts
    num_dropped = 0
    def cond_merge_real(g,df):
        """Attach one randomly chosen row of `df` to every locus in group `g`.

        g  -- loci that all share the same ("count_real", "count_splice",
              "count_int") triple
        df -- candidate rows; only those whose "total_real_num",
              "total_splicing_num" and "total_intronic_num" equal that triple
              are eligible

        Returns `g` with the sampled rows concatenated column-wise (sampling is
        without replacement, so no candidate row is used twice within a group).
        Returns None when there are fewer eligible rows than loci; in that case
        the whole group is discarded and the global `num_dropped` is increased
        by the number of loci lost.
        """
        global num_dropped
        nt_real = int(g["count_real"].iloc[0])
        nt_splicing = int(g["count_splice"].iloc[0])
        nt_intronic = int(g["count_int"].iloc[0])
        sub = df[(df["total_real_num"]==nt_real)&
                 (df["total_splicing_num"]==nt_splicing)&
                 (df["total_intronic_num"]==nt_intronic)]
        if len(sub)<len(g):
            print(g["lid"].iloc[0],nt_real,nt_splicing,nt_intronic)
            # the counter is reported as "real loci without a match", and every
            # locus of this group is lost, so count loci rather than groups
            num_dropped+=len(g)
            return None
        sub = sub.sample(n=len(g),replace=False).reset_index(drop=True)
        # both sides carry a fresh 0..n-1 index, so concat pairs them row by row
        g2 = pd.concat([g.reset_index(drop=True),sub],axis=1)
        assert len(g2)==len(g),"uneven length"
        return g2

    # match every group of loci sharing the same (real, splice, intronic) counts
    # against gauss_real; groups for which cond_merge_real returns nothing do
    # not appear in the result
    loci_by_counts = all_g_lid.groupby(["count_real","count_splice","count_int"])
    all_g_lid = loci_by_counts.apply(cond_merge_real,gauss_real).reset_index(drop=True)
    n_real_loci = len(all_g_lid)
    print("number of real tissue loci: "+str(n_real_loci))
    print("number of real loci without a match: "+str(num_dropped))

    # subsample the transcripts of each locus, separately for every transcript type
    def get_n_tx(row):
        """Sample the real, splicing and intronic transcript ids of one locus.

        For each type, row["<type>_num"] ids are drawn from the locus' tid list
        with np.random.choice (without replacement). A type whose count column
        is 0 yields an empty list and its tid list is not sampled.

        Returns a 3-tuple: (real tids, splicing tids, intronic tids).
        """
        type_columns = (("tids_real","count_real","real_num"),
                        ("tids_splice","count_splice","splicing_num"),
                        ("tids_int","count_int","intronic_num"))
        sampled = []
        for tids_col,count_col,num_col in type_columns:
            tids = row[tids_col]
            if row[count_col]==0:
                sampled.append([])
            else:
                sampled.append(np.random.choice(tids,row[num_col],replace=False))
        return tuple(sampled)

    # replace the full tid lists of each locus with the sampled subsets
    all_g_lid[["tids_real","tids_splice","tids_int"]] = all_g_lid.apply(get_n_tx,axis=1,result_type="expand")

    # ";"-separated strings -> lists, one entry per kept transcript
    for tpm_col in ("real_mean_tpm","real_sd_tpm",
                    "splicing_mean_tpm","splicing_sd_tpm",
                    "intronic_mean_tpm","intronic_sd_tpm"):
        all_g_lid[tpm_col] = all_g_lid[tpm_col].str.split(";")

    # Explode the list-valued columns into one row per list position.
    # DataFrame.apply hands over one *column* at a time here: each list is spread
    # over positional columns by pd.Series and stacked back into a Series indexed
    # by (lid, position); the stacked columns are then aligned on that index, so
    # shorter lists leave NaN in the remaining positions. The position level
    # comes back from reset_index as "level_1" and is dropped. drop(columns=...)
    # is used because passing the axis positionally is deprecated and no longer
    # accepted by pandas 2.x.
    all_g_lid = all_g_lid[["lid",
                           "tids_real",
                           "tids_splice",
                           "tids_int",
                           "real_mean_tpm",
                           "real_sd_tpm",
                           "splicing_mean_tpm",
                           "splicing_sd_tpm",
                           "intronic_mean_tpm",
                           "intronic_sd_tpm"]].set_index('lid').apply(lambda col: col.apply(pd.Series).stack()).reset_index().drop(columns='level_1')
    # Split the combined table into one table per transcript type. Each keeps
    # only the rows that actually hold a tid of that type and is written to its
    # own stage-2 file.

    # real transcripts
    real_g_lid = all_g_lid[["lid","tids_real","real_mean_tpm","real_sd_tpm"]]
    real_g_lid = real_g_lid.rename(columns={"tids_real":"tid_real"})
    real_g_lid = real_g_lid[real_g_lid["tid_real"].notna()]
    real_g_lid.to_csv(out_dir+"stage2_tid_lid_exp.real_t"+str(tissue_num),index=False)

    # splicing transcripts
    splice_g_lid = all_g_lid[["lid","tids_splice","splicing_mean_tpm","splicing_sd_tpm"]]
    splice_g_lid = splice_g_lid.rename(columns={"tids_splice":"tid_splicing"})
    splice_g_lid = splice_g_lid[splice_g_lid["tid_splicing"].notna()]
    splice_g_lid.to_csv(out_dir+"stage2_tid_lid_exp.splice_t"+str(tissue_num),index=False)

    # intronic transcripts
    int_g_lid = all_g_lid[["lid","tids_int","intronic_mean_tpm","intronic_sd_tpm"]]
    int_g_lid = int_g_lid.rename(columns={"tids_int":"tid_intronic"})
    int_g_lid = int_g_lid[int_g_lid["tid_intronic"].notna()]
    int_g_lid.to_csv(out_dir+"stage2_tid_lid_exp.int_t"+str(tissue_num),index=False)

    # per-type summary: rows are transcripts, distinct lids are genes
    n_real_genes = len(set(real_g_lid["lid"]))
    print("total number of real transcripts: "+str(len(real_g_lid)))
    print("total number of real genes: "+str(n_real_genes))

    n_splice_genes = len(set(splice_g_lid["lid"]))
    print("total number of splicing transcripts: "+str(len(splice_g_lid)))
    print("total number of splicing genes: "+str(n_splice_genes))

    n_int_genes = len(set(int_g_lid["lid"]))
    print("total number of intronic transcripts: "+str(len(int_g_lid)))
    print("total number of intronic genes: "+str(n_int_genes))


Tissue #0

starting number of real transcripts: 234697
starting number of real genes: 31218
starting number of splicing transcripts: 8916670
starting number of splicing genes: 30323
starting number of intronic transcripts: 4507952
starting number of intronic genes: 21180
total number of real transcripts: 234697
total number of splicing transcripts: 8916670.0
total number of intronic transcripts: 4507952.0
number of real tissue loci: 31218
number of real loci without a match: 0
total number of real transcripts: 123950
total number of real genes: 31218
total number of splicing transcripts: 977436
total number of splicing genes: 25798
total number of intronic transcripts: 351038
total number of intronic genes: 14499

Tissue #1

starting number of real transcripts: 223113
starting number of real genes: 29592
starting number of splicing transcripts: 8510062
starting number of splicing genes: 28777
starting number of intronic transcripts: 4339048
starting number of intronic genes: 20000
tot