In [1]:
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import glob
import math
import csv
import sys
import os

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

In [2]:
# declarations
base_dir_data = "/ccb/salz8-1/avaraby/tx_noise/data/"
base_dir_out = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_25022020/GTEx_aggs/"
out_dir = "/ccb/salz8-1/avaraby/tx_noise/full_analysis_25022020/sim_samples/"
hg38_fa = "/home/avaraby1/genomes/human/hg38/hg38_p8.fa"

gff3cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
tmapcols=["ref_gene_id","ref_id","class_code","qry_gene_id","qry_id","num_exons","FPKM","TPM","cov","len","major_iso_id","ref_match_len"]

In [9]:
# first need to cluster the loci
cluster_cmd = "gffread """+base_dir_data+"""ALL.sub.gtf --cluster-only -T -o """+base_dir_data+"""ALL.merged.gtf"""
subprocess.call(cluster_cmd,shell=True)

# next run the comparison of ALL with CHESS
compare_cmd = "gffcompare -o """+base_dir_data+"""chess2ALL -r """+base_dir_data+"""chess2.2_assembly.gff """+base_dir_data+"""ALL.merged.gtf"""
subprocess.call(compare_cmd,shell=True)

# next we can isolate the intergenic noise
# get all the real loci codes
realgtf_cmd = """awk -F '\t|\"' 'FNR==NR{ids[$1]++;next} {if($10 in ids) print $0}' <(awk -v pat='=' -F '\t' '$3~pat {print $5}' """+base_dir_data+"""chess2ALL.ALL.merged.gtf.tmap) """+base_dir_data+"""ALL.merged.gtf > """+base_dir_data+"""real.gtf"""
subprocess.call(realgtf_cmd,shell=True)

# next get a list of all loci that contain real transcripts
realloci_cmd = """awk -F '\t|locus "' '$3=="transcript" {print $10}' """+base_dir_data+"""real.gtf | cut -d'"' -f1 | sort | uniq > """+base_dir_data+"""real.locs"""
subprocess.call(realloci_cmd,shell=True)

# extract all transcript IDs that are in the set of real loci
realtids_cmd = """grep -w -F -f """+base_dir_data+"""real.locs """+base_dir_data+"""ALL.merged.gtf | awk -F '\t|\"' '$3=="transcript" {print $10}' | sort | uniq > """+base_dir_data+"""real_loc.all_tids"""
subprocess.call(realtids_cmd.split(" "))

# extract all transcript IDs
alltids_cmd = """awk -F '\t|\"' '$3=="transcript" {print $10}' """+base_dir_data+"""ALL.merged.gtf | sort | uniq > """+base_dir_data+"""all.tids"""
subprocess.call(alltids_cmd,shell=True)

# get tids of potentially RNApol
poltids_cmd = """diff --new-line-format="" --unchanged-line-format=""  """+base_dir_data+"""all.tids """+base_dir_data+"""real_loc.all_tids > """+base_dir_data+"""RNApol.tids"""
subprocess.call(poltids_cmd,shell=True)

# get related RNApol tmap
poltmap_cmd = """grep -w -F -f """+base_dir_data+"""RNApol.tids """+base_dir_data+"""chess2ALL.ALL.merged.gtf.tmap > """+base_dir_data+"""RNApol.tmap"""
subprocess.call(poltmap_cmd.split(" "))

# next, everything that is not in this set is RNApol noise
polgtf_cmd = """awk -F '\t|\"' 'FNR==NR{ids[$1]++;next} {if($10 in ids) print $0}' <(awk -v pat='u' -F '\t' '$3~pat {print $5}' RNApol.tmap) ALL.merged.gtf > """+base_dir_data+"""RNApol.gtf"""
subprocess.call(polgtf_cmd,shell=True)

# next we need to separate out intronic and splicing noise
# first get a set of real tids
noise_cmd="""awk -F '\t|\"' '$3=="transcript" {print $10}' """+base_dir_data+"""real.gtf | sort | uniq > """+base_dir_data+"""real.tids"""
subprocess.call(noise_cmd,shell=True)

# next we can subtract this from the set of all tids in real locs
all_cmd = """diff --new-line-format="" --unchanged-line-format=""  """+base_dir_data+"""real_loc.all_tids """+base_dir_data+"""real.tids > """+base_dir_data+"""real_loc.nonreal_tids"""
subprocess.call(all_cmd,shell=True)

# now we can generate the corresponding tmap file
tmap_cmd = """grep -w -F -f """+base_dir_data+"""real_loc.nonreal_tids """+base_dir_data+"""chess2ALL.ALL.merged.gtf.tmap > """+base_dir_data+"""nonreal.tmap"""
subprocess.call(tmap_cmd,shell=True)

1

In [3]:
# now we can focus on figuring out how to extract real transcripts/loci
# and adding noise transcripts to them

# begin by extracting a list of chess IDs of the transcripts that match from ALL to CHESS
# and convert chess transcript IDs to gene IDs
chess_realloc_cmd = """awk -v pat='=' -F '\t' '$3~pat {print}' """+base_dir_data+"""chess2ALL.ALL.merged.gtf.tmap > """+base_dir_data+"real.chess.locs"
subprocess.call(chess_realloc_cmd,shell=True)

0

In [4]:
# extract a subset of CHESS comprised of these loci only - this will serve as a truth set

# first read in all the chess geneIDs
real_tm = pd.read_csv(base_dir_data+"real.chess.locs",names=tmapcols,sep="\t",usecols=["ref_id","qry_id"])
# only include those with CHS type IDs in the matching reference transcript
real_tm = real_tm[real_tm["ref_id"].str.contains("CHS\.[0-9]*\.[0-9]*")].reset_index(drop=True)
real_tm["gid"] = real_tm["ref_id"].str.split("\.",expand=True)[1].astype(int)
# convert chess GFF with matching transcript to GTF as real.gtf
assert len(real_tm[real_tm.duplicated("qry_id",keep="first")])==0,"duplicated ALL IDs in real"
real_tm.drop_duplicates("ref_id",keep="first",inplace=True)
assert len(real_tm[real_tm.duplicated("ref_id",keep="first")])==0,"duplicated CHESS IDs in real"
print("number of transcripts in real: "+str(len(set(real_tm["ref_id"]))))
real_tm.to_csv(base_dir_data+"real_chess2all.tgids",index=False)
real_tm.head()

number of transcripts in real: 87659


Unnamed: 0,ref_id,qry_id,gid
0,CHS.2.1,ALL_00012961,2
1,CHS.6.1,ALL_00000003,6
2,CHS.8.1,ALL_00012984,8
3,CHS.7.28,ALL_00012993,7
4,CHS.15.1,ALL_00013034,15


In [5]:
chessDF = pd.read_csv(base_dir_data+"chess2.2_assembly.gff",sep="\t",comment="#",names=gff3cols)
chess_types = [x for x in set(chessDF["type"]) if x not in ["gene","CDS"]]
print(chess_types)
chessDF = chessDF[chessDF["type"].isin(chess_types)].reset_index(drop=True)
chessDF["gid"] = chessDF["attributes"].str.split("CHS.",expand=True,n=1)[1].str.split("\.|;",expand=True,n=1)[0].astype(int)
chessDF["tid"] = np.where(chessDF["type"]!="exon",
            chessDF["attributes"].str.split("ID=",expand=True,n=1)[1].str.split(";",expand=True,n=1)[0],
            chessDF["attributes"].str.split("Parent=",expand=True,n=1)[1].str.split(";",expand=True,n=1)[0])
chessDF = chessDF[chessDF["gid"].isin(real_tm["gid"])].reset_index(drop=True)
chessDF = chessDF.merge(real_tm,left_on="tid",right_on="ref_id",how="right",indicator=True)
assert len(chessDF[chessDF["_merge"]=="both"])==len(chessDF),"not all are merged for real"
assert len(real_tm["ref_id"])==len(set(chessDF["tid"])),"length of the dataframe has changed in real"
chessDF["attributes"] = "transcript_id \""+chessDF["tid"]+\
                        "\"; gene_id \"CHS."+chessDF["gid_x"].astype(str)+\
                        "\"; old_transcript_id_old \""+chessDF["qry_id"]+"\";"
chessDF[gff3cols].to_csv(base_dir_data+"chess_real_matches.gtf",header=False,index=False,sep="\t",quoting=csv.QUOTE_NONE)
chessDF

['rRNA', 'exon', 'transcript', 'tRNA']


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gid_x,tid,ref_id,qry_id,gid_y,_merge
0,chr1,BestRefSeq,transcript,14362,29370,.,-,.,"transcript_id ""CHS.2.1""; gene_id ""CHS.2""; old_...",2,CHS.2.1,CHS.2.1,ALL_00012961,2,both
1,chr1,BestRefSeq,exon,14362,14829,.,-,.,"transcript_id ""CHS.2.1""; gene_id ""CHS.2""; old_...",2,CHS.2.1,CHS.2.1,ALL_00012961,2,both
2,chr1,BestRefSeq,exon,14970,15038,.,-,.,"transcript_id ""CHS.2.1""; gene_id ""CHS.2""; old_...",2,CHS.2.1,CHS.2.1,ALL_00012961,2,both
3,chr1,BestRefSeq,exon,15796,15947,.,-,.,"transcript_id ""CHS.2.1""; gene_id ""CHS.2""; old_...",2,CHS.2.1,CHS.2.1,ALL_00012961,2,both
4,chr1,BestRefSeq,exon,16607,16765,.,-,.,"transcript_id ""CHS.2.1""; gene_id ""CHS.2""; old_...",2,CHS.2.1,CHS.2.1,ALL_00012961,2,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008298,chrY,BestRefSeq,exon,56954255,56954656,.,+,.,"transcript_id ""CHS.59383.1""; gene_id ""CHS.5938...",59383,CHS.59383.1,CHS.59383.1,ALL_00441799,59383,both
1008299,chrY,BestRefSeq,exon,56960286,56968979,.,+,.,"transcript_id ""CHS.59383.1""; gene_id ""CHS.5938...",59383,CHS.59383.1,CHS.59383.1,ALL_00441799,59383,both
1008300,chrY,Gnomon,transcript,57201135,57203577,.,-,.,"transcript_id ""CHS.59386.1""; gene_id ""CHS.5938...",59386,CHS.59386.1,CHS.59386.1,ALL_00441903,59386,both
1008301,chrY,Gnomon,exon,57201135,57202145,.,-,.,"transcript_id ""CHS.59386.1""; gene_id ""CHS.5938...",59386,CHS.59386.1,CHS.59386.1,ALL_00441903,59386,both


In [6]:
# next we need to perform the same operation for the splicing noise
# first need to also retrieve chess IDs
chess_spliceloc_cmd = """awk -v pat='c|k|m|n|j|e' -F '\t' '$3~pat {print}' """+base_dir_data+"""nonreal.tmap > """+base_dir_data+"splice.chess.locs"
subprocess.call(chess_spliceloc_cmd,shell=True)

0

In [7]:
# now need to obtain a mapping from chess to ALL for the splicing noise transcripts
sp_tm = pd.read_csv(base_dir_data+"splice.chess.locs",names=tmapcols,sep="\t",usecols=["ref_id","qry_id"])
# only include those that belong to valid transcripts in CHESS
sp_tm = sp_tm[sp_tm["ref_id"].isin(real_tm["ref_id"])].reset_index(drop=True)
# only include those with CHS type IDs in the matching reference transcript
sp_tm = sp_tm[sp_tm["ref_id"].str.contains("CHS\.[0-9]*\.[0-9]*")].reset_index(drop=True)
sp_tm["gid"] = sp_tm["ref_id"].str.split("\.",expand=True)[1].astype(int)
# make sure the IDs exist in the set of real genes
sp_tm = sp_tm[sp_tm["gid"].isin(real_tm["gid"])]
sp_tm.to_csv(base_dir_data+"splicing_chess2all.tgids",index=False)
sp_tm.head()

Unnamed: 0,ref_id,qry_id,gid
0,CHS.16.3,ALL_00034985,16
1,CHS.24.2,ALL_00013071,24
2,CHS.26.2,ALL_00000025,26
3,CHS.26.2,ALL_00000026,26
4,CHS.26.2,ALL_00000030,26


In [8]:
# next we need to perform the same operation for the intronic noise
# first need to also retrieve chess IDs
chess_intronloc_cmd = """awk -v pat='i' -F '\t' '$3~pat {print}' """+base_dir_data+"""nonreal.tmap > """+base_dir_data+"intron.chess.locs"
subprocess.call(chess_intronloc_cmd,shell=True)

0

In [9]:
# now need to obtain a mapping from chess to ALL for the intronic noise transcripts
int_tm = pd.read_csv(base_dir_data+"intron.chess.locs",names=tmapcols,sep="\t",usecols=["ref_id","qry_id"])
# only include those that belong to valid transcripts in CHESS
int_tm = int_tm[int_tm["ref_id"].isin(real_tm["ref_id"])].reset_index(drop=True)
# only include those with CHS type IDs in the matching reference transcript
int_tm = int_tm[int_tm["ref_id"].str.contains("CHS\.[0-9]*\.[0-9]*")].reset_index(drop=True)
int_tm["gid"] = int_tm["ref_id"].str.split("\.",expand=True)[1].astype(int)
int_tm.to_csv(base_dir_data+"intronic_chess2all.tgids",index=False)
int_tm.head()

Unnamed: 0,ref_id,qry_id,gid
0,CHS.41.9,ALL_00028752,41
1,CHS.138.5,ALL_00013566,138
2,CHS.146.7,ALL_00013616,146
3,CHS.161.1,ALL_00000580,161
4,CHS.180.8,ALL_00000614,180


In [10]:
all_qry_ids = set(real_tm["qry_id"]).union(set(sp_tm["qry_id"]).union(int_tm["qry_id"]))
len(all_qry_ids)

162177

In [11]:
# need to build a dictionary from qry_id to ref_id
all_tm = pd.concat([real_tm,sp_tm,int_tm],axis=0)
id_dict = pd.Series(all_tm.ref_id.values,index=all_tm.qry_id).to_dict()
real_dict = pd.Series(real_tm.ref_id.values,index=real_tm.qry_id).to_dict()
sp_dict = pd.Series(sp_tm.ref_id.values,index=sp_tm.qry_id).to_dict()
int_dict = pd.Series(int_tm.ref_id.values,index=int_tm.qry_id).to_dict()

In [12]:
assert set(id_dict)==set(all_tm["qry_id"]),"qry sets not identical"
assert set(id_dict.values())==set(all_tm["ref_id"]),"ref sets not identical"
assert len(list(id_dict))==len(list(all_tm["qry_id"].tolist())),"qry lists of different lengths"
assert len(list(id_dict.values()))==len(list(all_tm["ref_id"].tolist())),"ref lists of different lengths"

In [13]:
count=0

real_gtf_fp = open(base_dir_data+"real_1.gtf","w+")
sp_gtf_fp = open(base_dir_data+"splicing_1.gtf","w+")
int_gtf_fp = open(base_dir_data+"intronic_1.gtf","w+")

with open(base_dir_data+"ALL.merged.gtf","r") as inFP:
    for line in inFP.readlines():
        line = line.strip()
        lineCols = line.split("\t")
        count+=1
        if count%1000000==0:
            print(count/1000000)
            
        # get transcript and gene IDs
        attrs = lineCols[8].split("\"")
        tid = attrs[1]
        gid = attrs[3]
        ntid = ""
        ngid = ""
        # now find the corresponding chess IDs
        if not tid in id_dict: # nothing found
            continue
            
        if tid in real_dict:
            ntid = real_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = "\t".join(lineCols[:8])\
                             +"\ttranscript_id \""+ntid\
                             +"\"; gene_id \""+ngid\
                             +"\"; old_transcript_id_old \""+tid\
                             +"\"; old_gene_id \""+gid+"\";\n"
#             print(new_line)
            real_gtf_fp.write(new_line)
            
        
        elif tid in sp_dict:
            ntid = sp_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = "\t".join(lineCols[:8])\
                             +"\ttranscript_id \""+tid\
                             +"\"; gene_id \""+ngid\
                             +"\"; old_transcript_id_old \""+tid\
                             +"\"; old_gene_id \""+gid+"\";\n"
#             print(new_line)
            sp_gtf_fp.write(new_line)
        
        elif tid in int_dict:
            ntid = int_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = "\t".join(lineCols[:8])\
                             +"\ttranscript_id \""+tid\
                             +"\"; gene_id \""+ngid\
                             +"\"; old_transcript_id_old \""+tid\
                             +"\"; old_gene_id \""+gid+"\";\n"
#             print(new_line)
            int_gtf_fp.write(new_line)
            
        else: # nothing found
            print("ERROR")
            break;
            
#         break;
        
real_gtf_fp.close()
sp_gtf_fp.close()
int_gtf_fp.close()

1.0
2.0


In [15]:
# lastly need to concatenate all 4 to create a new set of transcripts
# interesting to know how many remain in total
cat_cmd = "cat "+base_dir_data+"real_1.gtf "+base_dir_data+"splicing_1.gtf "+base_dir_data+"intronic_1.gtf "+base_dir_data+"RNApol_1.gtf > "+base_dir_data+"ALL.merged.filtered.gtf"
subprocess.call(cat_cmd,shell=True)

0

In [16]:
# now we need to modify intergenic set to replcae all undefined strands with real strand information
polDF = pd.read_csv(base_dir_data+"RNApol_1.gtf",sep="\t",comment="#",names=gff3cols)
polDF["tid"] = polDF["attributes"].str.split("transcript_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]
undef_strand_tids = polDF[(polDF["type"]=="transcript")&(polDF["strand"]==".")].tid.tolist()
def_strand_tids = polDF[(polDF["type"]=="transcript")&~(polDF["strand"]==".")].tid.tolist()
assert len(undef_strand_tids)==len(set(undef_strand_tids)),"duplicates in undefined"
assert len(def_strand_tids)==len(set(def_strand_tids)),"duplicates in defined"
assert len(set(undef_strand_tids).intersection(set(def_strand_tids)))==0,"same transcripts in both groups"
random_strands = random.choices(["+","-"],k=len(undef_strand_tids))
undef_strand_df = pd.concat([pd.DataFrame(undef_strand_tids,columns=["tid"]),\
                             pd.DataFrame(random_strands,columns=["strand_new"])],axis=1)
polDF = polDF.merge(undef_strand_df,on="tid",how="left")
polDF["strand"] = np.where(polDF["strand"]==".",polDF["strand_new"],polDF["strand"])
polDF[gff3cols].to_csv(base_dir_data+"RNApol.gtf",header=False,index=False,sep="\t",quoting=csv.QUOTE_NONE)

In [17]:
# first we need to load the RNApol set in order to know which transcripts to output for that set
polDF = pd.read_csv(base_dir_data+"RNApol.gtf",sep="\t",comment="#",names=gff3cols)
polDF = polDF[polDF["type"]=="transcript"].reset_index(drop=True)
polDF["tid"] = polDF["attributes"].str.split("transcript_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]
pol_set = set(polDF["tid"])

In [18]:
# now we need to modify any other files with the updated information
# namely, we need to update the tracking file with the updated information
count=0

real_tr_fp = open(base_dir_data+"real.tracking","w+")
sp_tr_fp = open(base_dir_data+"splicing.tracking","w+")
int_tr_fp = open(base_dir_data+"intronic.tracking","w+")
pol_tr_fp = open(base_dir_data+"RNApol.tracking","w+")

with open(base_dir_data+"ALL.tracking","r") as inFP:
    for line in inFP.readlines():
        line = line.strip()
        lineCols = line.split("\t")
        count+=1
        if count%1000000==0:
            print(count/1000000)
            
        # get transcript and gene IDs
        tid = lineCols[0]
        # now find the corresponding chess IDs
        if not tid in id_dict and not tid in pol_set: # nothing found
            continue
            
        if tid in real_dict:
            ntid = real_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = ntid+"\t"+ngid+"\t"+"\t".join(lineCols[2:])+"\n"
#             print(new_line)
            real_tr_fp.write(new_line)
            
        
        elif tid in sp_dict:
            ntid = sp_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = tid+"\t"+ngid+"\t"+"\t".join(lineCols[2:])+"\n"
#             print(new_line)
            sp_tr_fp.write(new_line)
        
        elif tid in int_dict:
            ntid = int_dict[tid]
            ngid = "CHS."+ntid.split(".")[1]
            new_line = tid+"\t"+ngid+"\t"+"\t".join(lineCols[2:])+"\n"
#             print(new_line)
            int_tr_fp.write(new_line)
    
        elif tid in pol_set:
            pol_tr_fp.write(line+"\n")
            
        else: # nothing found
            print("ERROR")
            break;
            
#         break;
        
real_tr_fp.close()
sp_tr_fp.close()
int_tr_fp.close()
pol_tr_fp.close()

In [19]:
# lastly need to concatenate all 4 to create a new tracking file
cat_tr_cmd = "cat "+base_dir_data+"real.tracking "+\
          base_dir_data+"splicing.tracking "+\
          base_dir_data+"intronic.tracking "+\
          base_dir_data+"RNApol.tracking > "+base_dir_data+"ALL.merged.filtered.tracking"
subprocess.call(cat_tr_cmd,shell=True)

0

In [20]:
# deal with missing strand for real loci now
# load base annotations
print(">>>loading base annotations")
real_baseDF = pd.read_csv(base_dir_data+"real_1.gtf",sep="\t",names=gff3cols)
splice_baseDF = pd.read_csv(base_dir_data+"splicing_1.gtf",sep="\t",names=gff3cols)
int_baseDF = pd.read_csv(base_dir_data+"intronic_1.gtf",sep="\t",names=gff3cols)

# get all loci and transcript IDs
print(">>>getting loci IDs")
real_baseDF["lid"] = real_baseDF["attributes"].str.split("gene_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]
splice_baseDF["lid"] = splice_baseDF["attributes"].str.split("gene_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]
int_baseDF["lid"] = int_baseDF["attributes"].str.split("gene_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]

>>>loading base annotations
>>>getting loci IDs


In [21]:
chessDF = pd.read_csv(base_dir_data+"chess2.2_assembly.gff",sep="\t",comment="#",names=gff3cols)
chessDF[chessDF["type"]=="gene"].reset_index(drop=True)
chessDF["lid"] = "CHS."+chessDF["attributes"].str.split("CHS.",expand=True,n=1)[1].str.split("\.|;",expand=True,n=1)[0]
chessDF.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,lid
0,chr1,RefSeq,gene,11874,14409,.,+,.,ID=CHS.1;GENE_TYPE=misc_RNA;STATUS=known_refse...,CHS.1
1,chr1,BestRefSeq,transcript,11874,14409,.,+,.,ID=CHS.1.1;Parent=CHS.1;gene_name=DDX11L1;Dbxr...,CHS.1
2,chr1,BestRefSeq,exon,11874,12227,.,+,.,Parent=CHS.1.1,CHS.1
3,chr1,BestRefSeq,exon,12613,12721,.,+,.,Parent=CHS.1.1,CHS.1
4,chr1,BestRefSeq,exon,13221,14409,.,+,.,Parent=CHS.1.1,CHS.1


In [22]:
print("getting real strands")
st_df = chessDF[["lid","strand"]].drop_duplicates(keep="first")
assert set(st_df["strand"])==set(["+","-"]),"wrong strands"
st_df.columns = ["lid","strand_new"]

getting real strands


In [23]:
print("setting real")
real_baseDF2=real_baseDF.merge(st_df,on="lid",how="inner")
real_baseDF2["strand"] = real_baseDF2["strand_new"]
assert set(real_baseDF2["strand"]==set(["+","-"])),"wrong strands real"

print("setting intronic")
int_baseDF2=int_baseDF.merge(st_df,on="lid",how="inner")
int_baseDF2["strand"] = int_baseDF2["strand_new"]
assert set(int_baseDF2["strand"])==set(["+","-"]),"wrong strands int"

print("setting splicing")
splice_baseDF2=splice_baseDF.merge(st_df,on="lid",how="inner")
splice_baseDF2["strand"] = splice_baseDF2["strand_new"]
assert set(splice_baseDF2["strand"])==set(["+","-"]),"wrong strands splice"

setting real
setting intronic
setting splicing


In [28]:
# mv_cmd = """mv """+base_dir_data+"""splicing.gtf """+base_dir_data+"""splicing_old.gtf"""
# subprocess.call(mv_cmd,shell=True)
# mv_cmd = """mv """+base_dir_data+"""intronic.gtf """+base_dir_data+"""intronic_old.gtf"""
# subprocess.call(mv_cmd,shell=True)
mv_cmd = """mv """+base_dir_data+"""ALL.merged.filtered.gtf """+base_dir_data+"""ALL.merged.filtered_old.gtf"""
subprocess.call(mv_cmd,shell=True)

real_baseDF2[gff3cols].to_csv(base_dir_data+"real.gtf",header=False,index=False,sep="\t",quoting=csv.QUOTE_NONE)
splice_baseDF2[gff3cols].to_csv(base_dir_data+"splicing.gtf",header=False,index=False,sep="\t",quoting=csv.QUOTE_NONE)
int_baseDF2[gff3cols].to_csv(base_dir_data+"intronic.gtf",header=False,index=False,sep="\t",quoting=csv.QUOTE_NONE)

In [29]:
# lastly need to concatenate all 4 to create a new set of transcripts again
cat_cmd = "cat "+base_dir_data+"real.gtf "+base_dir_data+"splicing.gtf "+base_dir_data+"intronic.gtf "+base_dir_data+"RNApol.gtf > "+base_dir_data+"ALL.merged.filtered.gtf"
subprocess.call(cat_cmd,shell=True)

0

In [None]:
agg_cmd = "/ccb/salz8-1/avaraby/tx_noise/soft/gtex_stats/gtex_stats \
           -t "+base_dir_data+"tissues/tissues.lst \
           -a "+base_dir_data+"ALL.merged.filtered.tracking \
           -r "+base_dir_data+"real.gtf \
           -g "+base_dir_data+"ALL.merged.filtered.gtf \
           -s "+base_dir_data+"splicing.gtf \
           -i "+base_dir_data+"intronic.gtf \
           -p "+base_dir_data+"RNApol.gtf \
           -o "+base_dir_out+"res"
# print(agg_cmd)
subprocess.call(agg_cmd,shell=True)