In [10]:
import pandas as pd
import numpy as np
import multiprocessing
import subprocess
import random
import pickle
import shutil
import glob
import math
import csv
import sys
import os
import re

from pybedtools import BedTool

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib_venn import venn3
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.size'] = 24
%matplotlib inline

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [11]:
# --- Directory layout (everything hangs off base_dir) -----------------------
base_dir = "./"
data_dir = base_dir+"data/"
soft_dir = base_dir+"soft/"
out_dir = base_dir+"step4/"

# --- Input annotation produced by the previous step -------------------------
chess3_gtf_fname = base_dir+"step3/chess.seeder.cds.crs.short_label.ender.chrM.adder.reader.gtf"

# --- Reference annotations, all stored in the same sub-directory ------------
_latest_gtfs_dir = data_dir+"latest_gtfs/"
gencode_gtf_fname = _latest_gtfs_dir+"gencode.primary.gtf"
refseq_gtf_fname = _latest_gtfs_dir+"refseq.primary.gtf"
mane_gtf_fname = _latest_gtfs_dir+"MANE.primary.gtf"
refgen_gtf_fname = _latest_gtfs_dir+"refgen.primary.gtf"
chess2_gtf_fname = _latest_gtfs_dir+"chess2.gtf"
M1_gtf_fname = _latest_gtfs_dir+"1M.primary.gtf"
M27_gtf_fname = _latest_gtfs_dir+"27M.primary.gtf"

# --- Reference genome (bare file name, no directory prefix) -----------------
ref_fasta_fname = "hg38_p12_ucsc.no_alts.no_fixs.fa"

# --- Parallelism setting ------------------------------------------------------
num_threads = 40

In [12]:
# Create the output directory (and any missing parents). exist_ok=True makes the
# cell safe to re-run and removes the gap between checking and creating.
os.makedirs(out_dir, exist_ok=True)

In [13]:
# Attribute tags to filter on, keyed by tag name.
#   remove_values : tag values that mark a transcript for removal
#   keep_if_known : when True, the filtering code in this notebook exempts
#                   transcripts of a known type from this tag's filter
# Removal wins over retention: once any tag marks a transcript for removal it is
# dropped, regardless of what the other tags say.
tvs_filter = {
    "CDS_TYPE": {
        "remove_values": {"no_start_stop", "orf_no_ppp"},
        "keep_if_known": False,
    },
    "confidence": {
        "remove_values": {"2"},
        "keep_if_known": True,
    },
    "notes": {
        "remove_values": {"'short_exon'"},
        "keep_if_known": True,
    },
    "reader_status": {
        "remove_values": {"fail"},
        "keep_if_known": False,
    },
}

In [14]:
# load the annotation to select transcripts
gtf = pd.read_csv(chess3_gtf_fname,sep="\t",comment="#",names=gff3Cols)
gtf = gtf[gtf["type"]=="transcript"].reset_index(drop=True)
gtf["tid"] = gtf["attributes"].str.split("transcript_id \"",expand=True,n=1)[1].str.split("\"",expand=True,n=1)[0]
all_tids = set(gtf["tid"])

In [15]:
# Annotate every transcript with the values of the filter tags and with the
# resulting keep/discard decision, so the filter can be inspected as a table.
#
# The decision rule mirrors the streaming filter that writes the output GTF:
# a transcript is dropped when a tag holds one of its `remove_values`, unless
# the transcript is known and the tag is marked `keep_if_known`.

def _attribute_values(attributes, tag):
    """Return, per row, the text between `<tag> "` and the next double quote.

    attributes : Series of GTF attribute strings.
    tag        : attribute name to look up (matched literally).
    Rows that do not carry the tag yield NaN, including when no row carries it.
    """
    return attributes.str.extract(re.escape(tag)+" \"([^\"]*)", expand=False)

gtf["keep"] = True

gtf["type_attr"] = _attribute_values(gtf["attributes"], "type")
known_tags = set(["unassembled","known","11","1n"])
# Transcripts without a type attribute count as known, which is the same default
# the streaming filter uses when it finds no type attribute.
gtf["known"] = gtf["type_attr"].isna() | gtf["type_attr"].isin(known_tags)

for t,v in tvs_filter.items():
    gtf[t] = _attribute_values(gtf["attributes"], t)
    flagged = gtf[t].isin(v["remove_values"])
    if v["keep_if_known"]:
        # this tag only removes novel transcripts
        flagged = flagged & ~gtf["known"]
    # element-wise masks; Python's `is`/`and` on a Series do not test its elements
    gtf["keep"] = gtf["keep"] & ~flagged
gtf.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,tid,keep,type_attr,known,CDS_TYPE,confidence,notes,reader_status
0,chr1,StringTie,transcript,11869,14409,.,+,.,"transcript_id ""ALL_00405811""; CDS_TYPE ""noncod...",ALL_00405811,True,known,True,noncoding,,,pass
1,chr1,StringTie,transcript,25776,31097,.,+,.,"transcript_id ""ALL_00000005""; CDS_TYPE ""noncod...",ALL_00000005,True,novel,False,noncoding,1.0,,pass
2,chr1,StringTie,transcript,29554,31097,.,+,.,"transcript_id ""ALL_00000008""; CDS_TYPE ""noncod...",ALL_00000008,True,known,True,noncoding,,,pass
3,chr1,StringTie,transcript,30267,31109,.,+,.,"transcript_id ""ALL_00000009""; CDS_TYPE ""noncod...",ALL_00000009,True,known,True,noncoding,,,pass
4,chr1,StringTie,transcript,180818,184174,.,+,.,"transcript_id ""ALL_02253581""; CDS_TYPE ""noncod...",ALL_02253581,True,novel,False,noncoding,1.0,,pass


In [16]:
# Stream the CHESS3 GTF once and copy every record that passes the tag filters
# into a temporary GTF.
#
# The decision is taken on each `transcript` line and then applies to all lines
# that follow it until the next transcript line (its exons, CDS, ...).
# A transcript is dropped when any tag in `tvs_filter` holds one of its
# `remove_values`, unless the transcript is known and that tag is `keep_if_known`.
tmp_gtf_fname = chess3_gtf_fname+".tmp"

known_tags = set(["unassembled","known","11","1n"])

discarded = 0
kept = 0

# discarded_reasons[status][tag][value] -> number of dropped transcripts with
# that tag value; a transcript matching several tags is counted once per tag.
discarded_reasons = dict({"known":dict(),"novel":dict()})
for k,v in tvs_filter.items():
    for status in ("known","novel"):
        discarded_reasons[status][k] = {v2:0 for v2 in v["remove_values"]}

with open(tmp_gtf_fname,"w") as outFP, open(chess3_gtf_fname,"r") as inFP:
    keep = True  # records ahead of the first transcript line are passed through
    for line in inFP:
        if line.startswith("#"):
            # header/comment line: copy it once and do not parse it as a record
            outFP.write(line)
            continue
        if not line.strip():
            # blank lines carry no record and cannot be split into columns
            continue

        lcs = line.split("\t")
        if lcs[2]=="transcript":
            keep = True
            # a transcript without a type attribute is treated as known
            known = True
            if "type \"" in lcs[8]:
                known = lcs[8].split("type \"",1)[1].split("\"",1)[0] in known_tags
            status = "known" if known else "novel"

            for t,v in tvs_filter.items():
                att_val = "-"  # placeholder for a missing tag
                if t+" \"" in lcs[8]:
                    att_val = lcs[8].split(t+" \"",1)[1].split("\"",1)[0]
                if att_val in v["remove_values"] and not (known and v["keep_if_known"]):
                    keep = False
                    discarded_reasons[status][t][att_val] += 1

            if keep:
                kept += 1
            else:
                discarded += 1

        if keep:
            outFP.write(line)

print(kept+discarded,kept,discarded)
print(discarded_reasons)

458079 341558 116521
{'known': {'CDS_TYPE': {'orf_no_ppp': 4670, 'no_start_stop': 19619}, 'confidence': {'2': 0}, 'notes': {"'short_exon'": 0}, 'reader_status': {'fail': 1268}}, 'novel': {'CDS_TYPE': {'orf_no_ppp': 10201, 'no_start_stop': 20021}, 'confidence': {'2': 62076}, 'notes': {"'short_exon'": 0}, 'reader_status': {'fail': 10457}}}


In [None]:
# Run gffread on the filtered temporary GTF to clean up duplicate transcripts and
# write the final annotation to out_dir.
# NOTE(review): the original comment was cut off after "resulting from" - confirm
# which earlier step introduces the duplicates.
# NOTE(review): flags are passed through unchanged (-F -T -M); see the gffread
# usage text for their exact meaning.
out_gtf_fname = out_dir+"chess.gtf"
gffread_cmd = ["gffread","-F","-T","-M",
               "-o",out_gtf_fname,
               tmp_gtf_fname]
print(" ".join(gffread_cmd))
# check_call raises CalledProcessError on a non-zero exit status, so a failed run
# cannot silently leave a missing or partial output file behind.
subprocess.check_call(gffread_cmd)

In [18]:
# Show the discard counters (status -> tag -> value -> count) as the cell output.
discarded_reasons

{'known': {'CDS_TYPE': {'orf_no_ppp': 4670, 'no_start_stop': 19619},
  'confidence': {'2': 0},
  'notes': {"'short_exon'": 0},
  'reader_status': {'fail': 1268}},
 'novel': {'CDS_TYPE': {'orf_no_ppp': 10201, 'no_start_stop': 20021},
  'confidence': {'2': 62076},
  'notes': {"'short_exon'": 0},
  'reader_status': {'fail': 10457}}}