In [24]:
# main imports
import os
import sys
import ast
import glob
import math
import shutil
import random
import importlib
import subprocess

from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats

import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import matplotlib.pylab as pylab
import upsetplot
import seaborn as sns

plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.size'] = 24
%matplotlib inline

pd.set_option('display.max_columns', None)

In [25]:
%load_ext autoreload
%autoreload 1

sys.path.insert(0, "/ccb/salz4-4/avaraby/orfanage/soft")
%aimport definitions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# paths

base_dir = "/ccb/salz4-4/avaraby/orfanage/"

orfanage_bin = base_dir+"bin/orfanage"
gffcompare_bin = "/ccb/salz7-data/sw2/bin/gffcompare"
gffread_bin = "/ccb/salz7-data/sw2/bin/gffread"
igvtools_bin = "/ccb/salz7-data/sw/bin/igvtools"
sashimi_bin = base_dir+"bin/sashimi.py"
td_bin = base_dir+"soft/transdecoder/TransDecoder.LongOrfs"
tdp_bin = base_dir+"soft/transdecoder/TransDecoder.Predict"
gtf_genome_to_cdna_fasta = base_dir+"soft/transdecoder/util/gtf_genome_to_cdna_fasta.pl"
gtf_to_alignment_gff3 = base_dir+"soft/transdecoder/util/gtf_to_alignment_gff3.pl"
cdna_alignment_orf_to_genome_orf = base_dir+"soft/transdecoder/util/cdna_alignment_orf_to_genome_orf.pl"

fa_fname = base_dir+"data/hg38.fa"
gtf_fname = base_dir+"data/gencode.v41.gtf"
mane_gtf_fname = base_dir+"data/MANE.v10.gtf"

td_outdir = base_dir+"td_gencode_rev1/"
if not os.path.exists(td_outdir):
    os.makedirs(td_outdir)

In [6]:
# arguments
num_threads = 30

In [7]:
gtf_adjstop_fname = gtf_fname.rsplit(".",1)[0]+".adjstop.gtf"
gtf_adjstop_sorted_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".sorted.gtf"
gtf_adjstop_aa_fa_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".aa.fa"

clean_gtf_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".clean.gtf"
nocds_gtf_fname = clean_gtf_fname.rsplit(".",1)[0]+".nocds.gtf"
nocds_gff_fname = nocds_gtf_fname.rsplit(".",1)[0]+".gff3"
nocds_fa_fname = nocds_gtf_fname.rsplit(".",1)[0]+".fa"

out_gtf_fname = td_outdir+"orf.gtf"
out_stats_fname = td_outdir+"orf.stats"
out_gtf_sorted_fname = td_outdir.rsplit(".",1)[0]+".sorted.gtf"

out_df_tsv_fname = td_outdir+"df.tsv"

In [8]:
%%time
# run orfanage with mane as the reference
cmd = [orfanage_bin,
       "--reference",fa_fname,
       "--query",nocds_gtf_fname,
       "--threads",str(num_threads),
       "--output",out_gtf_fname,
       "--stats",out_stats_fname,
       mane_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

/ccb/salz4-4/avaraby/orfanage/bin/orfanage --reference /ccb/salz4-4/avaraby/orfanage/data/hg38.fa --query /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gtf --threads 30 --output /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/orf.gtf --stats /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/orf.stats /ccb/salz4-4/avaraby/orfanage/data/MANE.v10.gtf


loading reference genome
loading reference transcriptomes
sorting reference transcriptome
loading query transcriptome
bundling transcriptome
starting main evaluation


CPU times: user 7.33 ms, sys: 6.52 ms, total: 13.8 ms
Wall time: 32.6 s


0

In [66]:
%%time
# run orfanage with mane as the reference
cmd = [orfanage_bin,
       "--reference",fa_fname,
       "--query",nocds_gtf_fname,
       "--output",out_gtf_fname,
       "--stats",out_stats_fname,
       mane_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

/ccb/salz4-4/avaraby/orfanage/bin/orfanage --reference /ccb/salz4-4/avaraby/orfanage/data/hg38.fa --query /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gtf --output /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/orf.gtf --stats /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/orf.stats /ccb/salz4-4/avaraby/orfanage/data/MANE.v10.gtf


loading reference genome
loading reference transcriptomes
sorting reference transcriptome
loading query transcriptome
bundling transcriptome
starting main evaluation


CPU times: user 9.69 ms, sys: 12.8 ms, total: 22.5 ms
Wall time: 36.3 s


0

In [18]:
# extract mane protein fasta for blastp
cmd = [gffread_bin,
       "-g",fa_fname,
       "-y",td_outdir+"MANE.aa.fasta",
       mane_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

/ccb/salz7-data/sw2/bin/gffread -g /ccb/salz4-4/avaraby/orfanage/data/hg38.fa -y /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/MANE.aa.fasta /ccb/salz4-4/avaraby/orfanage/data/MANE.v10.gtf


0

In [40]:
%%time
# construct the transcript fasta file using the genome and the transcripts.gtf file like so:

cvt_cmd = [gtf_genome_to_cdna_fasta,nocds_gtf_fname,fa_fname]

print(" ".join(cvt_cmd)+" > "+nocds_fa_fname)
outFP = open(nocds_fa_fname,"w")
subprocess.call(cvt_cmd,stdout=outFP)
outFP.close()

# Next, convert the transcript structure GTF file to an alignment-GFF3 formatted file (this is done only because our processes operate on gff3 rather than the starting gtf file - nothing of great consequence). Convert gtf to alignment-gff3 like so, using cufflinks GTF output as an example:

gff_cmd = [gtf_to_alignment_gff3,nocds_gtf_fname]
print(" ".join(gff_cmd)+" > "+nocds_gff_fname)
outFP = open(nocds_gff_fname,"w")
subprocess.call(gff_cmd,stdout=outFP)
outFP.close()

# Now, run the process described above to generate your best candidate ORF predictions:

td_cmd = [td_bin,"-S",
          "-t",nocds_fa_fname]
print(" ".join(td_cmd))
subprocess.call(td_cmd)

cmd = ["blastp",
       "-query","/ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/longest_orfs.pep",
       "-db",td_outdir+"MANE.aa.fasta",
       "-max_target_seqs","1",
       "-outfmt","6",
       "-num_threads",str(num_threads)]
blast_fname = "/ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/blastp.outfmt6"

print(" ".join(cmd)+" > "+blast_fname)
outFP = open(blast_fname,"w")
subprocess.call(cmd,stdout=outFP)
outFP.close()

# (optionally, identify peptides with homology to known proteins)
tdp_cmd = [tdp_bin,"--single_best_only",
           "--retain_blastp_hits",blast_fname,
           "-t",nocds_fa_fname]
print(" ".join(tdp_cmd))
subprocess.call(tdp_cmd)

# And finally, generate a genome-based coding region annotation file:
otg_cmd = [cdna_alignment_orf_to_genome_orf,
           "/ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder.gff3",
           nocds_gff_fname,
           nocds_fa_fname]

print(" ".join(otg_cmd)+" > "+td_outdir+"td.genome.gff3")
outFP = open(td_outdir+"td.genome.gff3","w")
subprocess.call(otg_cmd,stdout=outFP)
outFP.close()

# convert to gtf
td_gtf_fname = td_outdir+"td.genome.gtf"
cmd = ["gffread","-T","-o",td_gtf_fname,td_outdir+"td.genome.gff3"]
print(" ".join(cmd))
subprocess.call(cmd)

/ccb/salz4-4/avaraby/orfanage/soft/transdecoder/util/gtf_genome_to_cdna_fasta.pl /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gtf /ccb/salz4-4/avaraby/orfanage/data/hg38.fa > /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.fa


-parsing cufflinks output: /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gtf
-parsing genome fasta: /ccb/salz4-4/avaraby/orfanage/data/hg38.fa
-done parsing genome.
// processing chr1
// processing chr10
// processing chr11
// processing chr12
// processing chr13
// processing chr14
// processing chr15
// processing chr16
// processing chr17
// processing chr18
// processing chr19
// processing chr2
// processing chr20
// processing chr21
// processing chr22
// processing chr3
// processing chr4
// processing chr5
// processing chr6
// processing chr7
// processing chr8
// processing chr9
// processing chrM
// processing chrX
// processing chrY


/ccb/salz4-4/avaraby/orfanage/soft/transdecoder/util/gtf_to_alignment_gff3.pl /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gtf > /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gff3
/ccb/salz4-4/avaraby/orfanage/soft/transdecoder/TransDecoder.LongOrfs -S -t /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.fa
blastp -query /ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/longest_orfs.pep -db /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/MANE.aa.fasta -max_target_seqs 1 -outfmt 6 -num_threads 30 > /ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/blastp.outfmt6
/ccb/salz4-4/avaraby/orfanage/soft/transdecoder/TransDecoder.Predict --single_best_only --retain_blastp_hits /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/blastp.outfmt6 -t /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.fa


-- Skipping CMD: /ccb/salz4-4/avaraby/orfanage/soft/transdecoder/util/compute_base_probs.pl /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.fa 1 > /ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/base_freqs.dat, checkpoint [/ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/__checkpoints_longorfs/base_freqs_file.ok] exists.
-skipping long orf extraction, already completed earlier as per checkpoint: /ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/__checkpoints_longorfs/TD.longorfs.ok
-- Skipping CMD: /ccb/salz4-4/avaraby/orfanage/soft/transdecoder/util/get_top_longest_fasta_entries.pl gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/longest_orfs.cds 5000 5000 > gencode.v41.adjstop.clean.nocds.fa.transdecoder_dir/longest_orfs.cds.top_longest_5000, checkpoint [/ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transde

/ccb/salz4-4/avaraby/orfanage/soft/transdecoder/util/cdna_alignment_orf_to_genome_orf.pl /ccb/salz4-4/avaraby/orfanage/notebooks/gencode.v41.adjstop.clean.nocds.fa.transdecoder.gff3 /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.gff3 /ccb/salz4-4/avaraby/orfanage/data/gencode.v41.adjstop.clean.nocds.fa > /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/td.genome.gff3





	Done.  126635 / 126635 transcript orfs could be propagated to the genome



gffread -T -o /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/td.genome.gtf /ccb/salz4-4/avaraby/orfanage/td_gencode_rev1/td.genome.gff3
CPU times: user 18.5 ms, sys: 111 ms, total: 130 ms
Wall time: 5min 30s


0

In [54]:
# use gffread -J to identify cases which have missing start/stop codons
td_clean_gtf_fname = td_outdir+"td.clean.genome.gtf"
j_cmd = ["gffread","-T","-g",fa_fname,"-o",td_clean_gtf_fname,td_gtf_fname]
subprocess.call(j_cmd)

0

In [67]:
# load chains
td_df = definitions.get_chains(td_clean_gtf_fname,"CDS",True)
# clean extra tags from tid
td_df.rename({"tid":"td_tid"},axis=1,inplace=True)
td_df["tid"] = td_df["td_tid"].str.rsplit(".",n=1,expand=True)[0]
# extract length from the chain
td_df["clen"] = td_df.apply(lambda row: definitions.clen(row["chain"]),axis=1)
# sort and remove duplicates keeping only the longest ORF
td_df.sort_values(by=["tid","clen"],ascending=False,inplace=True)
print(len(td_df))
td_df.drop_duplicates(["tid"],keep="first",inplace=True)
td_df.drop("td_tid",axis=1,inplace=True)
print(len(td_df))

# remove junk and rename
td_df = td_df[["tid","has_cds","chain"]]
td_df.columns = ["tid","td_has_cds","td_chain"]


# load original chains
def_df = definitions.get_chains(clean_gtf_fname,"CDS",True)
# add gene ids
def_attr = definitions.get_attribute(clean_gtf_fname,"gene_id")
def_attr.columns = ["tid","gid"]
def_df = def_df.merge(def_attr,on="tid",how="left")
def_df = def_df[["tid","gid","has_cds","chain"]]
def_df.columns = ["tid","gid","def_has_cds","def_chain"]


# load orfanage results
orf_df = definitions.get_chains(out_gtf_fname,"CDS",True)
orf_df = orf_df[["tid","has_cds","chain"]]
orf_df.columns = ["tid","orf_has_cds","orf_chain"]


# load mane
mane_df = definitions.get_chains(mane_gtf_fname,"CDS",True)
# add gene ids
mane_attr = definitions.get_attribute(mane_gtf_fname,"gene_id")
mane_attr.columns = ["tid","gid"]
mane_df = mane_df.merge(def_attr,on="tid",how="left")
mane_df = mane_df[["tid","gid","chain","strand","coords"]]
mane_df.columns = ["mane_tid","gid","mane_chain","strand","coords"]


# merge all dataframes into one
df = def_df.merge(mane_df,on="gid",how="left")
df = df[df["gid"].isin(set(mane_df["gid"]))].reset_index(drop=True)
df = df.merge(td_df,on="tid",how="left")
df = df.merge(orf_df,on="tid",how="left")
df["td_has_cds"] = df["td_has_cds"].fillna(0)
df["td_chain"] = df.apply(lambda row: list() if row["td_chain"]!=row["td_chain"] else row["td_chain"],axis=1)

# df = df[df["def_has_cds"]==1].reset_index(drop=True)

all_same = df[(df["td_chain"]==df["orf_chain"])&\
              (df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

orf_eq_def_ne_td = df[~(df["td_chain"]==df["orf_chain"])&\
                       (df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

td_eq_def_ne_orf = df[~(df["td_chain"]==df["orf_chain"])&\
                       (df["td_chain"]==df["def_chain"])].reset_index(drop=True)

ad = df[~(df["td_chain"]==df["orf_chain"])&\
              ~(df["td_chain"]==df["def_chain"])&\
              ~(df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

print("total number of transcripts: "+str(len(df)))
print("all equal: "+str(len(all_same)))
print("ORFanage==GENCODE!=TransDecoder: "+str(len(orf_eq_def_ne_td)))
print("Of those, these many are coding in TransDecoder: "+str(len(orf_eq_def_ne_td[orf_eq_def_ne_td["td_has_cds"]==1])))
print("Of those, these many are coding in both GENCODE and ORFanage: "+str(len(orf_eq_def_ne_td[orf_eq_def_ne_td["orf_has_cds"]==1])))
print("Of those, these many are non-coding in both GENCODE and ORFanage: "+str(len(orf_eq_def_ne_td[orf_eq_def_ne_td["orf_has_cds"]==0])))
print("ORFanage!=GENCODE==TransDecoder: "+str(len(td_eq_def_ne_orf)))
print("Of those, these many are coding in both GENCODE and TransDecoder: "+str(len(td_eq_def_ne_orf[td_eq_def_ne_orf["td_has_cds"]==1])))
print("Of those, these many are non-coding in both GENCODE and TransDecoder: "+str(len(td_eq_def_ne_orf[td_eq_def_ne_orf["td_has_cds"]==0])))

print("All different: "+str(len(ad)))

126635
126635
total number of transcripts: 128154
all equal: 56907
ORFanage==GENCODE!=TransDecoder: 26788
Of those, these many are coding in TransDecoder: 23428
Of those, these many are coding in both GENCODE and ORFanage: 17864
Of those, these many are non-coding in both GENCODE and ORFanage: 8924
ORFanage!=GENCODE==TransDecoder: 10817
Of those, these many are coding in both GENCODE and TransDecoder: 1368
Of those, these many are non-coding in both GENCODE and TransDecoder: 9449
All different: 14192


In [68]:
orf_tp_df = (df[(df["def_has_cds"]==1)&(df["orf_chain"]==df["def_chain"])])
orf_fp_df = (df[(df["def_has_cds"]==0)&(df["orf_has_cds"]==1)])
orf_tn_df = (df[(df["def_has_cds"]==0)&(df["orf_chain"]==df["def_chain"])])
orf_fn_df = (df[(df["def_has_cds"]==1)&~(df["orf_chain"]==df["def_chain"])])

td_tp_df = (df[(df["def_has_cds"]==1)&(df["td_chain"]==df["def_chain"])])
td_fp_df = (df[(df["def_has_cds"]==0)&(df["td_has_cds"]==1)])
td_tn_df = (df[(df["def_has_cds"]==0)&(df["td_chain"]==df["def_chain"])])
td_fn_df = (df[(df["def_has_cds"]==1)&~(df["td_chain"]==df["def_chain"])])

In [69]:
orf_tpr = len(orf_tp_df)/(len(orf_tp_df)+len(orf_fn_df))
print("TPR ORFanage: "+str(orf_tpr))

td_tpr = len(td_tp_df)/(len(td_tp_df)+len(td_fn_df))
print("TPR TransDecoder: "+str(td_tpr))

TPR ORFanage: 0.8769287742254043
TPR TransDecoder: 0.6506741280226035
