In [3]:
# main imports
import os
import sys
import ast
import copy
import glob
import math
import shutil
import random
import importlib
import subprocess

from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats

import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import matplotlib.pylab as pylab
import upsetplot
import seaborn as sns

plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.size'] = 24
%matplotlib inline

pd.set_option('display.max_columns', None)

In [4]:
%load_ext autoreload
%autoreload 1

sys.path.insert(0, "") # directory to where definitions.py is
%aimport definitions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# paths

base_dir = "" # base directory for the analysis

orfanage_bin = "orfanage" # path to the orfanage binary
gffcompare_bin = "" # path to the gffcompare binary
gffread_bin = "" # path to the gffread binary
igvtools_bin = "" # path to the igvtools binary
sashimi_bin = "" # path to the sashimi.py binary (from the TieBrush package)
gmst_bin = "gmst.pl" # path to the GeneMarkS-T binary

fa_fname = "hg38.fa" # path to the reference genome fasta file
gtf_fname = "refseq.gtf" # path to the reference genome gtf file
mane_gtf_fname = "MANE.v10.gtf" # path to the MANE gtf file

outdir = base_dir+"gmst_refseq/"
if not os.path.exists(outdir):
    os.makedirs(outdir)

In [6]:
# arguments
num_threads = 30

In [7]:
gtf_adjstop_fname = gtf_fname.rsplit(".",1)[0]+".adjstop.gtf"
gtf_adjstop_sorted_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".sorted.gtf"
gtf_adjstop_aa_fa_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".aa.fa"

clean_gtf_fname = gtf_adjstop_fname.rsplit(".",1)[0]+".clean.gtf"
nocds_gtf_fname = clean_gtf_fname.rsplit(".",1)[0]+".nocds.gtf"
nocds_gff_fname = nocds_gtf_fname.rsplit(".",1)[0]+".gff3"
nocds_fa_fname = nocds_gtf_fname.rsplit(".",1)[0]+".fa"

out_gtf_fname = outdir+"orf.gtf"
out_stats_fname = outdir+"orf.stats"
out_gtf_sorted_fname = outdir.rsplit(".",1)[0]+".sorted.gtf"

out_df_tsv_fname = outdir+"df.tsv"

In [None]:
%%time
# run orfanage with mane as the reference
cmd = [orfanage_bin,
       "--reference",fa_fname,
       "--query",nocds_gtf_fname,
       "--threads",str(num_threads),
       "--output",out_gtf_fname,
       "--stats",out_stats_fname,
       mane_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

In [None]:
%%time
# run orfanage with mane as the reference
cmd = [orfanage_bin,
       "--reference",fa_fname,
       "--query",nocds_gtf_fname,
       "--output",out_gtf_fname,
       "--stats",out_stats_fname,
       mane_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

In [None]:
cmd = [gffread_bin,"-T","-o",out_gtf_fname.split(".gtf")[0]+".gffread.gtf",out_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)
igv_cmd = [igvtools_bin,"sort",out_gtf_fname.split(".gtf")[0]+".gffread.gtf",out_gtf_fname.split(".gtf")[0]+".sorted.gtf"]
print(" ".join(igv_cmd))
subprocess.call(igv_cmd)
igv_cmd = [igvtools_bin,"index",out_gtf_fname.split(".gtf")[0]+".sorted.gtf"]
print(" ".join(igv_cmd))
subprocess.call(igv_cmd)

In [None]:
# extract nt fasta with gffread
cmd = [gffread_bin,
       "-g",fa_fname,
       "-w",outdir+"tx.nt.fasta",
       nocds_gtf_fname]
print(" ".join(cmd))
subprocess.call(cmd)

In [None]:
%%time
cmd = [gmst_bin,"--strand","direct",
                "--output",outdir+"gmst.res",
                "--format","GFF",
                outdir+"tx.nt.fasta"]
print(" ".join(cmd))
subprocess.call(cmd)

In [12]:
chains = definitions.get_chains(nocds_gtf_fname,"exon",True)
chains.head()

Unnamed: 0,tid,has_cds,seqid,strand,coords,chain
0,rna-NR_046018.2,1,chr1,+,chr1:11874-14409,"[(11874, 12227), (12613, 12721), (13221, 14409)]"
1,rna-NR_024540.1,1,chr1,-,chr1:14362-29370,"[(14362, 14829), (14970, 15038), (15796, 15947..."
2,rna-NR_106918.1,1,chr1,-,chr1:17369-17436,"[(17369, 17436)]"
3,rna-MIR6859-1,1,chr1,-,chr1:17369-17391,"[(17369, 17391)]"
4,rna-MIR6859-1-2,1,chr1,-,chr1:17409-17431,"[(17409, 17431)]"


In [None]:
# need to convert to genomic coordinates now

orfs = dict()

count = 0
with open(outdir+"gmst.res","r") as inFP:
    for line in inFP:
        if line[0]=="#":
            continue
        if len(line.strip())==0:
            continue
        
        lcs = line.strip().split("\t")
        if not lcs[2]=="CDS":
            print(lcs[2])
            
        tstart = int(lcs[3])-1
        tend = int(lcs[4])-1
        tid = lcs[0]
        
        chain = chains[chains["tid"]==tid].iloc[0].chain
        strand = chains[chains["tid"]==tid].iloc[0].strand
        
        gstart = definitions.trans2genome(chain,strand,tstart)
        gend = definitions.trans2genome(chain,strand,tend)
        
        orf = definitions.cut_chain(chain,min(gstart,gend),max(gstart,gend))
        orfs.setdefault(tid,list())
        orfs[tid].append(orf)
        
        
        count+=1
        if count%10000==0:
            print(count)

In [None]:
# now we just need ot output results into a GTF
multi_orfs = set()
with open(outdir+"gmst.clean.genome.gtf","w+") as outFP:
    with open(nocds_gtf_fname,"r") as inFP:
        for line in inFP:
                
            lcs = line.strip().split('\t')
            tid = lcs[8].split("transcript_id \"", 1)[1].split("\"", 1)[0]
            # if not tid in orfs:
            #     continue
            
            if lcs[2] in ["transcript","exon"]:
                outFP.write(line)
            
            if lcs[2] == "transcript":
                if not tid in orfs:
                    continue
                if len(orfs[tid])>1:
                    print("multiple orfs: "+tid)
                    multi_orfs.add(tid)
                    continue
                    
                cds_lcs = copy.deepcopy(lcs)
                cds_lcs[2] = "CDS"
                cds_lcs[8] = "transcript_id \""+tid+"\";"
                
                for orf in orfs[tid]:
                    for cs,ce in orf:
                        cds_lcs[3] = str(int(cs))
                        cds_lcs[4] = str(int(ce))
                        outFP.write("\t".join(cds_lcs)+"\n")

In [None]:
cmd = [gffread_bin,"-T","-g",fa_fname,"-o",outdir+"gmst.gffread.gtf",outdir+"gmst.clean.genome.gtf"]
print(" ".join(cmd))
subprocess.call(cmd)
igv_cmd = [igvtools_bin,"sort",outdir+"gmst.gffread.gtf",outdir+"gmst.sorted.gtf"]
print(" ".join(igv_cmd))
subprocess.call(igv_cmd)
igv_cmd = [igvtools_bin,"index",outdir+"gmst.sorted.gtf"]
print(" ".join(igv_cmd))
subprocess.call(igv_cmd)

In [37]:
# load chains
gmst_df = definitions.get_chains(outdir+"gmst.sorted.gtf","CDS",True)
# extract length from the chain
gmst_df["clen"] = gmst_df.apply(lambda row: definitions.clen(row["chain"]),axis=1)
# sort and remove duplicates keeping only the longest ORF
gmst_df.sort_values(by=["tid","clen"],ascending=False,inplace=True)
print(len(gmst_df))

# remove junk and rename
gmst_df = gmst_df[["tid","has_cds","chain"]]
gmst_df.columns = ["tid","gmst_has_cds","gmst_chain"]


# load original chains
def_df = definitions.get_chains(clean_gtf_fname,"CDS",True)
# add gene ids
def_attr = definitions.get_attribute(clean_gtf_fname,"gene_id")
def_attr.columns = ["tid","gid"]
def_df = def_df.merge(def_attr,on="tid",how="left")
def_df = def_df[["tid","gid","has_cds","chain"]]
def_df.columns = ["tid","gid","def_has_cds","def_chain"]


# load orfanage results
orf_df = definitions.get_chains(out_gtf_fname,"CDS",True)
orf_df = orf_df[["tid","has_cds","chain"]]
orf_df.columns = ["tid","orf_has_cds","orf_chain"]


# load mane
mane_df = definitions.get_chains(mane_gtf_fname,"CDS",True)
# add gene ids
mane_attr = definitions.get_attribute(mane_gtf_fname,"gene_id")
mane_attr.columns = ["tid","gid"]
mane_df = mane_df.merge(def_attr,on="tid",how="left")
mane_df = mane_df[["tid","gid","chain","strand","coords"]]
mane_df.columns = ["mane_tid","gid","mane_chain","strand","coords"]


# merge all dataframes into one
df = def_df.merge(mane_df,on="gid",how="left")
df = df[df["gid"].isin(set(mane_df["gid"]))].reset_index(drop=True)
df = df.merge(gmst_df,on="tid",how="left")
df = df.merge(orf_df,on="tid",how="left")
df["gmst_has_cds"] = df["gmst_has_cds"].fillna(0)
df["gmst_chain"] = df.apply(lambda row: list() if row["gmst_chain"]!=row["gmst_chain"] else row["gmst_chain"],axis=1)

# no_cds_df = df[df["def_has_cds"]==1].reset_index(drop=True)
# df = df[df["def_has_cds"]==1].reset_index(drop=True)

all_same = df[(df["gmst_chain"]==df["orf_chain"])&\
              (df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

orf_eq_def_ne_gmst = df[~(df["gmst_chain"]==df["orf_chain"])&\
                       (df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

gmst_eq_def_ne_orf = df[~(df["gmst_chain"]==df["orf_chain"])&\
                       (df["gmst_chain"]==df["def_chain"])].reset_index(drop=True)

ad = df[~(df["gmst_chain"]==df["orf_chain"])&\
              ~(df["gmst_chain"]==df["def_chain"])&\
              ~(df["orf_chain"]==df["def_chain"])].reset_index(drop=True)

print("total number of transcripts: "+str(len(df)))
print("all equal: "+str(len(all_same)))
print("ORFanage==RefSeq!=GeneMarkS-T: "+str(len(orf_eq_def_ne_gmst)))
print("Of those, these many are coding in GeneMarkS-T: "+str(len(orf_eq_def_ne_gmst[orf_eq_def_ne_gmst["gmst_has_cds"]==1])))
print("Of those, these many are coding in both RefSeq and ORFanage: "+str(len(orf_eq_def_ne_gmst[orf_eq_def_ne_gmst["orf_has_cds"]==1])))
print("Of those, these many are non-coding in both RefSeq and ORFanage: "+str(len(orf_eq_def_ne_gmst[orf_eq_def_ne_gmst["orf_has_cds"]==0])))
print("ORFanage!=RefSeq==GeneMarkS-T: "+str(len(gmst_eq_def_ne_orf)))
print("Of those, these many are coding in both RefSeq and GeneMarkS-T: "+str(len(gmst_eq_def_ne_orf[gmst_eq_def_ne_orf["gmst_has_cds"]==1])))
print("Of those, these many are non-coding in both RefSeq and GeneMarkS-T: "+str(len(gmst_eq_def_ne_orf[gmst_eq_def_ne_orf["gmst_has_cds"]==0])))

print("All different: "+str(len(ad)))

179272
total number of transcripts: 135751
all equal: 84645
ORFanage==RefSeq!=GeneMarkS-T: 33088
Of those, these many are coding in GeneMarkS-T: 31950
Of those, these many are coding in both RefSeq and ORFanage: 32910
Of those, these many are non-coding in both RefSeq and ORFanage: 178
ORFanage!=RefSeq==GeneMarkS-T: 5012
Of those, these many are coding in both RefSeq and GeneMarkS-T: 4297
Of those, these many are non-coding in both RefSeq and GeneMarkS-T: 715
All different: 5751


In [38]:
orf_tp_df = (df[(df["def_has_cds"]==1)&(df["orf_chain"]==df["def_chain"])])
orf_fp_df = (df[(df["def_has_cds"]==0)&(df["orf_has_cds"]==1)])
orf_tn_df = (df[(df["def_has_cds"]==0)&(df["orf_chain"]==df["def_chain"])])
orf_fn_df = (df[(df["def_has_cds"]==1)&~(df["orf_chain"]==df["def_chain"])])

gmst_tp_df = (df[(df["def_has_cds"]==1)&(df["gmst_chain"]==df["def_chain"])])
gmst_fp_df = (df[(df["def_has_cds"]==0)&(df["gmst_has_cds"]==1)])
gmst_tn_df = (df[(df["def_has_cds"]==0)&(df["gmst_chain"]==df["def_chain"])])
gmst_fn_df = (df[(df["def_has_cds"]==1)&~(df["gmst_chain"]==df["def_chain"])])

In [39]:
orf_tpr = len(orf_tp_df)/(len(orf_tp_df)+len(orf_fn_df))
print("TPR ORFanage: "+str(orf_tpr))

gmst_tpr = len(gmst_tp_df)/(len(gmst_tp_df)+len(gmst_fn_df))
print("TPR GeneMarkS-T: "+str(gmst_tpr))

TPR ORFanage: 0.9394080185845316
TPR GeneMarkS-T: 0.7102014659350342
