In [None]:
import pandas as pd
import numpy as np
import multiprocessing
import subprocess
import random
import pickle
import shutil
import glob
import math
import csv
import sys
import os
import re

from pybedtools import BedTool

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib_venn import venn3
# Notebook-wide matplotlib defaults: figure size, font family and font size
# used by every figure created after this cell runs.
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.size'] = 24
# IPython magic (not plain Python): selects the inline matplotlib backend so
# figures are rendered in the notebook output.
%matplotlib inline

# Names of the nine tab-separated columns of a GFF3 record, in file order.
gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [None]:
# ---- directory layout: everything below is built by prefixing base_dir ----
base_dir = "./"
out_dir = base_dir+"step3/"    # where this notebook writes its results
soft_dir = base_dir+"soft/"    # external tools and pipeline scripts

# CHESS annotation (located under step2/) that this notebook starts from
chess3_gtf_fname = base_dir+"step2/chess.gtf"

# ---- reference annotations ----
data_dir = base_dir+"data/"
gencode_gtf_fname = data_dir+"latest_gtfs/gencode.primary.gtf"
refseq_gtf_fname = data_dir+"latest_gtfs/refseq.primary.gtf"
mane_gtf_fname = data_dir+"latest_gtfs/MANE.primary.gtf"
refgen_gtf_fname = data_dir+"latest_gtfs/refgen.primary.gtf"
chess2_gtf_fname = data_dir+"latest_gtfs/chess2.gtf"
M1_gtf_fname = data_dir+"latest_gtfs/1M.primary.gtf"
M27_gtf_fname = data_dir+"latest_gtfs/27M.primary.gtf"

# ---- external tool executables ----
seeder_fname = soft_dir+"scripts/genomic_scripts/seeder/seeder"
reader_fname = soft_dir+"scripts/genomic_scripts/reader/reader"
adder_fname = soft_dir+"scripts/genomic_scripts/adder/adder"
orfanage_fname = soft_dir+"scripts/genomic_scripts/orfanage/orfanage"
ender_fname = soft_dir+"scripts/genomic_scripts/ender/ender"
# namer is invoked by the final cell of the notebook; without an assignment here
# that cell fails with NameError on a fresh kernel.
# NOTE(review): location assumed to follow the <tool>/<tool> layout used by the
# tools above - confirm the actual path of the namer executable.
namer_fname = soft_dir+"scripts/genomic_scripts/namer/namer"
ppp_bin = "phylocsf++"

# NOTE: these two are not prefixed with base_dir/data_dir, so they are resolved
# against the working directory the notebook is run from
ref_fasta_fname = "hg38_p12_ucsc.no_alts.no_fixs.fa"
ppp_track = "tracks/PhyloCSF+1.bw"

# thread count handed to the multi-threaded steps via their --threads option
num_threads = 40

In [None]:
# Create the output directory for this step. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of testing os.path.exists first;
# it still raises if out_dir exists but is not a directory.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# since gffcompare does not readily recognize single-exon transcripts we need to use a dedicated solution
# use coverage data from tiebrush to check whether a single-exon transcript is covered
# since alignment of single-exon transcripts does not really care for the strand - we disregard strandedness
# anything with coverage >= the "-p" value passed to seeder below is included
# NOTE(review): "-p 80" is presumably a percent-coverage threshold - confirm against seeder's usage

# os.path.splitext removes exactly the trailing ".gtf" extension; str.rstrip(".gtf") would
# instead strip any trailing run of the characters '.', 'g', 't', 'f' and can eat into the stem
out_gtf_fname = os.path.splitext(M27_gtf_fname)[0]+".seeder.gtf"
seeder_cmd = [seeder_fname,
             "-i",M27_gtf_fname,
             "-r",mane_gtf_fname,
             "-o",out_gtf_fname,
             "-p","80"]
print(" ".join(seeder_cmd))
subprocess.call(seeder_cmd)

# load all IDs already in chess
chess_tids = set()

with open(chess3_gtf_fname,"r") as inFP:
    for line in inFP:
        lcs = line.rstrip("\n").split("\t")
        if len(lcs) < 9: # header/comment/blank lines do not carry the 9 feature columns
            continue
        if lcs[2]=="transcript":
            tid = lcs[8].split("transcript_id \"",1)[1].split("\"",1)[0]
            chess_tids.add(tid)

# add missing transcripts to chess
seeder_out_gtf_fname = out_dir+os.path.splitext(os.path.basename(chess3_gtf_fname))[0]+".seeder.gtf"

# nothing is copied from the seeder output until a transcript record has been accepted;
# initialising here also prevents a NameError if the file does not start with a transcript
write_cur = False

with open(seeder_out_gtf_fname,"w") as outFP:
    with open(chess3_gtf_fname,"r") as inFP: # write previous version first
        for line in inFP:
            outFP.write(line)
    with open(out_gtf_fname,"r") as inFP:
        for line in inFP:
            lcs = line.rstrip("\n").split("\t")
            if len(lcs) < 9: # skip header/comment/blank lines
                continue
            if lcs[2]=="transcript":
                tid = lcs[8].split("transcript_id \"",1)[1].split("\"",1)[0]
                seeder_type = lcs[8].split("seeder_match \"",1)[1].split("\"",1)[0]
                # drop transcripts chess already has and those seeder marked with seeder_match "0";
                # the decision also applies to the records that follow, up to the next transcript
                write_cur = not (tid in chess_tids or seeder_type == "0")

            if write_cur and lcs[2] in ("exon","transcript"): # skip cds for now if present
                outFP.write(line)

In [None]:
# run orfanage and ppp to append CDSs
chess_gtf_fname = seeder_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
orfppp_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".cds.gtf"
orfppp_cmd = [soft_dir+"pipeline/run_orfanage_ppp.py",
             "--input",chess_gtf_fname,
             "--phylocsfpp",ppp_bin,
             "--threads",str(num_threads),
             "--ppp_idx",ppp_track,
             "--reference",ref_fasta_fname,
             "--orfanage",orfanage_fname,
             "--output",orfppp_out_gtf_fname,
             "--annotations",gencode_gtf_fname+","+refseq_gtf_fname+","+mane_gtf_fname]
print(" ".join(orfppp_cmd))
# The following cell reads orfppp_out_gtf_fname, so this step cannot stay disabled on a
# fresh run. Run it only when the output is missing; an existing result is reused as-is.
# NOTE(review): delete the output file to force a re-run after changing the inputs.
if not os.path.exists(orfppp_out_gtf_fname):
    subprocess.call(orfppp_cmd)

In [None]:
# search for CRS
chess_gtf_fname = orfppp_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
crs_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".crs.gtf"
crs_cmd = [soft_dir+"pipeline/find_crs.py",
             "--input",chess_gtf_fname,
             "--output",crs_out_gtf_fname,
             "--crs",data_dir+"crfs/Supplemental_Data_1__patch_2020-07-28_fix_hg38_coordinates.csv"]
print(" ".join(crs_cmd))
subprocess.call(crs_cmd)

In [None]:
# examine short exons
chess_gtf_fname = crs_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
short_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".short_label.gtf"
# each reference annotation is passed as LABEL:path, comma-separated
short_cmd = [soft_dir+"pipeline/examine_short_exons.py",
             "--input",chess_gtf_fname,
             "--output",short_out_gtf_fname,
             "--annotations","GENCODE:"+gencode_gtf_fname+",RefSeq:"+refseq_gtf_fname+",MANE:"+mane_gtf_fname]
print(" ".join(short_cmd))
subprocess.call(short_cmd)

In [None]:
# fix ends of transcripts
chess_gtf_fname = short_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
ender_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".ender.gtf"
# NOTE(review): meaning of the -s/-t/-m options is defined by the ender tool - see its usage
ender_cmd = [ender_fname,
             "-s","-t","100","-m","100",
             "-i",chess_gtf_fname,
             "-r",mane_gtf_fname+","+refseq_gtf_fname+","+gencode_gtf_fname,
             "-o",ender_out_gtf_fname]

print(" ".join(ender_cmd))
subprocess.call(ender_cmd)

In [None]:
# replace chromosome M
chess_gtf_fname = ender_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
chrM_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".chrM.gtf"
chrM_cmd = [soft_dir+"pipeline/replace_chrM.py",
             "--input",chess_gtf_fname,
             "--output",chrM_out_gtf_fname,
             "--annotation",gencode_gtf_fname]
print(" ".join(chrM_cmd))
subprocess.call(chrM_cmd)

In [None]:
# add missing transcripts from MANE
chess_gtf_fname = chrM_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
adder_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".adder.gtf"
# the reference is passed as LABEL:path
adder_cmd = [adder_fname,
             "-i",chess_gtf_fname,
             "-r","MANE:"+mane_gtf_fname,
             "-o",adder_out_gtf_fname]

print(" ".join(adder_cmd))
subprocess.call(adder_cmd)

In [None]:
# filter readthroughs
chess_gtf_fname = adder_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
reader_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".reader.gtf"
# each reference annotation is passed as LABEL:path, comma-separated;
# the reader executable itself is handed to the wrapper script via --reader
reader_cmd = [soft_dir+"pipeline/filter_readthrough.py",
             "--input",chess_gtf_fname,
             "--annotations","RefGEN:"+refgen_gtf_fname+",MANE:"+mane_gtf_fname+",RefSeq:"+refseq_gtf_fname,
             "--output",reader_out_gtf_fname,
             "--reader",reader_fname]

print(" ".join(reader_cmd))
subprocess.call(reader_cmd)

In [None]:
# assign gene and transcript IDs with namer
chess_gtf_fname = reader_out_gtf_fname
# os.path.splitext removes exactly the ".gtf" extension; str.rstrip(".gtf") strips a
# character set ('.', 'g', 't', 'f') and could truncate the file stem
namer_out_gtf_fname = os.path.splitext(chess_gtf_fname)[0]+".namer.gtf"
# NOTE(review): namer_fname is not assigned in this cell; it has to be defined together
# with the other tool paths before this cell runs, otherwise this raises NameError
namer_cmd = [namer_fname,
             "-i",chess_gtf_fname,
             "-r",chess2_gtf_fname,
             "-o",namer_out_gtf_fname]

print(" ".join(namer_cmd))
subprocess.call(namer_cmd)