In [2]:
import os
import sys
import time
import glob
import shutil
import argparse
import subprocess
import numpy as np
import pandas as pd

In [3]:
# Prefix prepended (by plain string concatenation) to the pipeline paths
# built in the cells below, hence the trailing slash.
base_dir = "./"

# Input files: reference genome FASTA, the annotated TB GTF, and the
# RefSeq/GENCODE intersection annotation.
genome_fa = "hg38_p12_ucsc.no_alts.no_fixs.fa"
tb_filtered_gtf_fname = "ALL_TBref.annotated.gtf"
ref_gen_intersection_gff_fname = "refseq_vs_gencode_intersection.prot_lnc.no_alts.no_fixs.clean.gff"

# Values passed to process_assembly.py as --threads / --num_samples_per_call.
num_threads, num_samples_per_call = 10, 20

In [None]:
# Step 1: build the process_assembly.py command line from (flag, value)
# pairs, echo it, and run it. The call is the last expression of the cell so
# the notebook displays the exit status.
step1_options = [
    ("--input", base_dir+"data/GTEx_crams.csv"),
    ("--outdir", base_dir+"step1"),
    ("--threads", str(num_threads)),
    ("--num_samples_per_call", str(num_samples_per_call)),
    ("--ALL_tracking", base_dir+"data/ALL.combined.tracking"),
    ("--ALL_gtf", base_dir+"data/ALL.combined.gtf"),
    ("--tiecov", base_dir+"soft/scripts/tiecov_sjs/tiecov_sjs"),
    ("--assembly_stats", base_dir+"soft/scripts/assembly_stats/assembly_stats"),
    ("--tissue_trackings", base_dir+"data/tissues.lst"),
    ("--tissue_gtfs", base_dir+"data/tissues.gtfs"),
    # the FASTA index (.fai), not the FASTA itself, is what gets passed here
    ("--reference", genome_fa+".fai"),
]

step1_cmd = [base_dir+"soft/pipeline/process_assembly.py"]
for flag, value in step1_options:
    step1_cmd.extend((flag, value))

print(" ".join(step1_cmd))
subprocess.call(step1_cmd)

In [29]:
# Obtain a version of the TB filtered file with only "=" transcripts.
#
# A transcript record is kept when its class_code attribute is "="; every
# non-transcript record that follows a kept transcript (up to the next
# transcript record) is kept with it. Comment lines ("#...") are copied
# through unchanged; blank lines and records with fewer than the nine
# tab-separated GTF columns are dropped instead of raising IndexError.
with open(base_dir+"data/tb.filtered.eq.gtf","w") as outFP:
    with open(tb_filtered_gtf_fname,"r") as inFP:
        # ID of the transcript currently being copied, or None while the
        # records of a rejected transcript are being skipped
        cur_tid = None
        for line in inFP:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            if line.startswith("#"):
                outFP.write(line+"\n")
                continue

            cols = line.split("\t")
            if len(cols)<9:
                continue

            if cols[2]=="transcript":
                attrs = cols[8]
                code_marker = "class_code \""
                code_pos = attrs.find(code_marker)
                # a transcript without a class_code attribute cannot be "="
                code = None if code_pos==-1 else attrs[code_pos+len(code_marker):].split("\"",1)[0]
                if code=="=":
                    cur_tid = attrs.split("transcript_id \"",1)[1].split("\"",1)[0]
                    outFP.write(line+"\n")
                else:
                    cur_tid = None
            elif cur_tid is not None:
                outFP.write(line+"\n")

In [4]:
# write a setup file for fetching annotations and fetch
#
# Each entry becomes one "<label>,<location>" line of the setup file; the
# location is either a download URL or a local path.
annotation_sources = [
    ("chess2","http://ccb.jhu.edu/chess/data/chess2.2_assembly.gtf.gz"),
    ("refgen",ref_gen_intersection_gff_fname),
    ("refseq","https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz"),
    ("gencode","http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz"),
    ("1M",base_dir+"data/tb.filtered.eq.gtf"),
    ("27M",base_dir+"data/ALL.combined.gtf"),
    ("MANE","https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.ensembl_genomic.gtf.gz"),
    ("noncode","http://www.noncode.org/datadownload/NONCODEv5_human_hg38_lncRNA.gtf.gz"),
    ("ensembl","ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz"),
    ("lncpedia","https://lncipedia.org/downloads/lncipedia_5_2/full-database/lncipedia_5_2_hg38.gff"),
]

setup_fetch_fname = base_dir+"data/setup.fetch_annotations"
with open(setup_fetch_fname,"w+") as outFP:
    outFP.writelines(label+","+location+"\n" for label,location in annotation_sources)

# Command for the fetch script; the call itself is left disabled below.
fetch_cmd = [base_dir+"soft/scripts/genomic_scripts/fetch_annotations/fetch_annotations.py",
             "-s",setup_fetch_fname,
             "-o",base_dir+"data/latest_gtfs/"]
# subprocess.call(fetch_cmd)

In [None]:
## for each also create a version on the same scaffolds as the reference genome used in alignment and assembly

# Sequence names of the reference: first tab-separated column of the .fai index
ref_seqids = set()
with open(genome_fa+".fai","r") as inFP:
    for line in inFP:
        lcs = line.rstrip().split("\t")
        seqid = lcs[0]
        ref_seqids.add(seqid)

# One "<label>.gtf" per line of the fetch setup file; only files that exist
# in data/latest_gtfs/ are processed.
gtf_fnames = list()
with open(base_dir+"data/setup.fetch_annotations","r") as setupFP:
    for line in setupFP:
        tmp = base_dir+"data/latest_gtfs/"+line.split(",")[0]+".gtf"
        if os.path.exists(tmp):
            gtf_fnames.append(tmp)

for fname in gtf_fnames:
    # Drop the ".gtf" extension with splitext: str.rstrip(".gtf") strips any
    # trailing run of the characters ".", "g", "t", "f" and would also eat
    # the end of a label such as "egg".
    out_fname = os.path.splitext(fname)[0]+".primary.gtf"
    print(fname,out_fname)
    with open(out_fname,"w+") as outFP, open(fname,"r") as inFP:
        for line in inFP:
            if line.startswith("#"):
                # comment lines are always kept, and written exactly once
                outFP.write(line)
                continue
            seqid = line.split("\t")[0]
            if seqid in ref_seqids:
                outFP.write(line)

In [37]:
# all v all to find everything known from TB set

gcm_1M_dir = base_dir+"data/latest_gtfs/gffcmp_multi.1M/"
if not os.path.exists(gcm_1M_dir):
    os.makedirs(gcm_1M_dir)

# Setup file: one "<label>,<path to that label's .primary.gtf>" line per input
setup_fname = gcm_1M_dir+"gffcmp_multi.1M.setup.csv"
with open(setup_fname,"w+") as setup_fp:
    for label in ("1M","gencode","refseq","refgen","MANE"):
        setup_fp.write(label+","+base_dir+"data/latest_gtfs/"+label+".primary.gtf\n")

# Run gffcmp_multi.py on the setup file; the call is the last expression of
# the cell so the notebook displays the exit status.
gcm_cmd = [base_dir+"soft/scripts/genomic_scripts/gffcmp_multi/gffcmp_multi.py",
           "-s",setup_fname,
           "-o",gcm_1M_dir+"gffcmp_multi.1M",
           "--keep-tmp"]
subprocess.call(gcm_cmd)

0

In [None]:
# all v all to find everything known from original set

gcm_27M_dir = base_dir+"data/latest_gtfs/gffcmp_multi.27M/"
if not os.path.exists(gcm_27M_dir):
    os.makedirs(gcm_27M_dir)

# Setup file: one "<label>,<path to that label's .primary.gtf>" line per input.
# "refgen" is currently left out of this comparison.
setup_fname = gcm_27M_dir+"gffcmp_multi.27M.setup.csv"
with open(setup_fname,"w+") as setup_fp:
    for label in ("27M","gencode","refseq","MANE"):
        setup_fp.write(label+","+base_dir+"data/latest_gtfs/"+label+".primary.gtf\n")

# Run gffcmp_multi.py on the setup file; the call is the last expression of
# the cell so the notebook displays the exit status.
gcm_cmd = [base_dir+"soft/scripts/genomic_scripts/gffcmp_multi/gffcmp_multi.py",
           "-s",setup_fname,
           "-o",gcm_27M_dir+"gffcmp_multi.27M",
           "--keep-tmp"]
subprocess.call(gcm_cmd)

In [14]:
# build a known set across all tissues using refseq and gencode only

def get_tids(fname,class_code="="):
    """Group transcript IDs of a GTF by their `cmp_ref` attribute.

    Only `transcript` records whose `class_code` attribute equals
    `class_code` are considered. Comment lines, blank lines, records with
    fewer than nine tab-separated columns, and transcripts that lack a
    `class_code`, `cmp_ref` or `transcript_id` attribute are skipped
    rather than raising IndexError.

    Parameters
    ----------
    fname : str
        Path of the GTF file to read.
    class_code : str, optional
        Class code a transcript must carry to be collected (default "=").

    Returns
    -------
    dict
        Maps each `cmp_ref` value to a tuple `(ids, names)`: `ids` is the
        list of `transcript_id` values of the matching records in file
        order, `names` is the set of their `gene_name` values (empty when
        none of the records has that attribute).
    """
    def _attr(attrs,key):
        # Value of the first `key "<value>"` occurrence in the attribute
        # column, or None when the attribute is absent.
        marker = key+" \""
        start = attrs.find(marker)
        if start==-1:
            return None
        return attrs[start+len(marker):].split("\"",1)[0]

    tids = dict()
    with open(fname,"r") as inFP:
        for line in inFP:
            if line.startswith("#"):
                continue

            cols = line.split("\t")
            if len(cols)<9 or cols[2]!="transcript":
                continue

            attrs = cols[8]
            code = _attr(attrs,"class_code")
            if code is None or code!=class_code:
                continue

            tid = _attr(attrs,"cmp_ref")
            known_id = _attr(attrs,"transcript_id")
            if tid is None or known_id is None:
                # nothing to key the record on (or nothing to store for it)
                continue

            entry = tids.setdefault(tid,(list(),set()))
            entry[0].append(known_id)
            known_name = _attr(attrs,"gene_name")
            if known_name is not None:
                entry[1].add(known_name)

    return tids

# (annotated GTF produced by gffcmp_multi, label used for it in the output attributes)
fnames = [(base_dir+"data/latest_gtfs/gffcmp_multi.27M/gffcmp_multi.27M27M_gencode.annotated.gtf","GENCODE"),
          (base_dir+"data/latest_gtfs/gffcmp_multi.27M/gffcmp_multi.27M27M_refseq.annotated.gtf","RefSeq")]

# same_tids: key returned by get_tids -> {label: (set of matched IDs, set of gene names)}
same_tids = dict()

for fname,name in fnames:
    tmp = get_tids(fname)
    for tid,kids in tmp.items():
        same_tids.setdefault(tid,dict())[name] = (set(kids[0]),set(kids[1]))
    # progress: file name, keys found in this file, keys accumulated so far
    print(fname.split("/")[-1],len(tmp),len(same_tids))


# Copy from 27M.primary.gtf every transcript whose transcript_id is a key of
# same_tids, together with its exon/CDS records, and append to the transcript
# line: source "<labels>", <label>_ID "<ids>" and, when any are known,
# <label>_gene_name "<names>".
with open(base_dir+"data/latest_gtfs/gffcmp_multi.27M/27M.known.refseq_gencode.gtf","w") as outFP:
    write_exons = False
    with open(base_dir+"data/latest_gtfs/27M.primary.gtf","r") as inFP:
        for line in inFP:
            if line.startswith("#"):
                outFP.write(line)
                continue

            cols = line.split("\t")
            if len(cols)<9:
                # blank or truncated line: not a GTF record, nothing to copy
                continue

            if cols[2]=="transcript":
                tid = cols[8].split("transcript_id \"",1)[1].split("\"",1)[0]
                known = same_tids.get(tid)
                if known is None:
                    write_exons=False
                    continue

                line = line.rstrip("\n").rstrip(";")+";"
                # labels appear in the order of `fnames` (dict insertion order)
                line+=" source \""+",".join(known)+"\";"
                for n,v in known.items():
                    # v holds sets; sort them so the written file is identical
                    # from run to run instead of following set iteration order
                    line+=" "+n+"_ID \""+",".join(sorted(v[0]))+"\";"
                    if v[1]:
                        line+=" "+n+"_gene_name \""+",".join(sorted(v[1]))+"\";"
                outFP.write(line+"\n")
                write_exons=True
            elif cols[2] in ("exon","CDS") and write_exons:
                outFP.write(line)

gffcmp_multi.27M27M_gencode.annotated.gtf 142241 142241
gffcmp_multi.27M27M_refseq.annotated.gtf 132677 221995
