In [1]:
import pandas as pd
import numpy as np
import subprocess
import random
from scipy import stats
import pickle
import glob
import math
import csv
import sys
import os
import re

import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.size'] = 24
%matplotlib inline

In [3]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

# autoreload mode 1: before each cell runs, reload only the modules that were
# registered with %aimport.
%load_ext autoreload
%autoreload 1

# Make the local ./soft directory importable and register its `definitions`
# module for automatic reloading.
sys.path.insert(0, "./soft")
%aimport definitions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Root of the working tree; the data directories below are derived from it.
base_dir = "./"
data_dir = f"{base_dir}data/"
chess3_data_dir = f"{data_dir}chess3/"

# Names of the nine tab-separated columns of a GFF3/GTF record, in file order.
gff3cols = "seqid source type start end score strand phase attributes".split()

In [5]:
# CHESS 3 on GRCh38: 3.0 is the input, 3.1 names are the files written by this notebook.
chess30_gtf_fname = f"{chess3_data_dir}chess3.0.gtf"
chess31_gtf_fname = f"{chess3_data_dir}chess3.1.gtf"
chess31_gff_fname = f"{chess3_data_dir}chess3.1.gff"

# Same set of files for the CHM13 annotation.
chess30_chm13_gtf_fname = f"{chess3_data_dir}chess3.0.CHM13.gtf"
chess31_chm13_gtf_fname = f"{chess3_data_dir}chess3.1.CHM13.gtf"
chess31_chm13_gff_fname = f"{chess3_data_dir}chess3.1.CHM13.gff"

# Other annotations kept directly under the data directory.
chess2_gtf_fname = f"{data_dir}chess2.2.gtf"
all_gtf_fname = f"{data_dir}all.gtf"
tb_gtf_fname = f"{data_dir}assembled.gtf"
refseq_gtf_fname = f"{data_dir}refseq.gtf" # 110 - same as preprint
refseq_gff_fname = f"{data_dir}refseq.gff"
gencode_gtf_fname = f"{data_dir}gencode.gtf" # 41 - same as preprint

# Mapping file located next to the notebook (not under data_dir).
chess2_chess3_mappings_fname = "./chess2_and_chess3.mappings"

In [24]:
# get latest release from https://github.com/chess-genome/chess/archive/refs/tags/v.3.0.zip

In [25]:
def extract_attributes(attribute_str:str,gff=False)->dict: # extract attribute key values into dictionary
    """Parse the attribute column of a GTF (default) or GFF3 record into a dict.

    Parameters
    ----------
    attribute_str : the raw 9th column, e.g. ``gene_id "G1"; transcript_id "T1";``
        (GTF) or ``ID=T1;Parent=G1`` (GFF3). A trailing newline/";" is tolerated.
    gff : when True the key/value separator is "=" (GFF3); otherwise it is the
        space + opening quote used by GTF.

    Returns
    -------
    dict mapping each key to its value (quotes removed). When a key occurs more
    than once the first value is kept.

    Raises
    ------
    ValueError if a non-empty attribute has no key/value separator.
    """
    sep = "=" if gff else " \""
    attrs_dict = dict()
    for at in attribute_str.rstrip().rstrip(";").split(";"):
        at = at.strip().strip("\"")
        if not at: # empty attribute column or a stray ";;"
            continue
        # Split on the FIRST separator only so that values which themselves
        # contain the separator (e.g. Note=a=b) do not break the parse.
        k,found,v = at.partition(sep)
        if not found:
            raise ValueError("malformed attribute (no key/value separator): "+at)
        attrs_dict.setdefault(k,v)

    return attrs_dict

def rename_attributes(attrs:dict)->dict: # renames attributes
    """Return a copy of `attrs` with legacy attribute names replaced.

    "max_tpm" becomes "max_TPM" and "num_samples" becomes "sample_count";
    every other key is carried over untouched. The input dict is not modified.
    """
    new_names = {"max_tpm":"max_TPM",
                 "num_samples":"sample_count"}
    return {new_names.get(key,key): value for key,value in attrs.items()}

def convert_source(source: str,db_xref: bool, is_mane: bool)->str: # take line in and replace source according to hard-coded dictionary
    """Map the original source column of a transcript to its release label.

    Precedence: MANE transcripts are always "MANE"; transcripts without a
    db_xref attribute are always "CHESS"; everything else is looked up in a
    fixed table of known source names.

    Raises AssertionError ("unknown source: ...") when the table has no entry
    for `source`.
    """
    if is_mane:
        return "MANE"

    if not db_xref: # anything without "db_xref" needs to be assigned CHESS type
        return "CHESS"

    # Group the known source names by the label they collapse to.
    by_label = {"RefSeq":  ("BestRefSeq","Curated Genomic","RefSeq","Gnomon","tRNAscan-SE","cmsearch"),
                "GENCODE": ("ENSEMBL","HAVANA","ensembl","ensembl_havana","havana"),
                "CHESS":   ("FANTOM","StringTie"),
                "Liftoff": ("Liftoff",)}
    kvs = dict()
    for label,names in by_label.items():
        for name in names:
            kvs[name] = label

    assert source in kvs,"unknown source: "+source
    return kvs[source]

def get_source_conversion(in_gtf_fname: str) -> dict:
    """Build a {transcript_id: converted source} map from a GTF file.

    Only "transcript" records with exactly nine tab-separated columns are
    used; comment lines and all other feature types are skipped. A transcript
    counts as MANE when its "tag" attribute contains "mane" (case-insensitive).

    Parameters
    ----------
    in_gtf_fname : path of the GTF file to scan.

    Returns
    -------
    dict mapping each transcript_id to the value returned by convert_source.
    """
    res = dict()
    with open(in_gtf_fname,"r") as inFP:
        for line in inFP:
            # This function only reads: comment lines are skipped (the
            # previous version tried to write them to an undefined outFP).
            if line.startswith("#"):
                continue

            lcs = line.split("\t")
            if len(lcs) != 9 or lcs[2] != "transcript":
                continue

            attrs = rename_attributes(extract_attributes(lcs[8]))

            # convert source
            is_mane = "mane" in attrs.get("tag","").lower()
            res[attrs["transcript_id"]] = convert_source(lcs[1],"db_xref" in attrs,is_mane)

    return res

def to_attribute_string(attrs:dict,gff=False,feature_type=None)->str: # converts attribute key values back into string
    """Serialise an attribute dict into a GTF (default) or GFF3 attribute column.

    Keys listed in `order` are written first, in that order; all remaining
    keys follow alphabetically (as sorted by Python) with their names
    lower-cased.

    In GFF3 mode the GTF identifiers are translated into the ID/Parent scheme:
      * gene:       gene_id -> ID,       transcript_id dropped
      * transcript: transcript_id -> ID, gene_id -> Parent
      * exon/CDS:   transcript_id -> Parent, gene_id dropped

    Parameters
    ----------
    attrs : attribute name -> string value.
    gff : write GFF3 ("k=v;k=v") instead of GTF ('k "v"; k "v";').
    feature_type : record type; required (gene/transcript/exon/CDS) when gff is True.

    Raises
    ------
    AssertionError in GFF3 mode for an unsupported feature_type or a value
    containing ";".
    """
    order = ["ID","Parent","transcript_id","gene_id","gene_name","gene_type","db_xref","description","max_TPM","sample_count","assembly_id","tag"]
    res = ""
    sep = " "
    quote = "\""
    end = "; "
    if gff:
        assert feature_type in ["gene","transcript","exon","CDS"],"wrong type: "+str(feature_type)
        sep = "="
        quote = ""
        end = ";"

    for k in order:
        if k not in attrs:
            continue

        if gff:
            # ";" separates GFF3 attributes and cannot appear inside a value
            assert ";" not in attrs[k],"invalid character in attribute: "+attrs[k]

        if gff and feature_type=="gene" and k=="transcript_id":
            continue
        elif gff and feature_type=="gene" and k=="gene_id":
            res+="ID="+quote+attrs[k]+quote+end
        elif gff and feature_type=="transcript" and k=="transcript_id":
            res+="ID="+quote+attrs[k]+quote+end
        elif gff and feature_type=="transcript" and k=="gene_id":
            res+="Parent="+quote+attrs[k]+quote+end
        elif gff and feature_type in ["exon","CDS"] and k=="transcript_id":
            res+="Parent="+quote+attrs[k]+quote+end
        elif gff and feature_type in ["exon","CDS"] and k=="gene_id":
            continue
        else:
            res+=k+sep+quote+attrs[k]+quote+end

    # add any other attributes in sorted order
    for k in sorted(list(attrs)):
        if k not in order:
            if gff:
                assert ";" not in attrs[k],"invalid character in attribute: "+attrs[k]
            res+=k.lower()+sep+quote+attrs[k]+quote+end

    # GTF keeps the final ";" (only the trailing space goes); GFF3 drops it
    if not gff:
        res = res.rstrip()
    if gff:
        res = res.rstrip(";")
    return res
    
def round_tpm(tpm_str:str)->str:
    """Round a TPM value given as text, with precision depending on magnitude.

    * value < 0.01      -> returned unchanged
    * 0.01 <= value < 0.1 -> 2 decimals
    * 0.1 <= value < 5    -> 1 decimal
    * value >= 5          -> nearest integer (Python's round())

    Text that cannot be parsed as a number is reported on stdout and returned
    unchanged; non-finite values ("nan", "inf") are also returned unchanged
    instead of crashing inside round().
    """
    try:
        tpm = float(tpm_str)
    except (TypeError,ValueError):
        print("unable to convert TPM: "+str(tpm_str))
        return tpm_str

    if not math.isfinite(tpm): # round(nan)/round(inf) would raise
        return tpm_str

    if tpm<0.01:
        return tpm_str

    if tpm<0.1:
        return str(round(tpm,2))

    if tpm<5:
        return str(round(tpm,1))

    return str(round(tpm))
    
    
def get_comments()->str:
    """Return the fixed "##" header block written at the top of the output GTF.

    Each header entry is terminated by a newline, including the last one.
    """
    header_entries = ("##NAME: CHESS",
                      "##VERSION: 3.01",
                      "##DESCRIPTION: Comprehensive Human Expressed SequenceS",
                      "##FORMAT: GTF",
                      "##CONTACT: ales[dot]varabyou[at]jhu[dot][edu],mpertea[at]jhu[dot][edu]")
    return "".join(entry+"\n" for entry in header_entries)

def get_assembly_id(attrs:dict)->str:
    """Return the "assembly_id" attribute, or "NA" when the record has none."""
    return attrs.get("assembly_id","NA")
        

def gtf_fixes(in_gtf_fname:str,out_gtf_fname:str): # apply fixes to the input GTF and write to output
    """Rewrite a GTF with the release fixes applied.

    For every nine-column record of `in_gtf_fname` the function:
      * drops the record when its transcript has gene_type "other";
      * replaces the source column with the converted source of its transcript;
      * on transcript records: stores the previous source as "original_source",
        rounds "max_TPM" and makes sure "assembly_id" is present ("NA" if missing);
      * on exon/CDS records: keeps only transcript_id, gene_id and gene_name.
    The output starts with the header from get_comments(), and comment lines of
    the input are copied through.

    Parameters
    ----------
    in_gtf_fname : existing GTF file to read.
    out_gtf_fname : file to (over)write.

    Raises
    ------
    AssertionError when the input file does not exist.
    """
    # Check the input before anything tries to open it, so that a wrong path is
    # reported with this message rather than by a helper further down.
    assert os.path.exists(in_gtf_fname),"Input GTF file not found: "+in_gtf_fname

    # collect a dictionary of tids to sources
    tid2source = get_source_conversion(in_gtf_fname)

    # get transcript ids to exclude
    # (definitions.get_attribute is presumably returning a DataFrame with
    # "tid" and "gene_type" columns - it is indexed that way below)
    tids_to_remove = definitions.get_attribute(in_gtf_fname,"gene_type")
    tids_to_remove = set(tids_to_remove[tids_to_remove["gene_type"]=="other"]["tid"])
    print("number of transcripts being removed: "+str(len(tids_to_remove)))

    with open(out_gtf_fname,"w+") as outFP:
        outFP.write(get_comments())
        with open(in_gtf_fname,"r") as inFP:
            for line in inFP:
                if line.startswith("#"): # comment found: copy it and move on
                    outFP.write(line)
                    continue

                lcs = line.split("\t")
                if not len(lcs) == 9:
                    continue

                attrs = extract_attributes(lcs[8])
                attrs = rename_attributes(attrs)
                if attrs["transcript_id"] in tids_to_remove:
                    continue

                # setup copies for output
                res_lcs = lcs[:8]
                res_attrs = dict(attrs)

                # convert source
                res_lcs[1] = tid2source[attrs["transcript_id"]]

                # add source information to the attributes
                if lcs[2]=="transcript":
                    res_attrs["original_source"]=lcs[1]

                    # round TPM
                    if "max_TPM" in res_attrs:
                        res_attrs["max_TPM"] = round_tpm(res_attrs["max_TPM"])

                    res_attrs["assembly_id"] = get_assembly_id(res_attrs)

                if lcs[2] in ["CDS","exon"]: # remove extra attributes
                    for k in list(res_attrs):
                        if k not in ["transcript_id","gene_id","gene_name"]:
                            del res_attrs[k]

                res_line = "\t".join(res_lcs)+"\t"+to_attribute_string(res_attrs)
                outFP.write(res_line+"\n")

In [26]:
# Apply the release fixes to the GRCh38 annotation: CHESS 3.0 in, CHESS 3.1 out.
gtf_fixes(in_gtf_fname=chess30_gtf_fname,
          out_gtf_fname=chess31_gtf_fname)

number of transcripts being removed: 24


In [27]:
# we need to remove converted gene records from refseq
# A transcript is selected when its gene_name equals the RefSeq id taken from
# db_xref, it has a single exon, and it has no assembly id ("-" or "NA").
gene_names = definitions.get_attribute(chess30_gtf_fname,["gene_name","db_xref","assembly_id","gene_type","gene_id"])
refseq_part = gene_names["db_xref"].str.split("RefSeq:",expand=True)[1] # text after "RefSeq:"
gene_names["refseq"] = refseq_part.str.split(",",expand=True)[0]        # ... up to the next ","

# exon_count is the length of each transcript's "chain"
# (presumably one entry per exon - verify against definitions.get_chains)
tmp = definitions.get_chains(chess30_gtf_fname,"exon",True)
tmp["exon_count"] = tmp["chain"].apply(len)
gene_names = gene_names.merge(tmp[["tid","exon_count"]],on="tid",how="left")

name_is_refseq_id = gene_names["gene_name"]==gene_names["refseq"]
is_single_exon = gene_names["exon_count"]==1
has_no_assembly = gene_names["assembly_id"].isin(["-","NA"])
gene_names = gene_names[name_is_refseq_id & is_single_exon & has_no_assembly].reset_index(drop=True)

# number of selected transcripts per gene type
gene_names[["gene_type","tid"]].groupby(by="gene_type").count()

Unnamed: 0_level_0,tid
gene_type,Unnamed: 1_level_1
other,16
pseudogene,16521


In [28]:
# gene ids of the selected records whose gene_type is "pseudogene"
pseudogene_set = set(gene_names.loc[gene_names["gene_type"]=="pseudogene","gene_id"])

In [29]:
# get gene names and descriptions from the refseq gff ("-" marks a missing description)
refseq_gene_desc = definitions.get_attribute(refseq_gff_fname,["Name","description"],cols=None,feature="gene",gff=True)
has_description = refseq_gene_desc["description"]!="-"
refseq_gene_desc = refseq_gene_desc.loc[has_description,["Name","description"]].reset_index(drop=True)
refseq_gene_desc = refseq_gene_desc.drop_duplicates()

# gene name -> description lookup used when gene records are written
refseq_gene_desc_dict = refseq_gene_desc.set_index("Name")["description"].to_dict()
refseq_gene_desc.head()

Unnamed: 0,Name,description
0,MIR6859-1,microRNA 6859-1
1,MIR1302-2,microRNA 1302-2
2,FAM138A,family with sequence similarity 138 member A
3,OR4F5,olfactory receptor family 4 subfamily F member 5
4,LOC729737,uncharacterized LOC729737


In [30]:
# add gene records to gtf

# Pass 1: collect all transcripts by their gene_id.
# genes[gene_id] = [summary, lines] where
#   summary = seqid / start / end / strand / set of gene names / set of gene types
#   lines   = {transcript_id: GTF text of the transcript and its exon/CDS records}
genes = dict()

with open(chess31_gtf_fname,"r") as inFP:
    for line in inFP:
        if line[0]=="#": # comment found
            continue

        lcs = line.split("\t")
        if not len(lcs) == 9:
            continue

        attrs = extract_attributes(lcs[8])
        gid = attrs["gene_id"]
        tid = attrs["transcript_id"]

        if lcs[2]=="transcript":
            genes.setdefault(gid,[dict({"seqid":None,
                                        "start":sys.maxsize,
                                        "end":0,
                                        "strand":None,
                                        "name":set(),
                                        "type":set()}),dict()])
            summary = genes[gid][0]

            # the gene spans the minimum start and maximum end of its transcripts
            summary["start"] = min(summary["start"],int(lcs[3]))
            summary["end"] = max(summary["end"],int(lcs[4]))

            if not summary["seqid"] is None:
                assert summary["seqid"] == lcs[0],"wrong seqid: "+gid
            else:
                summary["seqid"] = lcs[0]

            assert lcs[6] in ["+","-"],"wrong strand: "+tid+" : "+lcs[6]
            if not summary["strand"] is None:
                if summary["strand"] != lcs[6]:
                    summary["strand"] = "." # transcripts disagree: strand unknown
            else:
                summary["strand"] = lcs[6]

            if "gene_name" in attrs:
                summary["name"].add(attrs["gene_name"])

            if "gene_type" in attrs:
                summary["type"].add(attrs["gene_type"])

            genes[gid][1].setdefault(tid,"")

        # exon/CDS records must follow their transcript record; check the gene
        # too so an unsorted file gives this message instead of a KeyError
        assert gid in genes and tid in genes[gid][1],"wrong tid (unsorted?): "+tid
        genes[gid][1][tid]+=line


# Pass 2: write each gene record followed by its transcripts.
with open(chess31_gtf_fname+".with_genes.gtf","w+") as outFP:
    # write headers
    with open(chess31_gtf_fname,"r") as inFP:
        for line in inFP:
            if line[0]=="#": # comment found
                outFP.write(line)
            else: # break at the first non-header line
                break

    for gid,gv in genes.items():
        # Sets have no stable iteration order for strings across Python
        # sessions; sort so that the written file is reproducible.
        types_sorted = sorted(gv[0]["type"])
        names_sorted = sorted(gv[0]["name"])

        gline = gv[0]["seqid"]+"\t"+\
                    "CHESS"+"\t"+\
                    "gene"+"\t"+\
                    str(gv[0]["start"])+"\t"+\
                    str(gv[0]["end"])+"\t"+\
                    "."+"\t"+\
                    gv[0]["strand"]+"\t"+\
                    "."+"\t"+\
                    "gene_id \""+gid+"\";"

        if len(types_sorted)>0:
            gline+=" gene_type \""+", ".join(types_sorted)+"\";"

        if len(names_sorted)>0:
            gline+=" gene_name \""+", ".join(names_sorted)+"\";"
            desc = list()
            for n in names_sorted:
                if n in refseq_gene_desc_dict:
                    desc.append([n,refseq_gene_desc_dict[n]])

            if len(desc)==1:
                gline+=" description \""+desc[0][1]+"\";"
            if len(desc)>1:
                # several names with a description: report them and prefix each
                # description with the name it belongs to
                print(names_sorted)
                gline+=" description \""+", ".join([x[0]+" : "+x[1] for x in desc])+"\";"

        outFP.write(gline+"\n")
        if not gid in pseudogene_set: # only write transcript features for non-pseudogene entries
            for tid,tv in gv[1].items():
                outFP.write(tv)

In [31]:
# work on attribute stats

def extract_all_attributes(gtf_fname: str, feature_type: str, gff3: bool=False) -> dict:
    """Count how often every attribute value occurs for one feature type.

    Parameters
    ----------
    gtf_fname : GTF (default) or GFF3 file to scan.
    feature_type : value of the type column (3rd column) to include, e.g. "transcript".
    gff3 : parse attributes as GFF3 ("key=value") instead of GTF ('key "value"').

    Returns
    -------
    dict of the form {attribute_key: {attribute_value: number_of_records}}.

    Raises
    ------
    AssertionError when the file does not exist; ValueError when a non-empty
    attribute has no key/value separator.
    """
    sep = "=" if gff3 else " \""

    res = dict()

    assert os.path.exists(gtf_fname),"the GTF file not found: "+gtf_fname
    with open(gtf_fname,"r") as inFP:
        for line in inFP:
            lcs = line.split("\t")
            if len(lcs) != 9 or lcs[2] != feature_type:
                continue

            for at in lcs[8].strip().strip(";").split(";"):
                at = at.strip().strip("\"")
                if not at: # empty attribute column or a stray ";;"
                    continue
                # split on the first separator only: values may contain it
                k,found,v = at.partition(sep)
                if not found:
                    raise ValueError("malformed attribute (no key/value separator): "+at)
                counts = res.setdefault(k,dict())
                counts[v] = counts.get(v,0)+1

    return res

In [32]:
# Attribute statistics of the transcript records in the fixed GTF:
# first the attribute names, then the value counts of three selected attributes.
c3a_gtf = extract_all_attributes(chess31_gtf_fname,"transcript")
print(list(c3a_gtf))
for key in ("gene_type","tag","original_source"):
    print(c3a_gtf[key])

['transcript_id', 'gene_id', 'gene_name', 'gene_type', 'db_xref', 'assembly_id', 'original_source', 'max_TPM', 'sample_count', 'tag']
{'transcribed_pseudogene': 1954, 'miRNA': 5151, 'lncRNA': 36356, 'pseudogene': 16521, 'protein_coding': 105328, 'snRNA': 159, 'snoRNA': 1251, 'antisense_RNA': 37, 'ncRNA': 28, 'tRNA': 643, 'TEC': 28, 'ncRNA_pseudogene': 1, 'misc_RNA': 83, 'V_segment': 342, 'rRNA': 40, 'C_region': 36, 'J_segment': 117, 'V_segment_pseudogene': 282, 'telomerase_RNA': 1, 'vault_RNA': 4, 'D_segment': 61, 'J_segment_pseudogene': 11, 'Y_RNA': 4, 'RNase_MRP_RNA': 1, 'scRNA': 4, 'RNase_P_RNA': 1, 'C_region_pseudogene': 7}
{'MANE_Select': 19043, 'partial': 1368, 'MANE_Select,duplicated_transcript': 77, 'duplicated_transcript': 175}
{'BestRefSeq': 72352, 'HAVANA': 8047, 'Curated Genomic': 16319, 'havana': 4137, 'Gnomon': 16555, 'ensembl_havana': 14981, 'cmsearch': 1184, 'StringTie': 33783, 'FANTOM': 318, 'tRNAscan-SE': 621, 'ensembl': 2, 'ENSEMBL': 115, 'RefSeq': 37}


In [33]:
# need to convert to GFF and other formats now
def gtf2gff(gtf_fname:str,gff_fname:str,keep_comments:bool=False):
    """Convert a GTF file into GFF3.

    The output starts with the "##gff-version 3" and "#!gff-spec-version 1.21"
    lines, followed by every nine-column record of the input with its attribute
    column rewritten by to_attribute_string in GFF3 mode. Lines that are
    neither comments nor nine-column records are dropped.

    Parameters
    ----------
    gtf_fname : GTF file to read.
    gff_fname : GFF3 file to (over)write.
    keep_comments : copy the "#" comment lines of the input into the output.
        The default (False) matches what this function has always produced:
        the old comment test compared the whole first tab field with "#" and
        therefore never matched.
        NOTE(review): GTF headers such as "##NAME: ..." are not GFF3
        directives and some GFF3 parsers may reject them - confirm with the
        downstream tools before enabling.
    """
    with open(gff_fname,"w+") as outFP:
        outFP.write("##gff-version 3\n")
        outFP.write("#!gff-spec-version 1.21\n")
        with open(gtf_fname,"r") as inFP:
            for line in inFP:
                if line.startswith("#"):
                    if keep_comments:
                        outFP.write(line)
                    continue

                lcs = line.split("\t")
                if not len(lcs) == 9:
                    continue

                attrs = extract_attributes(lcs[8])
                res_line = "\t".join(lcs[:-1])+"\t"+to_attribute_string(attrs,True,lcs[2])
                outFP.write(res_line+"\n")
                
# Convert the GRCh38 GTF that includes gene records into GFF3.
gtf2gff(gtf_fname=chess31_gtf_fname+".with_genes.gtf",
        gff_fname=chess31_gff_fname+".with_genes.gff")

In [39]:
# apply fixes and other corrections to the CHM13 annotation (3.0 in, 3.1 out)
gtf_fixes(in_gtf_fname=chess30_chm13_gtf_fname,
          out_gtf_fname=chess31_chm13_gtf_fname)

number of transcripts being removed: 14


In [None]:
# Convert the CHM13 GTF to GFF3, let gffread add the gene records, then remove
# the attributes we do not want in the release file.
gtf2gff(chess31_chm13_gtf_fname,chess31_chm13_gff_fname)

# str.rstrip(".gff") removes any trailing '.', 'g' and 'f' characters, not the
# ".gff" suffix; os.path.splitext removes exactly the extension.
chm13_tmp_gff_fname = os.path.splitext(chess31_chm13_gff_fname)[0]+".tmp.gff"

cmd = ["gffread","--keep-genes","--keep-comments","-F","-O","-o",chm13_tmp_gff_fname,chess31_chm13_gff_fname]
print(" ".join(cmd))
subprocess.check_call(cmd) # stop here if gffread fails instead of reading a missing/partial file

# drop unwanted tags ("Name", any capitalisation) while copying the gffread
# output back over the GFF
with open(chess31_chm13_gff_fname,"w+") as outFP:
    with open(chm13_tmp_gff_fname,"r") as inFP:
        for line in inFP:
            lcs = line.split("\t")
            if not len(lcs) == 9:
                outFP.write(line)
                continue

            kept = []
            for at in lcs[8].rstrip().rstrip(";").split(";"):
                at = at.strip().strip("\"")
                if not at:
                    continue
                # split on the first "=" only: values may contain "="
                k,found,v = at.partition("=")
                if not found:
                    raise ValueError("malformed GFF attribute: "+at)
                if k.lower() in ["name"]:
                    continue
                kept.append(k+"="+v)

            # joined without a trailing ";" (the earlier rstrip(";") result was discarded)
            res_line = "\t".join(lcs[:-1])+"\t"+";".join(kept)
            outFP.write(res_line+"\n")

In [43]:
## convert to GTF including the gene features
# ID/Parent of the GFF are translated back to gene_id/transcript_id:
#   gene:       ID -> gene_id
#   transcript: ID -> transcript_id, Parent -> gene_id
#   exon/CDS:   Parent -> transcript_id, ID dropped
with open(chess31_chm13_gtf_fname,"w+") as outFP:
    with open(chess31_chm13_gff_fname,"r") as inFP:
        for line in inFP:
            # Test the line itself: the first tab-separated field of a comment
            # is the whole comment, so comparing it with "#" never matched.
            if line.startswith("#"):
                if "gff-version" in line or "gff-spec-version" in line:
                    continue # GFF-only headers do not belong in a GTF
                outFP.write(line)
                continue

            lcs = line.split("\t")
            if not len(lcs) == 9:
                continue

            attrs = extract_attributes(lcs[8],True)
            res_attrs = dict()
            for k,v in attrs.items():
                key = k.lower()
                if key == "parent":
                    if lcs[2]=="gene":
                        # a gene record should not have a parent: report and drop it
                        print("error: "+str(attrs))
                    elif lcs[2]=="transcript":
                        res_attrs["gene_id"] = v
                    elif lcs[2] in ["CDS","exon"]:
                        res_attrs["transcript_id"] = v
                elif key == "id":
                    if lcs[2]=="gene":
                        res_attrs["gene_id"] = v
                    elif lcs[2]=="transcript":
                        res_attrs["transcript_id"] = v
                    # exon/CDS ids are not carried over to the GTF
                else:
                    res_attrs[k]=v

            res_line = "\t".join(lcs[:-1])+"\t"+to_attribute_string(res_attrs,False,lcs[2])
            outFP.write(res_line+"\n")

In [44]:
# Put the site library directory in front of the dynamic-linker search path
# used by the external tools started below. The existing value is kept (the
# previous version appended the literal text "LD_LIBRARY_PATH" instead), and
# re-running the cell does not add the directory twice.
# NOTE(review): machine-specific absolute path - adjust for other systems.
_site_lib_dir = "/ccb/sw/lib/"
_ld_dirs = [d for d in os.environ.get("LD_LIBRARY_PATH","").split(os.pathsep) if d]
if _site_lib_dir not in _ld_dirs:
    _ld_dirs.insert(0,_site_lib_dir)
os.environ['LD_LIBRARY_PATH'] = os.pathsep.join(_ld_dirs)

In [21]:
# generate bedplus files (GRCh38): GFF3 -> genePred -> bigGenePred -> sorted bed -> bigBed
# Each step uses check_call so that a failing tool stops the cell instead of
# letting later steps run on missing or partial files.

# str.rstrip(".gff") removes any trailing '.', 'g' and 'f' characters, not the
# ".gff" suffix; os.path.splitext removes exactly the extension.
chess31_prefix = os.path.splitext(chess31_gff_fname)[0]

cmd = ["gff3ToGenePred",chess31_gff_fname+".with_genes.gff",chess31_prefix+".genePred"]
print(" ".join(cmd))
subprocess.check_call(cmd)

cmd = ["genePredToBigGenePred",chess31_prefix+".genePred",chess31_prefix+".bedPlus"]
print(" ".join(cmd))
subprocess.check_call(cmd)

cmd = ["bedSort",chess31_prefix+".bedPlus",chess31_prefix+".srt.bedPlus"]
print(" ".join(cmd))
subprocess.check_call(cmd)

# the sorted file takes the place of the unsorted one
os.replace(chess31_prefix+".srt.bedPlus",chess31_prefix+".bedPlus")

# bigGenePred.as must be present in the working directory; it can be downloaded
# from http://genome.ucsc.edu/goldenPath/help/examples/bigGenePred.as
cmd = ["bedToBigBed","-type=bed12+8","-tab","-as=bigGenePred.as",chess31_prefix+".bedPlus",chess3_data_dir+"hs38DH.len",chess31_prefix+".bb","-extraIndex=name"]
print(" ".join(cmd))
subprocess.check_call(cmd)

bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as /ccb/salz8-1/avaraby/chess3_rerun_31102021/reviews/data/chess3/chess3.1.bedPlus /ccb/salz8-1/avaraby/chess3_rerun_31102021/reviews/data/chess3/hs38DH.len /ccb/salz8-1/avaraby/chess3_rerun_31102021/reviews/data/chess3/chess3.1.bb -extraIndex=name


pass1 - making usageList (307 chroms): 86 millis
pass2 - checking and writing primary data (151915 records, 20 fields): 1326 millis
Sorting and writing extra index 0: 34 millis


0

In [None]:
# same for CHM13
# generate bedplus files: GFF3 -> genePred -> bigGenePred -> sorted bed -> bigBed
# Each step uses check_call so that a failing tool stops the cell instead of
# letting later steps run on missing or partial files.

# str.rstrip(".gff") removes any trailing '.', 'g' and 'f' characters, not the
# ".gff" suffix; os.path.splitext removes exactly the extension.
chess31_chm13_prefix = os.path.splitext(chess31_chm13_gff_fname)[0]

cmd = ["gff3ToGenePred",chess31_chm13_gff_fname,chess31_chm13_prefix+".genePred"]
print(" ".join(cmd))
subprocess.check_call(cmd)

cmd = ["genePredToBigGenePred",chess31_chm13_prefix+".genePred",chess31_chm13_prefix+".bedPlus"]
print(" ".join(cmd))
subprocess.check_call(cmd)

cmd = ["bedSort",chess31_chm13_prefix+".bedPlus",chess31_chm13_prefix+".srt.bedPlus"]
print(" ".join(cmd))
subprocess.check_call(cmd)

# the sorted file takes the place of the unsorted one
os.replace(chess31_chm13_prefix+".srt.bedPlus",chess31_chm13_prefix+".bedPlus")

# bigGenePred.as must be present in the working directory
cmd = ["bedToBigBed","-type=bed12+8","-tab","-as=bigGenePred.as",chess31_chm13_prefix+".bedPlus",chess3_data_dir+"CHM13.len",chess31_chm13_prefix+".bb","-extraIndex=name"]
print(" ".join(cmd))
subprocess.check_call(cmd)

In [13]:
# create release: move the generated files into the release directory under
# their release names
rdir = base_dir+"chess3.0.1/"
os.makedirs(rdir,exist_ok=True)

# str.rstrip(".gff") removes any trailing '.', 'g' and 'f' characters, not the
# ".gff" suffix; os.path.splitext removes exactly the extension.
chess31_bb_fname = os.path.splitext(chess31_gff_fname)[0]+".bb"
chess31_chm13_bb_fname = os.path.splitext(chess31_chm13_gff_fname)[0]+".bb"

# NOTE(review): no cell of this notebook writes chess31_gff_fname itself (the
# GRCh38 conversion writes chess31_gff_fname+".with_genes.gff"), and
# chess31_gtf_fname is the GTF without gene records - confirm which files are
# meant to be released.
os.rename(chess31_gtf_fname,rdir+"chess3.0.1.gtf")
os.rename(chess31_gff_fname,rdir+"chess3.0.1.gff")
os.rename(chess31_bb_fname,rdir+"chess3.0.1.bb")


os.rename(chess31_chm13_gtf_fname,rdir+"chess3.0.1.CHM13.gtf")
os.rename(chess31_chm13_gff_fname,rdir+"chess3.0.1.CHM13.gff")
os.rename(chess31_chm13_bb_fname,rdir+"chess3.0.1.CHM13.bb")

In [16]:
# generate files with no alts: copy the release GTF and GFF, leaving out every
# record whose sequence name contains "_alt"
for ext in ("gtf","gff"):
    with open(rdir+"chess3.0.1.primary."+ext,"w+") as outFP:
        with open(rdir+"chess3.0.1."+ext,"r") as inFP:
            for line in inFP:
                # Test the line itself: the first tab-separated field of a
                # comment is the whole comment, so comparing it with "#"
                # never matched and the headers were lost.
                if line.startswith("#"):
                    outFP.write(line)
                    continue

                lcs = line.split("\t")
                if not len(lcs) == 9:
                    continue

                if "_alt"  in lcs[0]:
                    continue

                outFP.write(line)

In [None]:
# generate protein file: gffread writes the protein sequences (-y) of the
# primary GTF using the genome given with -g
protein_fa_fname = rdir+"chess3.0.1.protein.fa"
primary_gtf_fname = rdir+"chess3.0.1.primary.gtf"
cmd = ["gffread","-g","hg38_p12_ucsc.fa","-y",protein_fa_fname,primary_gtf_fname]
print(*cmd)
subprocess.call(cmd)

In [None]:
# Generate a version of MANE with chess identifiers:
# due to the polycistronic genes, we need a better method of linking mane IDs to chess IDs...
chess3_attrs = definitions.get_attribute(chess3_gtf_fname,"db_xref",[1])
chess3_attrs.columns = ["tid","source","db_xref"]
chess3_attrs.head()


c3_mane = chess3_attrs[chess3_attrs["source"]=="MANE"].reset_index(drop=True)
# extract refseq and gencode identifiers from the db_xref field
c3_mane["rtid"] = c3_mane["db_xref"].str.split("RefSeq:",n=1,expand=True)[1].str.split(",",n=1,expand=True)[0]
c3_mane["gtid"] = c3_mane["db_xref"].str.split("GENCODE:",n=1,expand=True)[1].str.split(",",n=1,expand=True)[0]
mane_ctids = set(c3_mane["tid"])
print(len(mane_ctids))
c3_mane.head()


# now make sure all rtids and gtids are in the actual mane files
mane_df = definitions.get_attribute(mane_refseq_gtf_fname,"gene_id")
mane_df["tid"] = mane_df["tid"].str.split("-",n=1,expand=True)[1]
mane_rtids = set(mane_df["tid"])
print(len(mane_rtids))

assert mane_rtids==set(c3_mane["rtid"]),"mismatch in tids between chess mane and mane"


# load map of chess ids to mane ids
r2c = pd.Series(c3_mane.tid.values,index=c3_mane.rtid).to_dict()
r2db_xref = pd.Series(c3_mane.db_xref.values,index=c3_mane.rtid).to_dict()



# now we can write the mane file with chess transcript and gene ids
# we shall also keep the refseq and gencode identifiers in the db_xref attribute
with open(mane_chess_gtf_fname,"w+") as outFP:
    with open(mane_refseq_gtf_fname,"r") as inFP:
        for line in inFP:
            if line[0] == "#":
                outFP.write(line)
            lcs = line.strip().split('\t')
            tid = lcs[8].split("transcript_id \"", 1)[1].split("\"", 1)[0].split("-",1)[1]
            assert tid in c2r,"invalid tid: "+tid
            
            attrs = definitions.extract_attributes(lcs[8])
            attrs["db_xref"] = r2db_xref[tid]
            attrs["transcript_id"] = r2c[tid]
            attrs["gene_id"] = r2c[tid].rsplit(".",1)[0]
            
            attr_str = definitions.to_attribute_string(attrs)
            
            lcs[8] = attr_str
            
            outFP.write("\t".join(lcs)+"\n")