In [3]:
import os
import re
import sys
import csv
import pandas as pd
import numpy as np

# Names of the nine tab-separated GFF3 columns, in file order.
gff3Cols="seqid source type start end score strand phase attributes".split()
# Release number appended to the "##CHESS" header line of the generated GFF files.
version="2.1"

In [4]:
# Load the CHESS 2.05 annotation and keep only the transcript records that have
# no missing column, with integer coordinates and a fresh 0..n-1 index.
df=(
    pd.read_csv("./chess2.05.gff",sep="\t",names=gff3Cols)
    .loc[lambda frame: frame["type"]=="transcript"]
    .dropna(axis=0)
    .reset_index(drop=True)
    .astype({"start":int,"end":int})
)

# Accumulators for the attribute keys seen on known and on novel transcripts.
setAttributes_known=set()
setAttributes_novel=set()

def addToSet_known(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_known.

    `row.attributes` is a ';'-separated list of key=value pairs; only the text
    before the first '=' of each pair is recorded.
    """
    global setAttributes_known
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_known=setAttributes_known|keys

def addToSet_novel(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_novel."""
    global setAttributes_novel
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_novel=setAttributes_novel|keys

# Transcripts whose source is StringTie or FANTOM form the novel group; all others are known.
novelMask=df["source"].isin(["StringTie","FANTOM"])
df_known=df.loc[~novelMask].reset_index(drop=True)
df_novel=df.loc[novelMask].reset_index(drop=True)

# Collect and show the attribute keys used by each group.
df_known.apply(addToSet_known,axis=1)
print(setAttributes_known)
df_novel.apply(addToSet_novel,axis=1)
print(setAttributes_novel)

# Next: compare the attribute composition of lncRNA and protein-coding transcripts.

# Start by loading the transcripts table.
transcripts=pd.read_csv("./chess2.0.transcripts",sep="\t")
display(transcripts.head())

# Gene types present in the table.
print(set(transcripts["Gene_type"]))
# Transcript IDs belonging to lncRNA and to protein-coding entries of the transcripts table.
setLnc=set()
setProt=set()

def addTransLnc(row):
    """Add every comma-separated ID in `row.TranscriptID` to setLnc."""
    setLnc.update(row.TranscriptID.split(","))

def addTransProt(row):
    """Add every comma-separated ID in `row.TranscriptID` to setProt."""
    setProt.update(row.TranscriptID.split(","))

# Gather the transcript IDs of every lncRNA and protein-coding row of the table.
transcripts[transcripts["Gene_type"]=="lncRNA"].apply(addTransLnc,axis=1)
transcripts[transcripts["Gene_type"]=="protein_coding"].apply(addTransProt,axis=1)
print(list(setLnc)[:10])

# Extract the value of the ID attribute. The pattern is anchored to the start of
# an attribute so keys that merely end in "ID" (e.g. "geneID=") cannot be mistaken
# for it when ID is not the first attribute, and no per-attribute columns are built.
df["id"]=df["attributes"].str.extract(r"(?:^|;)ID=([^;]*)",expand=False)

# Annotation rows for the lncRNA and the protein-coding transcripts.
df_lnc=df[df['id'].isin(setLnc)].reset_index(drop=True)
df_prot=df[df['id'].isin(setProt)].reset_index(drop=True)
display(df_lnc.tail())
display(df_prot.tail())

# Accumulators for the attribute keys seen on lncRNA and on protein-coding records.
setAttributes_lnc=set()
setAttributes_prot=set()

def addToSet_lnc(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_lnc.

    Only the text before the first '=' of each ';'-separated pair is recorded.
    """
    global setAttributes_lnc
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_lnc=setAttributes_lnc|keys

def addToSet_prot(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_prot."""
    global setAttributes_prot
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_prot=setAttributes_prot|keys

# Collect the attribute keys of each group and print them.
df_lnc.apply(addToSet_lnc,axis=1)
print(setAttributes_lnc)
df_prot.apply(addToSet_prot,axis=1)
print(setAttributes_prot)

# Keys used by only one of the two groups.
print(setAttributes_lnc.difference(setAttributes_prot))
print(setAttributes_prot.difference(setAttributes_lnc))

# investigate where lncRNA appears in attributes
setLncRNA_tags=set()
def getLncRNATags(row):
    """Record every `key=` that is immediately followed by "lncRNA" in `row.attributes`.

    The attribute string is cut at each occurrence of "lncRNA"; a piece that ends
    in `key=` means that key's value starts with "lncRNA". The separator has to be
    the full word: cutting at "ncRNA" leaves a trailing "l" on every piece that
    precedes an lncRNA value, so the `key=` suffix would never match.
    """
    # The last piece follows the final occurrence and therefore precedes nothing.
    for prefix in row.attributes.split("lncRNA")[:-1]:
        setLncRNA_tags.update(re.findall(r"\w+=$",prefix))
# Restrict to records that mention lncRNA anywhere in their attributes, then
# collect and print the tags found by getLncRNATags.
mentionsLncRNA=df["attributes"].str.contains("lncRNA")
df_lncRNA=df.loc[mentionsLncRNA].reset_index(drop=True)
df_lncRNA.apply(getLncRNATags,axis=1)
print(setLncRNA_tags)

# Same investigation for protein_coding.

setProt_tags=set()
def getProtTags(row):
    """Record every `key=` that is immediately followed by "protein_coding" in `row.attributes`."""
    pieces=row.attributes.split("protein_coding")
    pieces.pop()  # the text after the last occurrence precedes nothing
    for piece in pieces:
        setProt_tags.update(re.findall(r"\w+=$",piece))
# Restrict to records that mention protein_coding, then collect and print the tags.
mentionsProt=df["attributes"].str.contains("protein_coding")
df_Prot=df.loc[mentionsProt].reset_index(drop=True)
df_Prot.apply(getProtTags,axis=1)
print(setProt_tags)

{'geneID', 'comment', 'gene_biotype', 'transcript_id', 'model_evidence', 'Parent', 'exception', 'ID', 'SIMILAR_TO', 'description', 'gbkey', 'TYPE', 'Dbxref', 'end_range', 'STATUS', 'transcript_support_level', 'ncrna_class', 'product', 'gene_status', 'gene_synonym', 'transcript_status', 'protein_id', 'Name', 'level', 'Note', 'gene_type', 'ASSEMBLED', 'havana_gene', 'partial', 'gene_name', 'pseudo', 'ccdsid', 'tag', 'havana_transcript', 'transcript_type', 'start_range'}
{'FANTOM_coding_status', 'comment', 'FANTOM_phylocsf_call', 'NR_coding_status', 'Parent', 'ID', 'SIMILAR_TO', 'FANTOM_functional_evidence', 'description', 'FANTOM_ribo_call', 'TYPE', 'SP_coding_status', 'FANTOM_rnacode_call', 'STATUS', 'ASSEMBLED', 'RIBO_coding_status'}


Unnamed: 0,TranscriptID,CHESS_gene,Gene_name,Gene_type,Gene_Database,GeneID,Transcript_Database,Location,No_of_exons,Transcript_Length
0,CHS.45778.5,CHS.45778,ZSCAN16,protein_coding,RefSeq,80345,RefSeq,chr6:28124556-28130086:+,4,1336
1,CHS.45778.2,CHS.45778,ZSCAN16,protein_coding,RefSeq,80345,RefSeq,chr6:28124556-28127572:+,3,1330
2,CHS.45778.4,CHS.45778,ZSCAN16,protein_coding,RefSeq,80345,RefSeq,chr6:28124556-28130086:+,4,1394
3,CHS.45778.3,CHS.45778,ZSCAN16,protein_coding,RefSeq,80345,RefSeq,chr6:28124556-28130086:+,4,1386
4,CHS.45778.8,CHS.45778,-,protein_coding,RefSeq,-,FANTOM,chr6:28124622-28129805:+,3,1724


{'lncRNA', 'antisense_RNA', 'misc_RNA', 'protein_coding'}
['CHS.12888.1', 'CHS.6037.4', 'CHS.10214.1', 'CHS.12835.6', 'CHS.43009.3', 'CHS.19604.1', 'CHS.36508.8', 'CHS.313.7', 'CHS.57417.4', 'CHS.3341.3']


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
49303,KN196486.1,BestRefSeq,transcript,8758,10915,.,+,.,ID=CHS.59946.1;ASSEMBLED=no;Parent=CHS.59946;D...,CHS.59946.1
49304,KN196486.1,Gnomon,transcript,15437,17657,.,+,.,ID=CHS.59948.1;ASSEMBLED=no;Parent=CHS.59948;D...,CHS.59948.1
49305,KQ759761.1,BestRefSeq,transcript,934,3090,.,+,.,ID=CHS.59954.1;ASSEMBLED=no;Parent=CHS.59954;D...,CHS.59954.1
49306,KQ759762.1,BestRefSeq,transcript,71945,74286,.,-,.,ID=CHS.59960.1;ASSEMBLED=no;Parent=CHS.59960;D...,CHS.59960.1
49307,KN196487.1,Gnomon,transcript,31953,33940,.,-,.,ID=CHS.59962.1;ASSEMBLED=no;Parent=CHS.59962;D...,CHS.59962.1


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
267463,KQ759761.1,Gnomon,transcript,25184,30339,.,+,.,ID=CHS.59956.26;ASSEMBLED=no;Parent=CHS.59956;...,CHS.59956.26
267464,KQ759761.1,BestRefSeq,transcript,51431,106747,.,-,.,ID=CHS.59958.1;ASSEMBLED=no;Parent=CHS.59958;D...,CHS.59958.1
267465,KQ759761.1,BestRefSeq,transcript,51431,106745,.,-,.,ID=CHS.59958.2;ASSEMBLED=no;Parent=CHS.59958;D...,CHS.59958.2
267466,KQ759762.1,BestRefSeq,transcript,10731,69328,.,+,.,ID=CHS.59959.1;ASSEMBLED=no;Parent=CHS.59959;D...,CHS.59959.1
267467,KQ759762.1,BestRefSeq,transcript,74341,81410,.,+,.,ID=CHS.59961.1;ASSEMBLED=no;Parent=CHS.59961;D...,CHS.59961.1


{'FANTOM_coding_status', 'comment', 'geneID', 'gene_biotype', 'NR_coding_status', 'transcript_id', 'model_evidence', 'Parent', 'exception', 'ID', 'SIMILAR_TO', 'description', 'gbkey', 'TYPE', 'Dbxref', 'end_range', 'SP_coding_status', 'STATUS', 'transcript_support_level', 'ncrna_class', 'RIBO_coding_status', 'product', 'FANTOM_phylocsf_call', 'gene_status', 'gene_synonym', 'transcript_status', 'protein_id', 'Name', 'level', 'FANTOM_functional_evidence', 'gene_type', 'ASSEMBLED', 'Note', 'FANTOM_ribo_call', 'havana_gene', 'partial', 'gene_name', 'pseudo', 'FANTOM_rnacode_call', 'ccdsid', 'tag', 'havana_transcript', 'transcript_type', 'start_range'}
{'FANTOM_coding_status', 'comment', 'geneID', 'gene_biotype', 'NR_coding_status', 'transcript_id', 'model_evidence', 'Parent', 'exception', 'ID', 'SIMILAR_TO', 'description', 'gbkey', 'TYPE', 'Dbxref', 'end_range', 'SP_coding_status', 'STATUS', 'transcript_support_level', 'ncrna_class', 'RIBO_coding_status', 'product', 'FANTOM_phylocsf_call',

In [5]:
# Repeat the comparison at the gene level: protein-coding genes vs lncRNA genes.
# Keep only the gene records that have no missing column, with integer coordinates.
df=(
    pd.read_csv("./chess2.05.gff",sep="\t",names=gff3Cols)
    .loc[lambda frame: frame["type"]=="gene"]
    .dropna(axis=0)
    .reset_index(drop=True)
    .astype({"start":int,"end":int})
)

# Genes table and the gene types it contains.
genes=pd.read_csv("./chess2.0.genes",sep="\t")
display(genes.head())

print(set(genes["Gene_Type"]))
# Gene IDs belonging to lncRNA and to protein-coding entries of the genes table.
setLnc=set()
setProt=set()

def addGenesLnc(row):
    """Add every comma-separated ID in `row.GFF_ID` to setLnc."""
    setLnc.update(row.GFF_ID.split(","))

def addGenesProt(row):
    """Add every comma-separated ID in `row.GFF_ID` to setProt."""
    setProt.update(row.GFF_ID.split(","))

# Gather the gene IDs of every lncRNA and protein-coding row of the genes table.
genes[genes["Gene_Type"]=="lncRNA"].apply(addGenesLnc,axis=1)
genes[genes["Gene_Type"]=="protein_coding"].apply(addGenesProt,axis=1)
print(list(setLnc)[:10])

# Extract the value of the ID attribute. The pattern is anchored to the start of
# an attribute, so it does not depend on ID being the first attribute or on
# "ID=" never occurring inside another key or value, and it avoids building one
# column per attribute.
df["id"]=df["attributes"].str.extract(r"(?:^|;)ID=([^;]*)",expand=False)

# Annotation rows for the lncRNA and the protein-coding genes.
df_lnc=df[df['id'].isin(setLnc)].reset_index(drop=True)
df_prot=df[df['id'].isin(setProt)].reset_index(drop=True)
display(df_lnc.tail())
display(df_prot.tail())

# Accumulators for the attribute keys seen on lncRNA and on protein-coding gene records.
setAttributes_lnc=set()
setAttributes_prot=set()

def addToSet_lnc(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_lnc.

    Only the text before the first '=' of each ';'-separated pair is recorded.
    """
    global setAttributes_lnc
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_lnc=setAttributes_lnc|keys

def addToSet_prot(row):
    """Merge the attribute keys found in `row.attributes` into setAttributes_prot."""
    global setAttributes_prot
    keys={field.split("=")[0] for field in row.attributes.split(";")}
    setAttributes_prot=setAttributes_prot|keys

# Collect the attribute keys of each gene group and print them.
df_lnc.apply(addToSet_lnc,axis=1)
print(setAttributes_lnc)
df_prot.apply(addToSet_prot,axis=1)
print(setAttributes_prot)

# Keys used by only one of the two groups.
print(setAttributes_lnc.difference(setAttributes_prot))
print(setAttributes_prot.difference(setAttributes_lnc))

# Find which attribute keys carry an lncRNA value.
setLncRNA_tags=set()
def getLncRNATags(row):
    """Record every `key=` that is immediately followed by "lncRNA" in `row.attributes`."""
    pieces=row.attributes.split("lncRNA")
    pieces.pop()  # the text after the last occurrence precedes nothing
    for piece in pieces:
        setLncRNA_tags.update(re.findall(r"\w+=$",piece))
# Restrict to gene records that mention lncRNA anywhere in their attributes,
# then collect and print the tags found by getLncRNATags.
mentionsLncRNA=df["attributes"].str.contains("lncRNA")
df_lncRNA=df.loc[mentionsLncRNA].reset_index(drop=True)
df_lncRNA.apply(getLncRNATags,axis=1)
print(setLncRNA_tags)

# Same investigation for protein_coding.

setProt_tags=set()
def getProtTags(row):
    """Record every `key=` that is immediately followed by "protein_coding" in `row.attributes`."""
    pieces=row.attributes.split("protein_coding")
    pieces.pop()  # the text after the last occurrence precedes nothing
    for piece in pieces:
        setProt_tags.update(re.findall(r"\w+=$",piece))
# Restrict to gene records that mention protein_coding, then collect and print the tags.
mentionsProt=df["attributes"].str.contains("protein_coding")
df_Prot=df.loc[mentionsProt].reset_index(drop=True)
df_Prot.apply(getProtTags,axis=1)
print(setProt_tags)

Unnamed: 0,Gene_Type,Gene_Name,GFF_ID,Location,Database,RefSeq_GeneID,Description
0,protein_coding,LOC105371921,CHS.23496,chr17:80885776-80892131:-,RefSeq,105371921,uncharacterized LOC105371921
1,lncRNA,LOC107986127,CHS.38280,chr3:127744219-127751505:-,RefSeq,107986127,uncharacterized LOC107986127
2,protein_coding,RPS11,CHS.27254,chr19:49496365-49499712:+,RefSeq,6205,ribosomal protein S11
3,protein_coding,CREB3L1,CHS.8579,chr11:46276634-46321430:+,RefSeq,90993,cAMP responsive element binding protein 3-like 1
4,lncRNA,LOC101930053,CHS.54970,chr9:2385141-2494223:-,RefSeq,101930053,uncharacterized LOC101930053


{'lncRNA', 'antisense_RNA', 'misc_RNA', 'protein_coding'}
['CHS.58729', 'CHS.19424', 'CHS.31101', 'CHS.27079', 'CHS.49695', 'CHS.30986', 'CHS.13150', 'CHS.50753', 'CHS.16710', 'CHS.10799']


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
19499,KN196486.1,RefSeq,gene,8758,10915,.,+,.,ID=CHS.59946;GENE_TYPE=lncRNA;STATUS=known_ref...,CHS.59946
19500,KN196486.1,RefSeq,gene,15437,17657,.,+,.,ID=CHS.59948;GENE_TYPE=lncRNA;STATUS=known_ref...,CHS.59948
19501,KQ759761.1,RefSeq,gene,934,3090,.,+,.,ID=CHS.59954;GENE_TYPE=lncRNA;STATUS=known_ref...,CHS.59954
19502,KQ759762.1,RefSeq,gene,71945,74286,.,-,.,ID=CHS.59960;GENE_TYPE=lncRNA;STATUS=known_ref...,CHS.59960
19503,KN196487.1,RefSeq,gene,31953,33940,.,-,.,ID=CHS.59962;GENE_TYPE=lncRNA;STATUS=known_ref...,CHS.59962


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
23829,KQ759761.1,RefSeq,gene,4240,8622,.,-,.,ID=CHS.59955;GENE_TYPE=protein_coding;STATUS=k...,CHS.59955
23830,KQ759761.1,RefSeq,gene,6604,30680,.,+,.,ID=CHS.59956;GENE_TYPE=protein_coding;STATUS=k...,CHS.59956
23831,KQ759761.1,RefSeq,gene,51431,106747,.,-,.,ID=CHS.59958;GENE_TYPE=protein_coding;STATUS=k...,CHS.59958
23832,KQ759762.1,RefSeq,gene,10731,69328,.,+,.,ID=CHS.59959;GENE_TYPE=protein_coding;STATUS=k...,CHS.59959
23833,KQ759762.1,RefSeq,gene,74341,81410,.,+,.,ID=CHS.59961;GENE_TYPE=protein_coding;STATUS=k...,CHS.59961


{'GENE_NAME', 'STATUS', 'ID', 'GENE_TYPE', 'description', 'product', 'GENCODE_GENE_NAME'}
{'GENE_NAME', 'STATUS', 'ID', 'GENE_TYPE', 'description', 'GENCODE_GENE_NAME'}
{'product'}
set()
{'description=', 'GENE_TYPE='}
{'GENE_TYPE='}


In [6]:
# Build ./chess2.1.gff from ./chess2.05.gff using the curated status of the
# novel protein-coding genes/transcripts:
#   discard    - the gene and its transcripts, exons and CDS are dropped
#   non-coding - the gene is relabelled lncRNA; its transcripts keep their exons but lose their CDS
#   coding     - records are copied unchanged (novel coding genes are sanity-checked)
genesDF=pd.read_csv("./chess2.0.novel_protein.genes.conservative.txt",sep="\t")
transDF=pd.read_csv("./chess2.0.novel_protein.transcripts.conservative.txt",sep="\t")
display(genesDF.head())
display(transDF.head())

genesDiscard=set(genesDF[genesDF['Status']=='discard']['Gene_ID']) # set of all geneIDs that need to be removed from the annotation
genesC=set(genesDF[genesDF['Status']=='coding']['Gene_ID']) # set of all coding genes - used to verify that they all are annotated as coding and that they all have corresponding CDS
genesNC=set(genesDF[genesDF['Status']=='non-coding']['Gene_ID']) # set of all geneIDs that need to be labeled as lncRNA

transDiscard=set(transDF[transDF['Status']=='discard']['Transcript_ID'])
transC=set(transDF[transDF['Status']=='coding']['Transcript_ID'])
transNC=set(transDF[transDF['Status']=='non-coding']['Transcript_ID'])

# need to make sure the geneIDS extracted from transDF of discard, NC and C match the corresponding ones from genesDF
assert(len(genesDiscard.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transDiscard])))==0),"wrong discard"
assert(len(genesC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transC])))==0),"wrong coding"
assert(len(genesNC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transNC])))==0),"wrong non-coding"

# The two helpers are defined inside this cell so that it can be run on its own.
def _parseAttributes(field):
    """Parse a GFF3 attributes column into (keys in file order, key -> value dict).

    Each pair is split on its first '=' only, so a value may itself contain '='.
    Empty fragments (e.g. after a trailing ';') are ignored.

    Raises:
        ValueError: if a non-empty fragment has no '='.
    """
    order=[]
    attributes={}
    for pair in field.rstrip("\n").split(";"):
        if not pair:
            continue
        key,sep,value=pair.partition("=")
        if not sep:
            raise ValueError("malformed GFF3 attribute (no '='): "+pair)
        if key not in attributes:
            order.append(key)
        attributes[key]=value
    return order,attributes

def _formatAttributes(order,attributes):
    """Serialize `attributes` as 'k=v;k=v': surviving keys in `order` first, then newly added keys."""
    keys=[key for key in order if key in attributes]
    keys+=[key for key in attributes if key not in order]
    return ";".join(key+"="+attributes[key] for key in keys)

# Both files are managed by `with` so the output is flushed and closed even if a
# sanity assertion fires part-way through; the input is streamed line by line.
with open("./chess2.05.gff","r") as inFP, open("./chess2.1.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # Refresh the version header; copy every other comment line verbatim.
            outFP.write("##CHESS"+version+"\n" if line.startswith("##CHESS") else line)
            continue

        lineCols=line.split("\t")
        recType=lineCols[2]
        if recType=="gene":
            attOrder,attributes=_parseAttributes(lineCols[8])
            gid=attributes["ID"]
            if gid in genesDiscard: # do not write since the gene is targeted for discarding
                continue
            if gid in genesNC: # relabel the gene as lncRNA
                attributes["GENE_TYPE"]="lncRNA"
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            if attributes["GENE_TYPE"]=="protein_coding" and attributes["STATUS"]=="novel": # coding - make sure it is in the genesC set
                assert(gid in genesC),"coding protein found; not in genesC"
            outFP.write(line)
        elif recType=="transcript":
            attOrder,attributes=_parseAttributes(lineCols[8])
            tid=attributes["ID"]
            if tid in transDiscard: # do not write since the transcript is targeted for discarding
                assert("CHS."+tid.split(".")[1] in genesDiscard),"wrong transcript marked to be discarded"
                continue
            if tid in transNC: # rewrite the attributes so the transcript reads as non-coding
                assert("CHS."+tid.split(".")[1] in genesNC),"wrong transcript marked to be changed to non-coding"
                # Removed keys are skipped by _formatAttributes, so dropping them
                # can no longer raise KeyError while the line is rebuilt.
                for att in ("GENE_TYPE","transcript_type","gene_type"):
                    attributes.pop(att,None)
                for att in ("gbkey","gene_biotype"):
                    if att in attributes:
                        attributes[att]="lncRNA"
                attributes["TYPE"]="non_coding" # appended at the end if the record had no TYPE
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            outFP.write(line)
        elif recType in ("exon","CDS"):
            _,attributes=_parseAttributes(lineCols[8])
            tid=attributes["Parent"]
            # Exons are dropped only with discarded transcripts; CDS are also
            # dropped for transcripts that become non-coding.
            if tid in transDiscard or (recType=="CDS" and tid in transNC):
                continue
            outFP.write(line)
        else:
            print('something is wrong with the type of a record: '+line)

Unnamed: 0,Status,Gene_ID,Location,NR_blast_Eval,NR_hit,SP_blast_Eval,SP_hit
0,discard,CHS.10012,chr11:113875694-113899029:+,1e-17,"CAE91339.1, unnamed protein product [Homo sapi...",-,-
1,non-coding,CHS.10056,chr11:116307531-116308521:+,3e-22,"CAE89781.1, unnamed protein product [Homo sapi...",-,-
2,non-coding,CHS.1008,chr1:29124945-29139372:+,4e-19,"EAW78405.1, hCG2021310 [Homo sapiens]",-,-
3,non-coding,CHS.10158,chr11:119057437-119059239:+,4.0000000000000003e-81,"BAC85254.1, unnamed protein product [Homo sapi...",-,-
4,discard,CHS.10242,chr11:121958716-121961024:+,2e-16,"AAL23680.1, non-small cell lung cancer RimL3b ...",-,-


Unnamed: 0,Status,Transcript_ID,Gene_ID,Location,Transcript_Length,No_of_exons,No_of_samples,Average_TPM,Maximum_TPM,NR_Eval,...,SP_Eval,SP_hit,CDD_Eval,Pfam_Eval,CDSlen,CDSreps,CDSrepeatcov,ALU_cdcov,LINE_cdcov,GencodeCall
0,coding,CHS.55578.5,CHS.55578,chr9:61672749-61674874:-,1705,2,4,1.195119,1.687937,6e-63,...,-,-,0.00837726,-,-,-,-,-,-,-
1,non-coding,CHS.20892.1,CHS.20892,chr17:722531-732316:+,9786,1,1354,3.256216,22.569984,9e-18,...,-,-,-,-,333,"Simple_repeat:49,SINE|Alu:248",89.2,74.5,0,SINE/Alu;Simple_repeat
2,non-coding,CHS.24098.4,CHS.24098,chr18:8974487-8998422:+,2451,3,187,5.632762,28.178564,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
3,non-coding,CHS.24098.3,CHS.24098,chr18:8974487-8980729:+,436,2,59,3.200027,9.097818,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
4,non-coding,CHS.24098.2,CHS.24098,chr18:8974487-8979364:+,454,2,17,2.209827,3.991817,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu


In [7]:
# Build ./chess2.1_and_refseq.gff from ./chess2.05_and_refseq.gff using the
# curated status of the novel protein-coding genes/transcripts:
#   discard    - the gene and its transcripts, exons and CDS are dropped
#   non-coding - the gene is relabelled lncRNA; its transcripts keep their exons but lose their CDS
#   coding     - records are copied unchanged (novel coding genes are sanity-checked)
# Records of any other feature type are copied through unchanged.
genesDF=pd.read_csv("./chess2.0.novel_protein.genes.conservative.txt",sep="\t")
transDF=pd.read_csv("./chess2.0.novel_protein.transcripts.conservative.txt",sep="\t")
display(genesDF.head())
display(transDF.head())

genesDiscard=set(genesDF[genesDF['Status']=='discard']['Gene_ID']) # set of all geneIDs that need to be removed from the annotation
genesC=set(genesDF[genesDF['Status']=='coding']['Gene_ID']) # set of all coding genes - used to verify that they all are annotated as coding and that they all have corresponding CDS
genesNC=set(genesDF[genesDF['Status']=='non-coding']['Gene_ID']) # set of all geneIDs that need to be labeled as lncRNA

transDiscard=set(transDF[transDF['Status']=='discard']['Transcript_ID'])
transC=set(transDF[transDF['Status']=='coding']['Transcript_ID'])
transNC=set(transDF[transDF['Status']=='non-coding']['Transcript_ID'])

# need to make sure the geneIDS extracted from transDF of discard, NC and C match the corresponding ones from genesDF
assert(len(genesDiscard.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transDiscard])))==0),"wrong discard"
assert(len(genesC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transC])))==0),"wrong coding"
assert(len(genesNC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transNC])))==0),"wrong non-coding"

version="2.1"

# The two helpers are defined inside this cell so that it can be run on its own.
def _parseAttributes(field):
    """Parse a GFF3 attributes column into (keys in file order, key -> value dict).

    Each pair is split on its first '=' only, so a value may itself contain '='.
    Empty fragments (e.g. after a trailing ';') are ignored.

    Raises:
        ValueError: if a non-empty fragment has no '='.
    """
    order=[]
    attributes={}
    for pair in field.rstrip("\n").split(";"):
        if not pair:
            continue
        key,sep,value=pair.partition("=")
        if not sep:
            raise ValueError("malformed GFF3 attribute (no '='): "+pair)
        if key not in attributes:
            order.append(key)
        attributes[key]=value
    return order,attributes

def _formatAttributes(order,attributes):
    """Serialize `attributes` as 'k=v;k=v': surviving keys in `order` first, then newly added keys."""
    keys=[key for key in order if key in attributes]
    keys+=[key for key in attributes if key not in order]
    return ";".join(key+"="+attributes[key] for key in keys)

# Both files are managed by `with` so the output is flushed and closed even if a
# sanity assertion fires part-way through; the input is streamed line by line.
with open("./chess2.05_and_refseq.gff","r") as inFP, open("./chess2.1_and_refseq.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # Refresh the version header; copy every other comment line verbatim.
            outFP.write("##CHESS"+version+"\n" if line.startswith("##CHESS") else line)
            continue

        lineCols=line.split("\t")
        recType=lineCols[2]
        if recType=="gene":
            attOrder,attributes=_parseAttributes(lineCols[8])
            gid=attributes["ID"]
            if gid in genesDiscard: # do not write since the gene is targeted for discarding
                continue
            if gid in genesNC: # relabel the gene as lncRNA
                attributes["GENE_TYPE"]="lncRNA"
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            if attributes["GENE_TYPE"]=="protein_coding" and attributes["STATUS"]=="novel": # coding - make sure it is in the genesC set
                assert(gid in genesC),"coding protein found; not in genesC"
            outFP.write(line)
        elif recType=="transcript":
            attOrder,attributes=_parseAttributes(lineCols[8])
            tid=attributes["ID"]
            if tid in transDiscard: # do not write since the transcript is targeted for discarding
                assert("CHS."+tid.split(".")[1] in genesDiscard),"wrong transcript marked to be discarded"
                continue
            if tid in transNC: # rewrite the attributes so the transcript reads as non-coding
                assert("CHS."+tid.split(".")[1] in genesNC),"wrong transcript marked to be changed to non-coding"
                # Removed keys are skipped by _formatAttributes, so dropping them
                # can no longer raise KeyError while the line is rebuilt.
                for att in ("GENE_TYPE","transcript_type","gene_type"):
                    attributes.pop(att,None)
                for att in ("gbkey","gene_biotype"):
                    if att in attributes:
                        attributes[att]="lncRNA"
                attributes["TYPE"]="non_coding" # appended at the end if the record had no TYPE
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            outFP.write(line)
        elif recType in ("exon","CDS"):
            _,attributes=_parseAttributes(lineCols[8])
            tid=attributes["Parent"]
            # Exons are dropped only with discarded transcripts; CDS are also
            # dropped for transcripts that become non-coding.
            if tid in transDiscard or (recType=="CDS" and tid in transNC):
                continue
            outFP.write(line)
        else:
            # Any other feature type is passed through untouched.
            outFP.write(line)

Unnamed: 0,Status,Gene_ID,Location,NR_blast_Eval,NR_hit,SP_blast_Eval,SP_hit
0,discard,CHS.10012,chr11:113875694-113899029:+,1e-17,"CAE91339.1, unnamed protein product [Homo sapi...",-,-
1,non-coding,CHS.10056,chr11:116307531-116308521:+,3e-22,"CAE89781.1, unnamed protein product [Homo sapi...",-,-
2,non-coding,CHS.1008,chr1:29124945-29139372:+,4e-19,"EAW78405.1, hCG2021310 [Homo sapiens]",-,-
3,non-coding,CHS.10158,chr11:119057437-119059239:+,4.0000000000000003e-81,"BAC85254.1, unnamed protein product [Homo sapi...",-,-
4,discard,CHS.10242,chr11:121958716-121961024:+,2e-16,"AAL23680.1, non-small cell lung cancer RimL3b ...",-,-


Unnamed: 0,Status,Transcript_ID,Gene_ID,Location,Transcript_Length,No_of_exons,No_of_samples,Average_TPM,Maximum_TPM,NR_Eval,...,SP_Eval,SP_hit,CDD_Eval,Pfam_Eval,CDSlen,CDSreps,CDSrepeatcov,ALU_cdcov,LINE_cdcov,GencodeCall
0,coding,CHS.55578.5,CHS.55578,chr9:61672749-61674874:-,1705,2,4,1.195119,1.687937,6e-63,...,-,-,0.00837726,-,-,-,-,-,-,-
1,non-coding,CHS.20892.1,CHS.20892,chr17:722531-732316:+,9786,1,1354,3.256216,22.569984,9e-18,...,-,-,-,-,333,"Simple_repeat:49,SINE|Alu:248",89.2,74.5,0,SINE/Alu;Simple_repeat
2,non-coding,CHS.24098.4,CHS.24098,chr18:8974487-8998422:+,2451,3,187,5.632762,28.178564,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
3,non-coding,CHS.24098.3,CHS.24098,chr18:8974487-8980729:+,436,2,59,3.200027,9.097818,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
4,non-coding,CHS.24098.2,CHS.24098,chr18:8974487-8979364:+,454,2,17,2.209827,3.991817,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu


In [8]:
# Build ./chess2.1_assembly.gff from ./chess2.05_assembly.gff using the curated
# status of the novel protein-coding genes/transcripts:
#   discard    - the gene and its transcripts, exons and CDS are dropped
#   non-coding - the gene is relabelled lncRNA; its transcripts keep their exons but lose their CDS
#   coding     - records are copied unchanged (novel coding genes are sanity-checked)
# Records of any other feature type are copied through unchanged.
genesDF=pd.read_csv("./chess2.0.novel_protein.genes.conservative.txt",sep="\t")
transDF=pd.read_csv("./chess2.0.novel_protein.transcripts.conservative.txt",sep="\t")
display(genesDF.head())
display(transDF.head())

genesDiscard=set(genesDF[genesDF['Status']=='discard']['Gene_ID']) # set of all geneIDs that need to be removed from the annotation
genesC=set(genesDF[genesDF['Status']=='coding']['Gene_ID']) # set of all coding genes - used to verify that they all are annotated as coding and that they all have corresponding CDS
genesNC=set(genesDF[genesDF['Status']=='non-coding']['Gene_ID']) # set of all geneIDs that need to be labeled as lncRNA

transDiscard=set(transDF[transDF['Status']=='discard']['Transcript_ID'])
transC=set(transDF[transDF['Status']=='coding']['Transcript_ID'])
transNC=set(transDF[transDF['Status']=='non-coding']['Transcript_ID'])

# need to make sure the geneIDS extracted from transDF of discard, NC and C match the corresponding ones from genesDF
assert(len(genesDiscard.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transDiscard])))==0),"wrong discard"
assert(len(genesC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transC])))==0),"wrong coding"
assert(len(genesNC.symmetric_difference(set(["CHS."+t.split(".")[1] for t in transNC])))==0),"wrong non-coding"

version="2.1"

# The two helpers are defined inside this cell so that it can be run on its own.
def _parseAttributes(field):
    """Parse a GFF3 attributes column into (keys in file order, key -> value dict).

    Each pair is split on its first '=' only, so a value may itself contain '='.
    Empty fragments (e.g. after a trailing ';') are ignored.

    Raises:
        ValueError: if a non-empty fragment has no '='.
    """
    order=[]
    attributes={}
    for pair in field.rstrip("\n").split(";"):
        if not pair:
            continue
        key,sep,value=pair.partition("=")
        if not sep:
            raise ValueError("malformed GFF3 attribute (no '='): "+pair)
        if key not in attributes:
            order.append(key)
        attributes[key]=value
    return order,attributes

def _formatAttributes(order,attributes):
    """Serialize `attributes` as 'k=v;k=v': surviving keys in `order` first, then newly added keys."""
    keys=[key for key in order if key in attributes]
    keys+=[key for key in attributes if key not in order]
    return ";".join(key+"="+attributes[key] for key in keys)

# Both files are managed by `with` so the output is flushed and closed even if a
# sanity assertion fires part-way through; the input is streamed line by line.
with open("./chess2.05_assembly.gff","r") as inFP, open("./chess2.1_assembly.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # Refresh the version header; copy every other comment line verbatim.
            outFP.write("##CHESS"+version+"\n" if line.startswith("##CHESS") else line)
            continue

        lineCols=line.split("\t")
        recType=lineCols[2]
        if recType=="gene":
            attOrder,attributes=_parseAttributes(lineCols[8])
            gid=attributes["ID"]
            if gid in genesDiscard: # do not write since the gene is targeted for discarding
                continue
            if gid in genesNC: # relabel the gene as lncRNA
                attributes["GENE_TYPE"]="lncRNA"
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            if attributes["GENE_TYPE"]=="protein_coding" and attributes["STATUS"]=="novel": # coding - make sure it is in the genesC set
                assert(gid in genesC),"coding protein found; not in genesC"
            outFP.write(line)
        elif recType=="transcript":
            attOrder,attributes=_parseAttributes(lineCols[8])
            tid=attributes["ID"]
            if tid in transDiscard: # do not write since the transcript is targeted for discarding
                assert("CHS."+tid.split(".")[1] in genesDiscard),"wrong transcript marked to be discarded"
                continue
            if tid in transNC: # rewrite the attributes so the transcript reads as non-coding
                assert("CHS."+tid.split(".")[1] in genesNC),"wrong transcript marked to be changed to non-coding"
                # Removed keys are skipped by _formatAttributes, so dropping them
                # can no longer raise KeyError while the line is rebuilt.
                for att in ("GENE_TYPE","transcript_type","gene_type"):
                    attributes.pop(att,None)
                for att in ("gbkey","gene_biotype"):
                    if att in attributes:
                        attributes[att]="lncRNA"
                attributes["TYPE"]="non_coding" # appended at the end if the record had no TYPE
                outFP.write("\t".join(lineCols[:-1]+[_formatAttributes(attOrder,attributes)+"\n"]))
                continue
            outFP.write(line)
        elif recType in ("exon","CDS"):
            _,attributes=_parseAttributes(lineCols[8])
            tid=attributes["Parent"]
            # Exons are dropped only with discarded transcripts; CDS are also
            # dropped for transcripts that become non-coding.
            if tid in transDiscard or (recType=="CDS" and tid in transNC):
                continue
            outFP.write(line)
        else:
            # Any other feature type is passed through untouched.
            outFP.write(line)

Unnamed: 0,Status,Gene_ID,Location,NR_blast_Eval,NR_hit,SP_blast_Eval,SP_hit
0,discard,CHS.10012,chr11:113875694-113899029:+,1e-17,"CAE91339.1, unnamed protein product [Homo sapi...",-,-
1,non-coding,CHS.10056,chr11:116307531-116308521:+,3e-22,"CAE89781.1, unnamed protein product [Homo sapi...",-,-
2,non-coding,CHS.1008,chr1:29124945-29139372:+,4e-19,"EAW78405.1, hCG2021310 [Homo sapiens]",-,-
3,non-coding,CHS.10158,chr11:119057437-119059239:+,4.0000000000000003e-81,"BAC85254.1, unnamed protein product [Homo sapi...",-,-
4,discard,CHS.10242,chr11:121958716-121961024:+,2e-16,"AAL23680.1, non-small cell lung cancer RimL3b ...",-,-


Unnamed: 0,Status,Transcript_ID,Gene_ID,Location,Transcript_Length,No_of_exons,No_of_samples,Average_TPM,Maximum_TPM,NR_Eval,...,SP_Eval,SP_hit,CDD_Eval,Pfam_Eval,CDSlen,CDSreps,CDSrepeatcov,ALU_cdcov,LINE_cdcov,GencodeCall
0,coding,CHS.55578.5,CHS.55578,chr9:61672749-61674874:-,1705,2,4,1.195119,1.687937,6e-63,...,-,-,0.00837726,-,-,-,-,-,-,-
1,non-coding,CHS.20892.1,CHS.20892,chr17:722531-732316:+,9786,1,1354,3.256216,22.569984,9e-18,...,-,-,-,-,333,"Simple_repeat:49,SINE|Alu:248",89.2,74.5,0,SINE/Alu;Simple_repeat
2,non-coding,CHS.24098.4,CHS.24098,chr18:8974487-8998422:+,2451,3,187,5.632762,28.178564,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
3,non-coding,CHS.24098.3,CHS.24098,chr18:8974487-8980729:+,436,2,59,3.200027,9.097818,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu
4,non-coding,CHS.24098.2,CHS.24098,chr18:8974487-8979364:+,454,2,17,2.209827,3.991817,-,...,-,-,-,-,-,-,-,-,-,SINE/Alu


In [9]:
# lastly need to modify all three supplementary files: mapfile, genes and transcripts
# Copy mapfile2.05.txt to mapfile2.1.txt, dropping every row whose last
# tab-separated field is listed in transDiscard or genesDiscard.
# Both handles are managed by the same `with`, so they are closed even if the loop raises.
with open('./mapfile2.05.txt',"r") as inFP, open('./mapfile2.1.txt',"w") as outFP:
    for line in inFP:
        # Strip the line terminator before splitting: otherwise the last field
        # ends in "\n" and can never match an ID in the discard collections,
        # which silently kept every row.
        lineCols=line.rstrip("\r\n").split("\t")
        if lineCols[-1] in transDiscard or lineCols[-1] in genesDiscard:
            continue
        # write the untouched original line (terminator included)
        outFP.write(line)

In [18]:
# Rewrite chess2.05.genes into chess2.1.genes:
#   * the header line is copied verbatim;
#   * a row is dropped entirely if ANY gene ID in column 2 is in genesDiscard;
#   * a row whose gene IDs are in genesNC gets column 0 changed from
#     "protein_coding" to "lncRNA" (all IDs on the row must be in genesNC).
genesEvaluated=[] # every gene ID seen in the input, kept for verification later
wrongCoords=[]
header=True
# Both handles are managed by the same `with`, so they are closed even if an assertion fires.
with open('./chess2.05.genes','r') as inFP, open('./chess2.1.genes',"w") as outFP:
    for line in inFP:
        if header: # copy the header through unchanged
            outFP.write(line)
            header=False
            continue
        lineCols=line.split("\t")
        # Remember the type as read from the file. The loop below overwrites
        # lineCols[0], so asserting against lineCols[0] would fail on the second
        # gene of any multi-gene locus that is being reclassified.
        origType=lineCols[0]
        gids=lineCols[2].split(",")
        locs=lineCols[3].split(",")
        assert len(gids)==len(locs),"something is wrong with the following locus: "+lineCols[2]+"\t"+lineCols[3]
        newLocs=[]
        newGIDs=[]
        genesNC_flag=[]
        skip=False
        changeStatus=False
        for gid,loc in zip(gids,locs):
            genesEvaluated.append(gid) # append geneID for verification later
            if gid in genesDiscard:
                # one discarded gene causes the whole row to be dropped
                skip=True
                continue
            elif gid in genesNC: # if change of status - need to verify that all in that grouping have a change of status - otherwise a problem
                changeStatus=True
                assert origType=="protein_coding","wrong status: "+origType
                lineCols[0]="lncRNA"
                genesNC_flag.append(1)
            newGIDs.append(gid)
            newLocs.append(loc)
        if not skip:
            if changeStatus:
                # every gene on the row must have been reclassified, not just some
                assert len(genesNC_flag)==len(gids),"wrong change of status on a locus, maybe separate then?: "+",".join([str(x) for x in genesNC_flag])
            # NOTE(review): when nothing was discarded newGIDs/newLocs equal gids/locs,
            # so these joins reproduce the original columns.
            lineCols[3]=",".join(newLocs)
            lineCols[2]=",".join(newGIDs)
            outFP.write('\t'.join(lineCols))

In [22]:
# Rewrite chess2.05.transcripts into chess2.1.transcripts:
#   * the header line is copied verbatim;
#   * rows whose column 0 is in transDiscard are dropped;
#   * rows whose column 0 is in transNC get column 3 changed from
#     "protein_coding" to "lncRNA".
# NOTE(review): an earlier cell splits TranscriptID on "," for chess2.0.transcripts;
# if column 0 here can also hold several comma-separated IDs, the exact-match
# membership tests below will miss them - confirm against the file.
transcriptsEvaluated=[]
wrongCoords=[]
header=True
# Both handles are managed by the same `with`, so they are closed even if an assertion fires.
with open("./chess2.05.transcripts") as inFP, open("./chess2.1.transcripts","w") as outFP:
    for line in inFP:
        if header: # copy the header through unchanged
            outFP.write(line)
            header=False
            continue
        lineCols=line.split("\t")
        tid=lineCols[0]

        if tid in transDiscard:
            # Actually drop the row. Previously a never-read `skip` flag was set
            # and the line was still written, so nothing was removed.
            continue
        if tid in transNC:
            assert lineCols[3]=="protein_coding","wrong status: "+tid
            lineCols[3]='lncRNA'
        outFP.write('\t'.join(lineCols))