In [1]:
import pandas as pd
import os
import glob
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

In [None]:
# generate gff of novel protein-coding genes
novelProteinGeneIDs=[]
with open("./chess2.0.novel_protein.genes","r") as fp:
    for line in fp.readlines():
        chsID=line.split("\t")[0]
        if chsID[:3]=="CHS":
            novelProteinGeneIDs.append(chsID)
            
outFP=open("chess2.0.novel_protein.gff","w+")
with open("./chess2.0.gff","r") as fp:
    for line in fp.readlines():
        if line[0]=="#":
            continue
        lineCols=line.split("\t")
        curAttributes=lineCols[-1].rstrip("\n")
        curType=lineCols[2]
        if curType=="gene":
            curGeneID=curAttributes.split(";")[0].lstrip("ID=")
            if curGeneID in novelProteinGeneIDs:
                outFP.write(line)
        elif curType=="transcript":
            curGeneID=curAttributes.split("Parent=")[-1].split(";")[0]
            if curGeneID in novelProteinGeneIDs:
                outFP.write(line)
        elif curType=="exon":
            curTranscriptID=curAttributes.split("Parent=")[-1].split(";")[0]
            curGeneID=".".join(curTranscriptID.split(".")[:-1])
            if curGeneID in novelProteinGeneIDs:
                outFP.write(line)
        elif curType=="CDS":
            curTranscriptID=curAttributes.split("Parent=")[-1].split(";")[0]
            curGeneID=".".join(curTranscriptID.split(".")[:-1])
            if curGeneID in novelProteinGeneIDs:
                outFP.write(line)
        else:
            print("wrong type: "+curType)
            break

In [None]:
# load mapfile
mapDF=pd.read_csv("./mapfile.txt",sep="\t",names=["name","ID"])

# load novel gff
gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
df=pd.read_csv("./chess2.0.novel_protein.gff",names=gff3Cols,sep="\t")

# load fantom fantom/NR/SP matches
extra=pd.read_csv("./fantom/ALL.intergenic.transcripts.extra",sep='\t')
extra["ID"]=extra.Fantom_id.str.split("|",expand=True)[1]
extra["fantom_orfID"]=extra.Fantom_id.str.split("|",expand=True)[0]

In [None]:
# now also select only those novel protein_coding genes that are in FANTOM
dfFantom=df[(df["attributes"].str.contains("STATUS=known_fantom"))&(df["type"]=="gene")].reset_index(drop=True)
dfFantom["ID"]=dfFantom["attributes"].str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
setGeneIds=set(dfFantom["ID"])
df["Parent_full"]=df["attributes"].str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["Parent"]="CHS."+df["Parent_full"].str.extract('(\d+)',expand=True)
dfFantomCDS=df[(df["Parent"].isin(setGeneIds))&(df["type"]=="CDS")].reset_index(drop=True)
dfFantomCDS["start"]=dfFantomCDS["start"].astype(int)
dfFantomCDS["end"]=dfFantomCDS["end"].astype(int)
# now merge in the the information from the mapfile
dfFantomCDS=dfFantomCDS.merge(mapDF,how="left",left_on="Parent_full",right_on="ID").drop("ID",axis=1)
dfFantomCDS=dfFantomCDS[~(dfFantomCDS["name"].str.contains("ALL_"))]
dfFantomCDS=dfFantomCDS.sort_values(by="Parent").reset_index(drop=True)
dfFantomCDS=dfFantomCDS.drop_duplicates(["name","Parent"]).reset_index(drop=True)
del dfFantom
del df

In [4]:
extraCols=["Name","Chr","Length","ExonNo","AvgTPM","MaxTPM","ORFlen","Fantom_start","Fantom_end","Fantom_code","Fantom_id","Fantom_nex","Fantom_sign","SP_eval","SP_len","SP_PE","SP_description","NR_eval","NR_len","NR_description","ID","fantom_orfID"]
dfFantomCDS=dfFantomCDS.merge(extra[extraCols],how="left",left_on="name",right_on="ID")
# do some cleanup first
dropCols=["source","type","score","phase","attributes","Parent","Chr","Fantom_nex","ID","Fantom_id"]
dfFantomCDS.drop(dropCols,axis=1,inplace=True)
dfFantomCDS.to_csv("./fantom/allFantom.csv",index=False)
dfFantomCDS

Unnamed: 0,seqid,start,end,strand,Parent_full,name,Name,Length,ExonNo,AvgTPM,MaxTPM,ORFlen,Fantom_start,Fantom_end,Fantom_code,Fantom_sign,SP_eval,SP_len,SP_PE,SP_description,NR_eval,NR_len,NR_description,fantom_orfID
0,chr11,119058814,119059239,+,CHS.10158.2,MICT00000068553.1,ALL_05193993,171,1,1.461500,1.461500,120,119057437,119059239,c,+,1.000000e+00,0,0,-,1.000000e+00,0,-,ENSG00000160695.10
1,chr11,119058814,119059239,+,CHS.10158.2,MICT00000068553.1,ALL_04530967,236,1,1.882305,1.882305,120,119057437,119059239,c,+,1.000000e+00,0,0,-,1.000000e+00,0,-,ENSG00000160695.10
2,chr11,119058814,119059239,+,CHS.10158.2,MICT00000068553.1,ALL_04220283,667,1,1.250316,2.265959,210,119057437,119059239,c,+,1.000000e+00,0,0,-,1.000000e+00,0,-,ENSG00000160695.10
3,chr11,119058814,119059239,+,CHS.10158.2,MICT00000068553.1,ALL_04220279,5848,2,1.360984,3.915396,588,119057437,119059239,=,+,1.000000e+00,0,0,-,4.000000e-81,198,"BAC85254.1, unnamed protein product [Homo sapiens]",ENSG00000160695.10
4,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_06325522,242,1,0.947227,0.947227,142,49536677,49538804,e,-,1.000000e+00,0,0,-,1.000000e+00,0,-,CATG00000011894.1
5,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_06325521,221,1,6.389284,6.389284,0,49536677,49538804,o,-,1.000000e+00,0,0,-,1.000000e+00,0,-,CATG00000011894.1
6,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_06325520,183,2,5.469804,5.469804,0,49536677,49538804,o,-,1.000000e+00,0,0,-,1.000000e+00,0,-,CATG00000011894.1
7,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_05970551,430,1,1.229456,1.368492,142,49536677,49538804,e,-,1.000000e+00,0,0,-,1.000000e+00,0,-,CATG00000011894.1
8,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_05970550,659,1,0.964054,1.287600,195,49536677,49538804,e,-,3.850000e-19,195,2,"sp|Q8N7I0|GVQW1_HUMAN, Protein GVQW1 OS=Homo sapiens GN=GVQW1 PE=2 SV=1",1.000000e+00,0,-,CATG00000011894.1
9,chr12,49536886,49536896,-,CHS.11569.1,MICT00000078152.1,ALL_05970548,2791,2,0.521273,1.011657,405,49536677,49538804,=,-,2.460000e-34,174,5,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",9.000000e-44,103,"EHH59362.1, hypothetical protein EGM_09449, partial [Macaca fascicularis]",CATG00000011894.1


In [39]:
# now separate known_fantom into those that contain "=" fantom code and those that do not contain it
gffcompareListOrdered=["=","c","k","j","e","o","s","x","i","y","p","r","u"]
def getGreatest(row):
    if row["SP_eval"]<row["NR_eval"]:
        return row["SP_eval"]
    elif row["NR_eval"]<row["SP_eval"]:
        return row["NR_eval"]
    else:
        return row["NR_eval"]

noIntronChainMatchDF=pd.DataFrame([])
intronChainMatchDF=pd.DataFrame([])
for key in set(dfFantomCDS["name"]):
    tmp=dfFantomCDS[dfFantomCDS["name"]==key].reset_index(drop=True)
#     tmp["best_eval"]=tmp.apply(lambda row: getGreatest(row),axis=1)
#     tmp=tmp[tmp["best_eval"]<=float(1e-15)].reset_index(drop=True)
    tmp=tmp[~((tmp["NR_description"]=="-")&(tmp["SP_description"]=="-"))]
    if len(tmp)==0:
        print("no good evalues present in "+key)
        tmp=dfFantomCDS[dfFantomCDS["name"]==key].reset_index(drop=True)
    if not "=" in set(tmp["Fantom_code"]):
        noIntronChainMatchDF=pd.concat([noIntronChainMatchDF,tmp])
    else:
        intronChainMatchDF=pd.concat([intronChainMatchDF,tmp])
        tmp["Fantom_code"]=tmp["Fantom_code"].astype('category')
        tmp["Fantom_code"]=tmp["Fantom_code"].cat.set_categories(gffcompareListOrdered, ordered=True)
        tmp.sort_values(by=["Fantom_code"],inplace=True)
        tmp.reset_index(drop=True,inplace=True)
        tmp.to_csv("./fantom/res/eq_"+key+".csv",index=False)
noIntronChainMatchDF.reset_index(drop=True,inplace=True)
intronChainMatchDF.reset_index(drop=True,inplace=True)
noIntronChainMatchDF

no good evalues present in MICT00000158257.1


Unnamed: 0,seqid,start,end,strand,Parent_full,name,Name,Length,ExonNo,AvgTPM,MaxTPM,ORFlen,Fantom_start,Fantom_end,Fantom_code,Fantom_sign,SP_eval,SP_len,SP_PE,SP_description,NR_eval,NR_len,NR_description,fantom_orfID
0,chr9,35603546,35603770,-,CHS.55399.2,FTMT23400003453.1,ALL_28135477,3705,1,3.966533,22.643896,222,35602914,35604227,k,-,1.0,0,0,-,7e-40,74,"CAF16253.1, unnamed protein product, partial [Homo sapiens]",ENSG00000215187.5
1,chr6,31070084,31070563,+,CHS.45935.1,ENCT00000371074.1,ALL_23765749,3807,1,5.835164,37.819321,477,31068368,31069485,k,+,9.48e-23,174,5,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",9.999999999999999e-27,171,"CAD69816.1, unnamed protein product [Homo sapiens]",CATG00000083488.1
2,chr6,31070084,31070563,+,CHS.45935.1,ENCT00000371074.1,ALL_23557325,2431,1,3.406167,44.336052,477,31068368,31069485,k,+,9.48e-23,174,5,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",9.999999999999999e-27,171,"CAD69816.1, unnamed protein product [Homo sapiens]",CATG00000083488.1
3,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_15321531,5703,2,3.380293,9.305055,261,87359247,87372067,o,+,1.0,0,0,-,6e-15,58,"CAF16784.1, unnamed protein product, partial [Homo sapiens]",CATG00000043739.1
4,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_15321530,7309,1,2.827613,16.850754,261,87359247,87372067,c,+,1.0,0,0,-,7e-16,58,"CAF16784.1, unnamed protein product, partial [Homo sapiens]",CATG00000043739.1
5,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_15321481,8463,2,2.418546,4.968382,261,87359247,87372067,o,+,1.0,0,0,-,6e-15,58,"CAF16784.1, unnamed protein product, partial [Homo sapiens]",CATG00000043739.1
6,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_14572067,5779,2,2.862295,7.274134,261,87359247,87372067,o,+,1.0,0,0,-,6e-15,58,"CAF16784.1, unnamed protein product, partial [Homo sapiens]",CATG00000043739.1
7,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_14572058,7451,2,2.936582,7.968475,159,87359247,87372067,o,+,1.0,0,0,-,4e-15,135,"BAC85286.1, unnamed protein product [Homo sapiens]",CATG00000043739.1
8,chr2,87369744,87370007,+,CHS.30108.1,MICT00000193437.1,ALL_14572042,8329,2,3.232645,9.655789,261,87359247,87372067,o,+,1.0,0,0,-,6e-15,58,"CAF16784.1, unnamed protein product, partial [Homo sapiens]",CATG00000043739.1
9,chr20,309825,310502,+,CHS.32671.3,FTMT28000000013.1,ALL_17105213,9631,2,3.912354,3.912354,675,310737,311245,k,+,1.0,0,0,-,1e-17,204,"ELK04063.1, Zinc finger CCHC domain-containing protein 3 [Pteropus alecto]",ENSG00000177764.6


In [40]:
# now lets create order dictionary for codes
gffcompareListOrdered=["=","c","k","j","e","o","s","x","i","y","p","r","u"]

resORF=pd.DataFrame([])
resFantom=pd.DataFrame([])
for key in set(noIntronChainMatchDF["name"]):
    # then we can order each FANTOM group with that ordering
    tmp=noIntronChainMatchDF[noIntronChainMatchDF["name"]==key].reset_index(drop=True)
    tmp["Fantom_code"]=tmp["Fantom_code"].astype('category')
    tmp["Fantom_code"]=tmp["Fantom_code"].cat.set_categories(gffcompareListOrdered, ordered=True)
    tmp.sort_values(by=["Fantom_code"],inplace=True)
    tmp.reset_index(drop=True,inplace=True)
    tmp.to_csv("./fantom/res/"+key+".csv",index=False)
    bestFantom=tmp[tmp["Fantom_code"]==list(tmp["Fantom_code"])[0]].reset_index(drop=True)
    bestFantom["best_eval"]=bestFantom.apply(lambda row: getGreatest(row),axis=1)
    bestFantom.sort_values(by="best_eval",ascending=True,inplace=True)
    bestFantom.reset_index(drop=True,inplace=True)
    resFantom=pd.concat([resFantom,bestFantom.head(1)]).reset_index(drop=True)
    
# resFantom["CDS_len"]=resFantom["end"]-resFantom["start"]
# the top one would represent the best comparison.

orf=pd.DataFrame([],columns=["Name","Gene_id","location","exons","segs","ORF"])

allIDs=set(dfFantomCDS["Name"])
with open("./fantom/ALL.intergenic_w_repeats.ORF.fa.ssv","r") as fp:
    for line in fp.readlines():
        allID=line.split(" ")[0].lstrip(">")
        if allID in allIDs:
            tmp=pd.DataFrame(line.rstrip("\n").split(" ")).T
            tmp.columns=["Name","Gene_id","location","exons","segs","ORF"]
            orf=pd.concat([orf,tmp])
orf.reset_index(drop=True,inplace=True)          
orf["Name"]=orf.Name.str.strip(">")
orf["Gene_id"]=orf["Gene_id"].str.split("=",expand=True)[1]
orf["Chr"]=orf["location"].str.split(":",expand=True)[1].str.split("|",expand=True)[0]
orf["assembled_start"]=orf["location"].str.split("|",expand=True)[1].str.split("-",expand=True)[0]
orf["assembled_end"]=orf["location"].str.split("|",expand=True)[1].str.split("-",expand=True)[1]
orf["ORFend"]=orf["ORF"].str.split(":",expand=True)[1].str.split("-",expand=True)[1].str.extract('(\d+)',expand=True).astype(int)
orf["ORFstart"]=orf["ORF"].str.split(":",expand=True)[1].str.split("-",expand=True)[0].astype(int)
orf.drop(["Chr","location","Gene_id"],axis=1,inplace=True)
orf["assembled_strand"]=np.where(orf['ORF'].str.contains("\(-\)"),"-","+")
orf["ORF"]=orf["ORF"].str.split("\(",expand=True)[0]

def getOrfStart(row):
    start=0
    end=0
    segs=row["segs"].split(":")[1].split(",")
    segPairs=[(int(x.split("-")[0]),int(x.split("-")[1])) for x in segs]
    exons=row["exons"].split(":")[1].split(",")
    exonStarts=[int(x.split("-")[0]) for x in exons]
    exonPairs=[(int(x.split("-")[0]),int(x.split("-")[1])) for x in exons]
    orfStart=int(row["ORF"].split(":")[1].split("-")[0])
    orfEnd=int(row["ORF"].split(":")[1].split("-")[1].rstrip("\n"))
    # first find where start is
    startSeg=0
    for sp in segPairs:
        if sp[1]>orfStart:
            break
        else:
            startSeg+=1
    endSeg=0
    for sp in segPairs:
        if sp[1]>orfEnd:
            break
        else:
            endSeg+=1
    # get start
    for ep in exonPairs:
        if ep[0]+orfStart>ep[1]:
            orfStart=(ep[0]+orfStart)-ep[1]
        else:
            start=ep[0]+orfStart
            break
    for ep in exonPairs:
        if ep[0]+orfEnd>ep[1]:
            end=ep[0]+orfEnd
            orfEnd=(ep[0]+orfEnd)-ep[1]
        else:
            end=ep[0]+orfEnd
            break
    return str(start)+":"+str(end)

orf["orf"]=orf.apply(lambda row: getOrfStart(row),axis=1)
orf["orf_start"]=orf["orf"].str.split(":",expand=True)[0].astype(int)
orf["orf_end"]=orf["orf"].str.split(":",expand=True)[1].astype(int)
orf.drop(["exons","segs","ORF","orf","ORFstart","ORFend"],axis=1,inplace=True)
orf.reset_index(drop=True,inplace=True)
orf

Unnamed: 0,Name,assembled_start,assembled_end,assembled_strand,orf_start,orf_end
0,ALL_02487065,55868394,55880304,+,55880105,55880308
1,ALL_02237442,55868878,55880999,+,55871533,55879794
2,ALL_02681925,55876059,55881153,+,55880921,55881154
3,ALL_01989129,55877504,55881330,+,55880921,55881166
4,ALL_02681936,55877926,55880312,+,55880102,55880305
5,ALL_02681940,55878325,55879779,+,55879531,55879659
6,ALL_01989137,55878661,55880311,+,55880102,55880305
7,ALL_01989140,55878896,55881039,+,55880102,55880305
8,ALL_01989147,55879087,55879961,+,55879826,55879961
9,ALL_00067510,55879158,55879450,+,55879298,55879318


In [41]:
resFantom_test=resFantom.merge(orf,how="left",left_on="Name",right_on="Name")
resFantom_test=resFantom_test.dropna(axis=0).reset_index(drop=True)
resFantom_test["start"]=resFantom_test["start"].astype(int)
resFantom_test["end"]=resFantom_test["end"].astype(int)
resFantom_test["assembled_start"]=resFantom_test["assembled_start"].astype(int)
resFantom_test["assembled_end"]=resFantom_test["assembled_end"].astype(int)
resFantom_test["outStart"]=np.where((resFantom_test["start"]-resFantom_test["assembled_start"])<0,True,False)
resFantom_test["outEnd"]=np.where((resFantom_test["assembled_end"]-resFantom_test["end"])<0,True,False)
resFantom_test[["Parent_full",
                "name",
                "Name",
                "start",
                "end",
                "assembled_start",
                "assembled_end",
                "Fantom_start",
                "Fantom_end",
                "outStart",
                "outEnd",
                "ExonNo",
                "AvgTPM",
                "MaxTPM",
                "Fantom_code",
                "SP_eval",
                "SP_description",
                "NR_eval",
                "NR_description"]]

Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.41347.3,ENCT00000324151.1,ALL_20809768,131726035,131726337,131725882,131726267,131725079,131726346,False,True,1,9.08366,33.762402,c,1.0,-,5e-09,"EAX02611.1, hCG2039337, isoform CRA_b, partial [Homo sapiens]"
1,CHS.23040.1,ENCT00000177721.1,ALL_11428472,68024692,68025033,68024584,68025054,68020112,68027582,False,False,1,1.333096,2.48969,c,1.0100000000000001e-29,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",1.0,-
2,CHS.3275.1,ENCT00000032688.1,ALL_00295396,155308176,155308409,155307802,155308663,155304469,155308581,False,False,1,9.520388,241.121918,c,1.0,-,9.999999999999999e-27,"BAB21923.1, hypothetical protein [Macaca fascicularis]"
3,CHS.50873.1,ENCT00000400721.1,ALL_25763102,73527683,73528081,73527487,73528488,73523599,73534522,False,False,1,4.832366,25.162094,c,1.96e-38,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",1e-49,"EHH64004.1, hypothetical protein EGM_17106, partial [Macaca fascicularis]"
4,CHS.55399.2,FTMT23400003453.1,ALL_28135477,35603546,35603770,35601573,35605277,35602914,35604227,False,False,1,3.966533,22.643896,k,1.0,-,7e-40,"CAF16253.1, unnamed protein product, partial [Homo sapiens]"
5,CHS.57705.1,ENCT00000465331.1,ALL_30255516,24314068,24314643,24315450,24315989,24311157,24319612,True,False,1,1.561953,1.561953,c,5.73e-18,"sp|Q96MD7|CI085_HUMAN, Uncharacterized protein C9orf85 OS=Homo sapiens GN=C9orf85 PE=1 SV=1",1.0,-
6,CHS.45935.1,ENCT00000371074.1,ALL_23765749,31070084,31070563,31068283,31072089,31068368,31069485,False,False,1,5.835164,37.819321,k,9.48e-23,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",9.999999999999999e-27,"CAD69816.1, unnamed protein product [Homo sapiens]"
7,CHS.14817.2,ENCT00000121847.1,ALL_07710630,113393386,113393763,113372202,113394075,113392611,113394084,False,False,1,10.239079,16.813429,k,3.8499999999999996e-20,"sp|Q8N7I0|GVQW1_HUMAN, Protein GVQW1 OS=Homo sapiens GN=GVQW1 PE=2 SV=1",6e-32,"BAC85397.1, unnamed protein product [Homo sapiens]"
8,CHS.43561.5,FTMT22000006434.1,ALL_22072742,108388531,108388803,108382257,108392896,108382182,108388439,False,False,1,2.972518,20.336881,k,8.32e-25,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",3e-37,"EHH18952.1, hypothetical protein EGK_19543, partial [Macaca mulatta]"
9,CHS.30108.1,MICT00000193437.1,ALL_15321530,87369744,87370007,87363766,87371074,87359247,87372067,False,False,1,2.827613,16.850754,c,1.0,-,7e-16,"CAF16784.1, unnamed protein product, partial [Homo sapiens]"


In [42]:
tmpDF=pd.read_csv("./fantom/res/HBMT00000157570.1.csv")
tmpDF=tmpDF.merge(orf,how="left",left_on="Name",right_on="Name")
tmpDF.dropna(axis=0,inplace=True)
tmpDF["start"]=tmpDF["start"].astype(int)
tmpDF["end"]=tmpDF["end"].astype(int)
tmpDF["assembled_start"]=tmpDF["assembled_start"].astype(int)
tmpDF["assembled_end"]=tmpDF["assembled_end"].astype(int)
tmpDF["outStart"]=np.where((tmpDF["start"]-tmpDF["assembled_start"])<0,True,False)
tmpDF["outEnd"]=np.where((tmpDF["assembled_end"]-tmpDF["end"])<0,True,False)
tmpDF=tmpDF[~((tmpDF["NR_description"]=="-")&(tmpDF["SP_description"]=="-"))]
tmpDF[["Parent_full",
                "name",
                "Name",
                "start",
                "end",
                "assembled_start",
                "assembled_end",
                "Fantom_start",
                "Fantom_end",
                "outStart",
                "outEnd",
                "ExonNo",
                "AvgTPM",
                "MaxTPM",
                "Fantom_code",
                "SP_eval",
                "SP_description",
                "NR_eval",
                "NR_description"]].sort_values(by=["Fantom_code","AvgTPM"],ascending=[True,False])

Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.7549.3,HBMT00000157570.1,ALL_03990780,130213856,130214029,130213526,130220172,130213537,130252017,False,False,2,1.782761,1.782761,j,7.69e-15,"sp|Q6UX73|CP089_HUMAN, UPF0764 protein C16orf89 OS=Homo sapiens GN=C16orf89 PE=1 SV=2",4e-17,"EHH19897.1, hypothetical protein EGK_02637, partial [Macaca mulatta]"


In [43]:
#=====================================================================
#=====================================================================
#=====================================================================
#=====================================================================
# NOW WE SHALL INVESTIGATE "=" transcripts
#=====================================================================
#=====================================================================
#=====================================================================
#=====================================================================
pd.set_option('display.max_rows', 100)

In [44]:
# first need to look for the low e-values
intronChainMatchDF["best_eval"]=intronChainMatchDF.apply(lambda row: getGreatest(row),axis=1)
intronChainMatchDF[intronChainMatchDF["Fantom_code"]=="="].sort_values(by=["Parent_full","best_eval"],ascending=False).reset_index(drop=True)

Unnamed: 0,seqid,start,end,strand,Parent_full,name,Name,Length,ExonNo,AvgTPM,MaxTPM,ORFlen,Fantom_start,Fantom_end,Fantom_code,Fantom_sign,SP_eval,SP_len,SP_PE,SP_description,NR_eval,NR_len,NR_description,fantom_orfID,best_eval
0,chr11,68650491,68651147,-,CHS.9318.11,FTMT24100044158.1,ALL_04553642,11312,2,7.854937,30.177153,690,68649432,68651813,=,-,9.28e-19,174,5,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",6e-33,123,"CAE91339.1, unnamed protein product [Homo sapiens]",CATG00000005975.1,6e-33
1,chr10,130213856,130214029,+,CHS.7549.3,HBMT00000157570.1,ALL_03990737,7972,2,1.695187,3.643304,381,130213537,130252017,=,+,1.0,0,0,-,1e-07,488,"XP_014691893.1, PREDICTED: putative malate dehydrogenase 1B isoform X2 [Equus asinus]",ENSG00000108010.7,1e-07
2,chr10,84575247,84575453,+,CHS.6673.1,HBMT00000148956.1,ALL_03971595,2843,2,2.451808,11.373755,216,84572172,84575819,=,+,1.0,0,0,-,2.0000000000000002e-29,90,"BAB21923.1, hypothetical protein [Macaca fascicularis]",ENSG00000107771.11,2.0000000000000002e-29
3,chr10,81874514,81875203,-,CHS.6643.2,FTMT23700040370.1,ALL_03370453,5193,2,1.680378,4.333646,687,81873287,81875051,=,-,1.0,0,0,-,9e-69,229,"XP_007960941.1, PREDICTED: atherin-like [Chlorocebus sabaeus]",CATG00000000263.1,9e-69
4,chr9,124211831,124212340,-,CHS.56715.5,MICT00000366249.1,ALL_28315443,2644,3,0.911213,1.48207,507,124211008,124222459,=,-,2.2600000000000002e-29,402,1,"sp|Q6UX73|CP089_HUMAN, UPF0764 protein C16orf89 OS=Homo sapiens GN=C16orf89 PE=1 SV=2",5e-40,99,"EHH24426.1, hypothetical protein EGK_08082, partial [Macaca mulatta]",CATG00000109742.1,5e-40
5,chr9,85453516,85453827,+,CHS.55909.1,HBMT00001466041.1,ALL_29221258,820,2,4.555582,13.709828,309,85454042,85454436,=,+,9.72e-26,402,1,"sp|Q6UX73|CP089_HUMAN, UPF0764 protein C16orf89 OS=Homo sapiens GN=C16orf89 PE=1 SV=2",1e-36,98,"BAE91091.1, unnamed protein product [Macaca fascicularis]",CATG00000105958.1,1e-36
6,chr9,38659876,38660151,+,CHS.55491.2,MICT00000358429.1,ALL_28812856,3788,3,2.574134,10.346997,273,38650193,38662719,=,+,9.35e-20,179,1,"sp|Q96MD7|CI085_HUMAN, Uncharacterized protein C9orf85 OS=Homo sapiens GN=C9orf85 PE=1 SV=1",1e-24,162,"EAW81162.1, hCG1814203 [Homo sapiens]",CATG00000105420.1,1e-24
7,chr9,38659876,38660151,+,CHS.55491.1,MICT00000358428.1,ALL_28354955,4260,2,7.80074,60.732929,273,38650193,38662719,=,+,9.35e-20,179,1,"sp|Q96MD7|CI085_HUMAN, Uncharacterized protein C9orf85 OS=Homo sapiens GN=C9orf85 PE=1 SV=1",1e-24,162,"EAW81162.1, hCG1814203 [Homo sapiens]",CATG00000105420.1,1e-24
8,chr9,29214710,29214979,+,CHS.55276.2,MICT00000356931.1,ALL_28467736,2192,2,1.59338,3.144931,267,29214303,29229151,=,+,1.0,0,0,-,8e-22,48,"BAE87920.1, unnamed protein product [Macaca fascicularis]",CATG00000105133.1,8e-22
9,chr9,19162128,19162209,+,CHS.55154.2,ENCT00000444459.1,ALL_28036175,1508,3,1.415213,3.013812,378,19155572,19189379,=,+,4.46e-15,174,5,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",5.0000000000000005e-22,98,"EHH15935.1, hypothetical protein EGK_02111, partial [Macaca mulatta]",CATG00000104889.1,5.0000000000000005e-22


In [51]:
# first let's find those that are out of bounds
displayCols=["Parent_full",
            "name",
            "Name",
            "start",
            "end",
            "assembled_start",
            "assembled_end",
            "Fantom_start",
            "Fantom_end",
            "outStart",
            "outEnd",
            "best_eval",
            "ExonNo",
            "AvgTPM",
            "MaxTPM",
            "Fantom_code",
            "SP_eval",
            "SP_description",
            "NR_eval",
            "NR_description"]

for dfFP in glob.glob("./fantom/res/eq_*.csv"):
    tmpDF=pd.read_csv(dfFP)
#     if tmpDF["Parent_full"].iloc[0] in ["CHS.45292.2","CHS.49781.3","CHS.44064.5","CHS.7549.3","CHS.32794.4","CHS.24098.4","CHS.31956.1","CHS.44064.5","CHS.52196.4"]:
#         continue
    tmpDF=tmpDF[tmpDF["Fantom_code"]=="="]
    tmpDF["best_eval"]=tmpDF.apply(lambda row: getGreatest(row),axis=1)
    tmpDF=tmpDF.merge(orf,how="left",left_on="Name",right_on="Name")
    tmpDF.dropna(axis=0,inplace=True)
    tmpDF["start"]=tmpDF["start"].astype(int)
    tmpDF["end"]=tmpDF["end"].astype(int)
    tmpDF["assembled_start"]=tmpDF["assembled_start"].astype(int)
    tmpDF["assembled_end"]=tmpDF["assembled_end"].astype(int)
    tmpDF["outStart"]=np.where((tmpDF["start"]-tmpDF["assembled_start"])<0,True,False)
    tmpDF["outEnd"]=np.where((tmpDF["assembled_end"]-tmpDF["end"])<0,True,False)
    if len(tmpDF[(tmpDF["outStart"])|(tmpDF["outEnd"])])>0:
        print("="*100)
        print("out of bounds")
        display(tmpDF[displayCols].sort_values(by=["Fantom_code","AvgTPM"],ascending=[True,False]))
    if tmpDF["best_eval"].min()>float(1.000000e-15):
        print("="*100)
        print("low eval")
        display(tmpDF[displayCols].sort_values(by=["Fantom_code","AvgTPM"],ascending=[True,False]))

out of bounds


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.31956.1,ENCT00000253415.1,ALL_15233072,215311120,215311521,215303503,215310728,215301218,215311972,False,True,6e-23,2,1.083491,2.013061,=,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",6e-23,"EHH16558.1, hypothetical protein EGK_11851, partial [Macaca mulatta]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.45292.2,MICT00000296689.1,ALL_24533130,6856132,6856455,6855355,6871056,6855442,6871040,False,False,2e-10,2,2.124501,5.002084,=,1.0,-,2e-10,"EAW69746.1, hCG1993336 [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.3275.1,ENCT00000032688.1,ALL_02601129,155308176,155308409,155303646,155308682,155304469,155308581,False,False,1.0,1,1.702816,2.933612,=,1.0,-,1.0,-


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.49781.3,FTMT22700016954.1,ALL_25212766,4476168,4476227,4434286,4479273,4434321,4476229,False,False,9e-14,2,1.913384,11.181755,=,1.0,-,9e-14,"XP_003951320.1, PREDICTED: putative uncharacterized protein C8orf44 homolog [Pan troglodytes]"


out of bounds


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
1,CHS.49845.1,HBMT00001306821.1,ALL_25215205,6373249,6373497,6373659,6374239,6373651,6374175,True,False,1.0,1,1.378411,3.111006,=,1.0,-,1.0,-
0,CHS.49845.1,HBMT00001306821.1,ALL_26186802,6373249,6373497,6373523,6374097,6373651,6374175,True,False,1.0,1,0.574692,0.574692,=,1.0,-,1.0,-


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
1,CHS.49845.1,HBMT00001306821.1,ALL_25215205,6373249,6373497,6373659,6374239,6373651,6374175,True,False,1.0,1,1.378411,3.111006,=,1.0,-,1.0,-
0,CHS.49845.1,HBMT00001306821.1,ALL_26186802,6373249,6373497,6373523,6374097,6373651,6374175,True,False,1.0,1,0.574692,0.574692,=,1.0,-,1.0,-


out of bounds


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.44064.5,ENCT00000350598.1,ALL_22580291,138501885,138502238,138492319,138498124,138492277,138502892,False,True,8e-12,2,1.301831,3.144809,=,1.0,-,8e-12,"CAE92120.1, unnamed protein product [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.44064.5,ENCT00000350598.1,ALL_22580291,138501885,138502238,138492319,138498124,138492277,138502892,False,True,8e-12,2,1.301831,3.144809,=,1.0,-,8e-12,"CAE92120.1, unnamed protein product [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.55399.2,FTMT23400003453.1,ALL_28135476,35603546,35603770,35602484,35604070,35602914,35604227,False,False,1.0,1,1.462457,3.647561,=,1.0,-,1.0,-


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.52139.4,ENCT00000406489.1,ALL_25685354,145848296,145848404,145784948,146048253,145784993,146047718,False,False,2e-08,3,3.749345,7.628103,=,1.0,-,2e-08,"CAD69626.1, unnamed protein product [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.7549.3,HBMT00000157570.1,ALL_03990737,130213856,130214029,130208350,130254198,130213537,130252017,False,False,1e-07,2,1.695187,3.643304,=,1.0,-,1e-07,"XP_014691893.1, PREDICTED: putative malate dehydrogenase 1B isoform X2 [Equus asinus]"


out of bounds


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.52196.4,FTMT22500002352.1,ALL_25552479,149742457,149742590,149738293,149742582,149742058,149742598,False,True,2e-37,2,1.033787,1.879311,=,6.1899999999999995e-30,"sp|Q6UX73|CP089_HUMAN, UPF0764 protein C16orf89 OS=Homo sapiens GN=C16orf89 PE=1 SV=2",2e-37,"EAW91517.1, hCG1820395 [Homo sapiens]"


out of bounds


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.47994.4,MICT00000314784.1,ALL_24016408,161301198,161301527,161303443,161315228,161296771,161315245,True,False,3.9999999999999996e-30,2,2.097843,4.176972,=,1.0,-,3.9999999999999996e-30,"EAW62946.1, hCG2038800, partial [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.14817.2,ENCT00000121847.1,ALL_07542780,113393386,113393763,113392853,113394067,113392611,113394084,False,False,1.0,1,1.511373,1.511373,=,1.0,-,1.0,-


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.32794.4,HBMT00000881530.1,ALL_17511523,3411377,3411667,3408004,3414630,3408039,3412998,False,False,2e-08,2,1.754149,3.617588,=,1.0,-,2e-08,"ABO27831.1, PPI-82-f4 protein, partial [Homo sapiens]"


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.24083.1,MICT00000158257.1,ALL_12710628,7460990,7461006,7455925,7461115,7455844,7461135,False,False,1.0,2,0.874796,1.291903,=,1.0,-,1.0,-


low eval


Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.24098.4,HBMT00000659382.1,ALL_12842428,8996195,8996253,8974535,8999534,8974487,8998422,False,False,5e-11,3,5.632762,28.178564,=,1.0,-,5e-11,"EHH16289.1, hypothetical protein EGK_11552, partial [Macaca mulatta]"


In [58]:
tmpDF=pd.read_csv("./fantom/res/eq_ENCT00000253415.1.csv")
# tmpDF=tmpDF[tmpDF["Fantom_code"]=="="]
tmpDF["best_eval"]=tmpDF.apply(lambda row: getGreatest(row),axis=1)
tmpDF=tmpDF.merge(orf,how="left",left_on="Name",right_on="Name")
tmpDF.dropna(axis=0,inplace=True)
tmpDF["start"]=tmpDF["start"].astype(int)
tmpDF["end"]=tmpDF["end"].astype(int)
tmpDF["assembled_start"]=tmpDF["assembled_start"].astype(int)
tmpDF["assembled_end"]=tmpDF["assembled_end"].astype(int)
tmpDF["outStart"]=np.where((tmpDF["start"]-tmpDF["assembled_start"])<0,True,False)
tmpDF["outEnd"]=np.where((tmpDF["assembled_end"]-tmpDF["end"])<0,True,False)
display(tmpDF[displayCols].sort_values(by=["Fantom_code","AvgTPM"],ascending=[True,False]))

Unnamed: 0,Parent_full,name,Name,start,end,assembled_start,assembled_end,Fantom_start,Fantom_end,outStart,outEnd,best_eval,ExonNo,AvgTPM,MaxTPM,Fantom_code,SP_eval,SP_description,NR_eval,NR_description
0,CHS.31956.1,ENCT00000253415.1,ALL_15233072,215311120,215311521,215303503,215310728,215301218,215311972,False,True,6e-23,2,1.083491,2.013061,=,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",6e-23,"EHH16558.1, hypothetical protein EGK_11851, partial [Macaca mulatta]"
4,CHS.31956.1,ENCT00000253415.1,ALL_15571431,215311120,215311521,215303673,215304348,215301218,215311972,False,True,1.3700000000000001e-17,1,1.433074,1.433074,c,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
3,CHS.31956.1,ENCT00000253415.1,ALL_16297583,215311120,215311521,215303564,215304124,215301218,215311972,False,True,1.3700000000000001e-17,1,1.284404,1.457214,c,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
5,CHS.31956.1,ENCT00000253415.1,ALL_15233077,215311120,215311521,215303825,215304140,215301218,215311972,False,True,1.3700000000000001e-17,1,1.189356,1.563615,c,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
1,CHS.31956.1,ENCT00000253415.1,ALL_16700005,215311120,215311521,215303678,215304123,215301218,215311972,False,True,1.3700000000000001e-17,1,1.090867,1.626662,c,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
2,CHS.31956.1,ENCT00000253415.1,ALL_16297585,215311120,215311521,215303888,215304123,215301218,215311972,False,True,1.3700000000000001e-17,1,0.878293,0.878293,c,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
7,CHS.31956.1,ENCT00000253415.1,ALL_15571430,215311120,215311521,215303502,215304651,215301218,215311972,False,True,1.3700000000000001e-17,1,0.87274,1.195982,e,1.3700000000000001e-17,"sp|Q86U02|CN165_HUMAN, Putative uncharacterized protein encoded by LINC00596 OS=Homo sapiens GN=LINC00596 PE=5 SV=1",1.0,-
6,CHS.31956.1,ENCT00000253415.1,ALL_16823340,215311120,215311521,215304508,215311589,215301218,215311972,False,False,4.0000000000000003e-29,2,0.998013,0.998013,j,5.830000000000001e-23,"sp|Q8N2A0|CX062_HUMAN, Putative uncharacterized protein encoded by LINC00269 OS=Homo sapiens GN=LINC00269 PE=5 SV=1",4.0000000000000003e-29,"EHH20736.1, hypothetical protein EGK_03652, partial [Macaca mulatta]"


In [57]:
orf[orf["Name"]=="ALL_15233072"]

Unnamed: 0,Name,assembled_start,assembled_end,assembled_strand,orf_start,orf_end
1488,ALL_15233072,215303503,215310728,+,215304015,215304203
