In [1]:
import pandas as pd
import numpy as np
import os
import sys
import subprocess
from shutil import copyfile

# Column names applied to the 9 tab-separated fields when a GFF3 file is read into pandas.
gff3Cols=[
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
# Version string written into the "##CHESS" header line of the regenerated GFF files.
version="2.02"

In [2]:
# Load the 2.01 annotation and keep only the CHESS gene records flagged as protein coding.
# skiprows=3 drops the first three lines of the file - presumably header/comment lines; verify.
df=pd.read_csv("./chess2.01.gff",sep="\t",skiprows=3,names=gff3Cols)

# step 1: gene-level records whose source column is CHESS
chessGeneMask=df["source"].eq("CHESS") & df["type"].eq("gene")
df=df.loc[chessGeneMask].reset_index(drop=True)

# step 2: of those, only the ones whose attributes carry GENE_TYPE=protein_coding
# (done after step 1 so the string match only ever runs on CHESS gene rows)
codingMask=df["attributes"].str.contains("GENE_TYPE=protein_coding")
df=df.loc[codingMask].reset_index(drop=True)
df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,CHESS,gene,352749.0,356597.0,.,-,.,ID=CHS.12;description=potentially protein-codi...
1,chr1,CHESS,gene,499860.0,523759.0,.,-,.,ID=CHS.17;description=potentially protein-codi...
2,chr1,CHESS,gene,2314906.0,2324302.0,.,+,.,ID=CHS.144;description=potentially protein-cod...
3,chr1,CHESS,gene,4019392.0,4055669.0,.,+,.,ID=CHS.215;description=potentially protein-cod...
4,chr1,CHESS,gene,4553352.0,4571350.0,.,-,.,ID=CHS.219;description=potentially protein-cod...
5,chr1,CHESS,gene,7985921.0,8000843.0,.,+,.,ID=CHS.285;description=potentially protein-cod...
6,chr1,CHESS,gene,8912005.0,8912711.0,.,-,.,ID=CHS.305;description=potentially protein-cod...
7,chr1,CHESS,gene,10382101.0,10397576.0,.,+,.,ID=CHS.348;description=potentially protein-cod...
8,chr1,CHESS,gene,10393446.0,10397981.0,.,-,.,ID=CHS.353;description=potentially protein-cod...
9,chr1,CHESS,gene,11063249.0,11066558.0,.,-,.,ID=CHS.374;description=potentially protein-cod...


In [3]:
# Extract the gene ID from the GFF3 attributes column into a new "id" column.
# The ID tag is matched by name (at the start of the string or right after a ";")
# instead of by position, so the value is still correct when ID is not the
# first attribute of a record.
df["id"]=df["attributes"].str.extract(r"(?:^|;)ID=([^;]+)",expand=False)
# every gene record is expected to carry an ID; fail loudly rather than compare NaNs later
assert df["id"].notna().all(), "gene record without an ID attribute"
df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
0,chr1,CHESS,gene,352749.0,356597.0,.,-,.,ID=CHS.12;description=potentially protein-codi...,CHS.12
1,chr1,CHESS,gene,499860.0,523759.0,.,-,.,ID=CHS.17;description=potentially protein-codi...,CHS.17
2,chr1,CHESS,gene,2314906.0,2324302.0,.,+,.,ID=CHS.144;description=potentially protein-cod...,CHS.144
3,chr1,CHESS,gene,4019392.0,4055669.0,.,+,.,ID=CHS.215;description=potentially protein-cod...,CHS.215
4,chr1,CHESS,gene,4553352.0,4571350.0,.,-,.,ID=CHS.219;description=potentially protein-cod...,CHS.219
5,chr1,CHESS,gene,7985921.0,8000843.0,.,+,.,ID=CHS.285;description=potentially protein-cod...,CHS.285
6,chr1,CHESS,gene,8912005.0,8912711.0,.,-,.,ID=CHS.305;description=potentially protein-cod...,CHS.305
7,chr1,CHESS,gene,10382101.0,10397576.0,.,+,.,ID=CHS.348;description=potentially protein-cod...,CHS.348
8,chr1,CHESS,gene,10393446.0,10397981.0,.,-,.,ID=CHS.353;description=potentially protein-cod...,CHS.353
9,chr1,CHESS,gene,11063249.0,11066558.0,.,-,.,ID=CHS.374;description=potentially protein-cod...,CHS.374


In [4]:
# Collect the CHESS gene IDs (prefix "CHS.") that appear in the 2.01 protein FASTA.
# Only header lines (starting with ">") are inspected; the candidate ID is whatever
# follows the last "gene=" in the last space-separated field of the header.
pgIDs=set()
with open("./chess2.01.protein.fa","r") as inFP:
    for line in inFP: # stream the file line by line instead of loading it whole
        if not line.startswith(">"):
            continue # sequence line, nothing to parse
        curID=line.split(" ")[-1].split("gene=")[-1].strip()
        if curID.startswith("CHS."):
            pgIDs.add(curID)
# the with-block has already closed inFP, so no explicit close() is needed
print(len(pgIDs))

1218


In [5]:
# Gather the IDs from the "id" column of df (the protein-coding CHESS genes)
# into a set so they can be compared against the IDs found in the protein FASTA.
agIDs={geneID for geneID in df["id"]}
print(len(agIDs))

1178


In [6]:
# Suspicious IDs: present in the protein FASTA (pgIDs) but absent from the set of
# genes annotated as protein coding (agIDs).
CDStoRemove=pgIDs.difference(agIDs)
print(len(CDStoRemove))
CDStoRemove

40


{'CHS.12707',
 'CHS.13721',
 'CHS.16010',
 'CHS.18851',
 'CHS.1895',
 'CHS.20737',
 'CHS.20929',
 'CHS.2243',
 'CHS.22699',
 'CHS.22717',
 'CHS.23255',
 'CHS.24070',
 'CHS.24382',
 'CHS.26653',
 'CHS.30768',
 'CHS.3226',
 'CHS.33898',
 'CHS.34886',
 'CHS.35336',
 'CHS.35998',
 'CHS.36118',
 'CHS.43611',
 'CHS.46428',
 'CHS.46971',
 'CHS.50923',
 'CHS.5165',
 'CHS.51749',
 'CHS.51922',
 'CHS.53081',
 'CHS.54486',
 'CHS.56484',
 'CHS.57117',
 'CHS.57814',
 'CHS.58651',
 'CHS.58832',
 'CHS.6212',
 'CHS.7754',
 'CHS.7960',
 'CHS.9165',
 'CHS.9549'}

In [7]:
# Write chess2.02_and_refseq.gff: a copy of chess2.01_and_refseq.gff in which
#   * CDS records belonging to the genes in CDStoRemove are dropped, and
#   * the "##CHESS" header line is rewritten with the new version string.
# Both files are managed by one with-statement so they are closed even if the loop raises.

sources_and_refseq=[] # source column (2nd field) of every removed CDS record

with open("chess2.01_and_refseq.gff","r") as inFP, open("./chess2.02_and_refseq.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # comment/pragma line: only the CHESS version header is modified
            if line.startswith("##CHESS"):
                outFP.write("##CHESS"+version+"\n")
            else:
                outFP.write(line)
            continue

        lineCols=line.split("\t")
        # blank or truncated lines lack the type/attributes fields; copy them through untouched
        if len(lineCols)>8 and lineCols[2]=="CDS":
            # gene ID = Parent value minus its last dot-separated field
            # (presumably Parent is a transcript ID such as CHS.12.1 -> CHS.12; verify)
            geneID=".".join(lineCols[8].split("Parent=")[-1].strip().split(".")[:-1])
            if geneID in CDStoRemove:
                sources_and_refseq.append(lineCols[1])
                continue # drop this CDS record from the output
        outFP.write(line)

In [8]:
# Write chess2.02_assembly.gff: a copy of chess2.01_assembly.gff in which
#   * CDS records belonging to the genes in CDStoRemove are dropped, and
#   * the "##CHESS" header line is rewritten with the new version string.
# Both files are managed by one with-statement so they are closed even if the loop raises.

sources_assembly=[] # source column (2nd field) of every removed CDS record

with open("chess2.01_assembly.gff","r") as inFP, open("./chess2.02_assembly.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # comment/pragma line: only the CHESS version header is modified
            if line.startswith("##CHESS"):
                outFP.write("##CHESS"+version+"\n")
            else:
                outFP.write(line)
            continue

        lineCols=line.split("\t")
        # blank or truncated lines lack the type/attributes fields; copy them through untouched
        if len(lineCols)>8 and lineCols[2]=="CDS":
            # gene ID = Parent value minus its last dot-separated field
            # (presumably Parent is a transcript ID such as CHS.12.1 -> CHS.12; verify)
            geneID=".".join(lineCols[8].split("Parent=")[-1].strip().split(".")[:-1])
            if geneID in CDStoRemove:
                sources_assembly.append(lineCols[1])
                continue # drop this CDS record from the output
        outFP.write(line)

In [9]:
# Write chess2.02.gff: a copy of chess2.01.gff in which
#   * CDS records belonging to the genes in CDStoRemove are dropped, and
#   * the "##CHESS" header line is rewritten with the new version string.
# Both files are managed by one with-statement so they are closed even if the loop raises.

sources=[] # source column (2nd field) of every removed CDS record

with open("chess2.01.gff","r") as inFP, open("./chess2.02.gff","w+") as outFP:
    for line in inFP:
        if line.startswith("#"):
            # comment/pragma line: only the CHESS version header is modified
            if line.startswith("##CHESS"):
                outFP.write("##CHESS"+version+"\n")
            else:
                outFP.write(line)
            continue

        lineCols=line.split("\t")
        # blank or truncated lines lack the type/attributes fields; copy them through untouched
        if len(lineCols)>8 and lineCols[2]=="CDS":
            # gene ID = Parent value minus its last dot-separated field
            # (presumably Parent is a transcript ID such as CHS.12.1 -> CHS.12; verify)
            geneID=".".join(lineCols[8].split("Parent=")[-1].strip().split(".")[:-1])
            if geneID in CDStoRemove:
                sources.append(lineCols[1])
                continue # drop this CDS record from the output
        outFP.write(line)

In [10]:
# Duplicate the 2.01 map/genes/transcripts files under their 2.02 names without modifying them.
copyPairs=[
    ("./mapfile2.01.txt","./mapfile2.02.txt"),
    ("./chess2.01.genes","./chess2.02.genes"),
    ("./chess2.01.transcripts","./chess2.02.transcripts"),
]
copiedPaths=[copyfile(srcPath,dstPath) for srcPath,dstPath in copyPairs]
# copyfile returns the destination path; show the last one as the cell output
copiedPaths[-1]

'./chess2.02.transcripts'

In [11]:
# Show the distinct source labels of the CDS records removed from each of the three
# rewritten files (and_refseq, assembly, plain), one set per line.
for removedSources in (sources_and_refseq,sources_assembly,sources):
    print(set(removedSources))

{'FANTOM', 'CHESS'}
{'FANTOM', 'CHESS'}
{'FANTOM', 'CHESS'}


In [12]:
# Regenerate the protein FASTA from the cleaned 2.02 annotation with gffread
# (-y: output FASTA of translated CDS, -g: genome FASTA used to pull the sequences).
# check_call returns 0 on success and raises CalledProcessError on a non-zero exit
# status, so a failed gffread run stops the notebook instead of passing unnoticed.
# NOTE(review): the genome path is an absolute, machine-specific location - adjust
# it when running elsewhere.
subprocess.check_call(["gffread","-y","./chess2.02.protein.fa","-g","/home/sparrow/genomicData/hg38/hg38_p8.fa","./chess2.02.gff"])

0