# Change Gene and Contig Names in pyriformis GFF 


1. Open and parse GFF file to pull out information from each column

2. Rename the contigs in column 1 

3. Rename the genes in column 9 (attributes column, ID = "StePyr_1" etc)

4. Rename the ID and Parent in column 9 for transcript, CDS, intron

5. Write file

       

 
 
 

## 1. GFF Parser
courtesy of Colleen Hannon and Mike Eisen, edited for the format of my GFF file

In [1]:
# A small class describing one row of a GFF file.
# It separates each column into its own attribute of the gff class.

class gff:
    """One GFF row, with the nine tab-separated columns as attributes.

    Columns 1-8 are kept as scalar attributes; column 9 (the attributes
    column) is kept as a dict mapping each key to its value.
    """

    def __init__(self):
        self.seq = ""
        self.source = ""
        self.feature = ""
        self.start = ""
        self.end = ""
        self.score = ""
        self.strand = ""
        self.frame = ""
        self.attribute = {}

    def __iter__(self):
        """Iterate over the nine column values in column order.

        The final item is the ``attribute`` dict itself, not a formatted
        "key=value;..." string.
        """
        return iter([self.seq, self.source, self.feature, self.start, self.end, self.score, self.strand, self.frame, self.attribute])

## This reads one row of the gff and parses its fields into the proper
## attributes of the class

    @classmethod
    def from_fields(cls, fields):
        """Build a gff record from a list of column strings.

        Args:
            fields: the column values of one row. Only rows with exactly
                nine fields are parsed; any other length yields a record
                with all defaults left in place.

        Returns:
            A new instance. ``start`` and ``end`` are converted to int,
            ``score`` to float unless it is "." (then it stays ""), and the
            ninth field is split on ';' into ``attribute``.

        Raises:
            ValueError: if start/end/score are not numeric, or if a
                non-empty attribute entry contains no '='.
        """
        record = cls()
        if len(fields) == 9:
            record.seq = fields[0]
            record.source = fields[1]
            record.feature = fields[2]
            record.start = int(fields[3])
            record.end = int(fields[4])
            if fields[5] != ".":
                record.score = float(fields[5])
            record.strand = fields[6]
            record.frame = fields[7]

            for att in fields[8].split(';'):
                # A trailing or doubled ';' produces empty entries; skip them.
                if att == "":
                    continue
                # Split on the first '=' only, so a value that itself
                # contains '=' is kept whole instead of raising ValueError.
                (k, v) = att.split('=', 1)
                record.attribute[k] = v
        return record

In [2]:
gff_file = "/Volumes/albright_postdoc/pyriformis/stentor_pyriformis.20210302_final.0.2.final.proper.gff"

# Parsed rows, grouped by feature type (GFF column 3).
gffs = {}

## Only rows whose feature type is in this list are stored.

features = [
              'CDS',
              'enhancer',
              'exon',
              'five_prime_UTR',
              'gene',
              'golden_path_region',
              'insulator',
              'intergenic',
              'intron',
              'mRNA',
              'mature_peptide',
              'miRNA',
              'ncRNA',
              'none',
              'polyA_site',
              'pre_miRNA',
              'protein',
              'pseudogene',
              'rRNA',
              'regulatory_region',
              'repeat_region',
              'silencer',
              'snRNA',
              'snoRNA',
              'tRNA',
              'tandem_repeat',
              'three_prime_UTR',
              'transposable_element',
              'transcript'
              ]

for feature in features:
    gffs[feature] = []

# The with-block closes the file handle as soon as parsing is done.
with open(gff_file, "r", newline='') as gffo:
    for line in gffo:
        # Lines starting with '#' are comments / directives.
        if line[0:1] == "#":
            continue
        # newline='' leaves line endings untranslated, so strip '\r' as well
        # as '\n'; otherwise a CRLF file leaves '\r' in the last attribute.
        fields = line.rstrip('\r\n').split("\t")
        if len(fields) != 9:
            continue
        if fields[2] in features:  ## makes sure the row is of a feature we are interested in
            gffs[fields[2]].append(gff.from_fields(fields))

## 2. Rename contigs

The contig names are an ugly mess of a string; replace them with clean, sequentially numbered names.

In [3]:
# Unique contig names, in order of first appearance among the gene rows.
# Only contigs that carry at least one gene row end up in this list.
contig_list = list(dict.fromkeys(record.seq for record in gffs['gene']))

# One new name per unique contig, numbered from 1.
base = 'StePyr_contig'
new_contig_list = [
    "{}_{}".format(base, number)
    for number in range(1, len(contig_list) + 1)
]

# Map each original contig name to its new name, pairing by position.
contig_dict = dict(zip(contig_list, new_contig_list))

In [4]:
# Swap in the new contig name (first GFF column) on every stored record.
# A name with no entry in contig_dict is left as it is.

for feature in features:
    for record in gffs[feature]:
        record.seq = contig_dict.get(record.seq, record.seq)

## 3. Rename genes 


My GFF file had the contig as the gene ID, meaning I had to fix non-unique names

Some of this is from Colleen and Mike, some by me

In [5]:
class gene:
    """Holder for a gene's ID and a list of children."""

    def __init__(self):
        self.id = ""
        self.children = []

    @classmethod
    def from_atts(cls, atts):
        """Create a gene from an attribute mapping.

        The 'ID' entry is copied to ``id`` when present; otherwise ``id``
        keeps its empty-string default.
        """
        new_gene = cls()
        if 'ID' not in atts:
            return new_gene
        new_gene.id = atts['ID']
        return new_gene

In [6]:
# Give every gene a sequential ID (StePyr_1, StePyr_2, ...) in stored order,
# and keep the new IDs in a list in that same order.
i = 0
new_genes_list = []

for i, gene_gff in enumerate(gffs['gene'], start=1):
    atty = gene_gff.attribute
    new_gene_id = 'StePyr_' + str(i)
    atty['ID'] = new_gene_id
    new_genes_list.append(new_gene_id)
        

## 4. Rename transcripts, CDS, introns

In [7]:
class transcript:
    """Holder for a transcript's ID, its parent ID and a list of children."""

    def __init__(self):
        self.id = ""
        self.parent = ""
        self.children = []

    @classmethod
    def from_atts(cls, atts):
        """Create a transcript from an attribute mapping.

        'ID' and 'Parent' are copied to ``id`` and ``parent`` when present;
        a missing key leaves the empty-string default in place.
        """
        new_transcript = cls()
        for key, field in (('ID', 'id'), ('Parent', 'parent')):
            if key in atts:
                setattr(new_transcript, field, atts[key])
        return new_transcript
    
    
class CDS:
    """Holder for a CDS feature's ID and its parent ID."""

    def __init__(self):
        self.id = ""
        self.parent = ""

    @classmethod
    def from_atts(cls, atts):
        """Create a CDS from an attribute mapping.

        'ID' and 'Parent' are copied to ``id`` and ``parent`` when present;
        a missing key leaves the empty-string default in place.
        """
        new_cds = cls()
        for key, field in (('ID', 'id'), ('Parent', 'parent')):
            if key in atts:
                setattr(new_cds, field, atts[key])
        return new_cds
    
    
class intron:
    """Holder for an intron's parent ID.

    Only ``parent`` is stored; ID handling is intentionally left out.
    """

    def __init__(self):
        self.parent = ""

    @classmethod
    def from_atts(cls, atts):
        """Create an intron from an attribute mapping.

        Args:
            atts: mapping of attribute keys to values.

        Returns:
            A new intron whose ``parent`` is ``atts['Parent']`` when that key
            is present, otherwise the empty-string default.
        """
        new_intron = cls()
        if 'Parent' in atts:
            new_intron.parent = atts['Parent']
        # Without this return the constructor always handed back None.
        return new_intron

### Transcripts


In [8]:
# Original transcript IDs, in the order the transcript rows were stored.
old_transcript_list = [
    transcript_gff.attribute['ID'] for transcript_gff in gffs['transcript']
]

In [9]:
# One transcript name per renamed gene: the gene ID followed by '.t1'.
new_transcript_list = [gene_id + '.t1' for gene_id in new_genes_list]

In [10]:
# Pair old and new transcript IDs by position. zip stops at the shorter list,
# so this relies on both lists having the same length and order.
new_transcript_dict = {
    old_id: new_id
    for old_id, new_id in zip(old_transcript_list, new_transcript_list)
}

In [11]:
# Rename every transcript and point it at its renamed gene.
for transcript_gff in gffs['transcript']:

    atty = transcript_gff.attribute

    # Each transcript ID is itself a key of new_transcript_dict, so look it up
    # exactly. Substring replacement against every entry was quadratic and
    # could rewrite part of an ID that merely contains another ID.
    # An ID with no entry in the mapping is left unchanged.
    atty['ID'] = new_transcript_dict.get(atty['ID'], atty['ID'])

    # Parent is the ID minus its last three characters (the '.t1' suffix
    # carried by the new transcript names).
    atty['Parent'] = atty['ID'][:-3]

### CDSs


In [12]:
# Rename every CDS: replace any old transcript ID found inside the CDS ID
# with the new transcript ID, then derive Parent from the renamed ID.
for CDS_gff in gffs['CDS']:

    atty = CDS_gff.attribute

    renamed_id = atty['ID']
    for old_name, new_name in new_transcript_dict.items():
        renamed_id = renamed_id.replace(old_name, new_name)
    atty['ID'] = renamed_id

    # Parent is the ID minus its last four characters.
    atty['Parent'] = renamed_id[:-4]

### Introns

In [13]:
# Introns only have their Parent rewritten: any old transcript ID found in it
# is replaced with the new transcript ID. The ID attribute is not touched.
rename_pairs = new_transcript_dict.items()

for intron_gff in gffs['intron']:

    atty = intron_gff.attribute

    # With an empty mapping the Parent attribute is never read or written.
    if rename_pairs:
        renamed_parent = atty['Parent']
        for old_name, new_name in rename_pairs:
            renamed_parent = renamed_parent.replace(old_name, new_name)
        atty['Parent'] = renamed_parent

## 5. Write file

In [14]:
import csv

In [15]:
# Write the CDS, gene, transcript and intron rows to a tab-separated draft.
# Each gff object iterates over its nine columns; the last column is the
# attribute dict, which csv writes in its str() form (tidied up afterwards).
# newline='' is what the csv module asks for on output files, so the writer's
# own line terminator is not translated a second time by the text layer.
with open("draft_gff.csv", "w", newline='') as stream:
    writer = csv.writer(stream, delimiter='\t')
    writer.writerows(gffs['CDS'] + gffs['gene'] + gffs['transcript'] + gffs['intron'])

## 6. Import file, fix it, write again

In [16]:
import pandas as pd
 
# Load the tab-separated draft back into a data frame. The file has no header
# row, so the columns are labelled with integers starting at 0.
data = pd.read_csv(
    "draft_gff.csv",
    sep="\t",
    header=None,
)

In [17]:
# Order rows by column 0 (contig name), then by column 3 (start position).
data = data.sort_values(by=[0, 3])

In [18]:
# Column 8 holds the text form of the attribute dict that the draft file was
# written with; turn it back into GFF "key=value;key=value" syntax.
# A single character-level pass does the work: ',' becomes ';', ':' becomes
# '=', and '{', '}', spaces and single quotes are dropped. Unlike chained
# str.replace calls, this does not depend on the `regex` default, which
# differs between pandas versions.
# NOTE: every occurrence is affected, so an attribute value that itself
# contains one of these characters is altered too.

attribute_cleanup = str.maketrans({
    '{': None,
    '}': None,
    ',': ';',
    ':': '=',
    ' ': None,
    '\'': None,
})
data[8] = data[8].str.translate(attribute_cleanup)

In [19]:
# Display the frame for a visual check of the renamed, sorted rows.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,StePyr_contig_1,AUGUSTUS,CDS,9537,19184,1.0,+,0,ID=StePyr_1.t1.cds;Parent=StePyr_1.t1
19199,StePyr_contig_1,AUGUSTUS,gene,9537,19184,1.0,+,.,ID=StePyr_1
34830,StePyr_contig_1,AUGUSTUS,transcript,9537,19184,1.0,+,.,ID=StePyr_1.t1;Parent=StePyr_1
1,StePyr_contig_1,AUGUSTUS,CDS,19876,33888,1.0,+,0,ID=StePyr_2.t1.cds;Parent=StePyr_2.t1
19200,StePyr_contig_1,AUGUSTUS,gene,19876,33888,1.0,+,.,ID=StePyr_2
...,...,...,...,...,...,...,...,...,...
12486,StePyr_contig_99,AUGUSTUS,CDS,226886,226954,1.0,+,0,ID=StePyr_10134.t1.cds;Parent=StePyr_10134.t1
29332,StePyr_contig_99,AUGUSTUS,gene,226886,227650,1.0,+,.,ID=StePyr_10134
44963,StePyr_contig_99,AUGUSTUS,transcript,226886,227650,1.0,+,.,ID=StePyr_10134.t1;Parent=StePyr_10134
52814,StePyr_contig_99,INTRONARRATOR,intron,226955,226969,1.0,+,.,Parent=StePyr_10134.t1


In [20]:
# Write the final GFF: tab-separated, without the index or a header row.
# A score of "." in the input is parsed to an empty score, written to the
# draft as an empty field and read back by pandas as a missing value.
# na_rep='.' writes the GFF placeholder for it instead of an empty field.
data.to_csv(
    '/Volumes/albright_postdoc/pyriformis/stentor_pyriformis.20210302_final.0.2.final.proper.nameupdate.gff',
    sep='\t',
    index=False,
    header=False,
    na_rep='.',
)

# Final Check: 

Update contig names in the genome fasta to match these, compare to old names in IGV. 