# Extract FASTA from GFF

Notebook using Biopython and the BCBio GFF parser to create a multi-FASTA file containing the sequences of the features listed in a GFF file, extracted from a reference FASTA file.

In [1]:
from BCBio import GFF
from Bio import SeqIO

class GffFastaExtractor (object):
    """Write the sequences of GFF features to a multi fasta file.

    The constructor parses a reference fasta file and a GFF file. Calling the
    instance then writes, for every selected feature, one fasta record whose
    header is a GFF-like tab separated line and whose body is the feature
    sequence (reverse complemented for features on the minus strand).

    Feature locations are expected to follow the Biopython convention used by
    the BCBio GFF parser: 0-based start, exclusive end.
    """

    # Qualifier keys copied, in this order, into the attribute column of the
    # fasta header when the feature carries them.
    _HEADER_QUALIFIERS = (
        "ID", "ID=exon", "Parent", "gene_id", "transcript_id", "gene_type",
        "gene_status", "gene_name", "transcript_type", "transcript_status",
        "transcript_name", "exon_number", "exon_id", "level", "protein_id",
        "transcript_support_level", "ccdsid", "havana_gene",
        "havana_transcript", "tag")

    ##### FUNDAMENTAL METHODS #####

    def __init__ (self, seq, gff, out_name="out"):
        """Parse the fasta and gff files and keep them in memory.

        seq      -- path of the reference fasta file
        gff      -- path of the gff file describing the features
        out_name -- path of the fasta file written when the instance is called
        """

        print("Initialize GffFastaExtractor")

        self.seq = seq
        self.gff = gff
        self.out_name = out_name

        # Parse the fasta sequence and store in a dict
        print("  Parsing fasta file")
        with open (self.seq, "r") as seq_in:
            self.seq_dict = SeqIO.to_dict(SeqIO.parse(seq_in, "fasta"))

        # Parse the gff sequence and store in a dict
        print("  Parsing gff file")
        with open (self.gff, "r") as gff_in:
            self.gff_dict = {i.id:i for i in GFF.parse(gff_in)}

    ##### PUBLIC METHODS #####

    def __call__ (self, offset=0, feature_type=None):
        """Extract the selected features and write them to ``out_name``.

        offset       -- number of flanking bases added on each side of every
                        feature (clipped to the sequence boundaries)
        feature_type -- if given, only features of this type are written;
                        otherwise every feature is written

        Features are visited at every nesting level (parent first, then its
        children), so a type such as "exon" is found even when it is nested
        below a gene and a transcript. Sequences of the fasta file that have
        no record in the gff file are skipped.
        """

        print("Extract features and write fasta output")

        with open (self.out_name, "w") as seq_out:

            for key in sorted(self.seq_dict.keys()):
                print ("  Sequence {}".format(key))

                # A reference sequence without any annotation is not an error
                gff_record = self.gff_dict.get(key)
                if gff_record is None:
                    continue

                for top_feature in gff_record.features:
                    for feature in self._iter_features(top_feature):
                        if feature_type and feature.type != feature_type:
                            continue
                        gff_line = self.extract_gff(key, feature)
                        seq_line = self.extract_seq(key, feature, offset)
                        seq_out.write(">{}\n{}\n".format(gff_line, seq_line))
        print ("Done")

    ##### PRIVATE METHODS #####

    def _iter_features(self, feature):
        """Yield ``feature`` followed by all its nested sub features."""
        yield feature
        # sub_features may be missing or None depending on the parser version
        for child in getattr(feature, "sub_features", None) or ():
            for nested in self._iter_features(child):
                yield nested

    def extract_gff(self, key, feature):
        """Return a GFF-like tab separated line describing ``feature``.

        The nine columns are: sequence name, source ("UNKNOWN" when the
        feature has no source qualifier), type, 1-based start, end, ".",
        strand ("+", "-" or "." when unknown), "." and the known qualifiers
        formatted as ``key=value`` pairs joined by ";".
        """
        qualifiers = feature.qualifiers

        # Extract the optional fields, multiple values are comma separated
        fields = []
        for qualifier in self._HEADER_QUALIFIERS:
            if qualifier not in qualifiers:
                continue
            value = qualifiers[qualifier]
            if isinstance(value, list):
                value = ",".join(value)
            fields.append("{}={}".format(qualifier, value))

        # A missing or empty source must not abort the extraction
        source = qualifiers.get("source")

        strand = feature.location.strand
        if strand == 1:
            strand_symbol = "+"
        elif strand == -1:
            strand_symbol = "-"
        else:
            strand_symbol = "."

        return "\t".join(str(column) for column in (
            key,
            source[0] if source else "UNKNOWN",
            feature.type,
            # Locations are 0-based, GFF coordinates are 1-based
            feature.location.start+1,
            feature.location.end,
            ".",
            strand_symbol,
            ".",
            ";".join(fields)))

    def extract_seq(self, key, feature, offset=0):
        """Return the sequence of ``feature`` on sequence ``key`` as a string.

        The feature is extended by ``offset`` bases on both sides, clipped to
        the boundaries of the reference sequence. Features on the minus strand
        are reverse complemented. Raise ValueError if the strand of the
        feature is neither 1 nor -1.
        """
        reference = self.seq_dict[key].seq

        # The location start is already the 0-based index of the first base of
        # the feature, so it is used directly as the lower bound of the slice
        start = max(0, int(feature.location.start) - offset)
        end = min(len(reference), int(feature.location.end) + offset)
        segment = reference[start:end]

        strand = feature.location.strand
        if strand == 1:
            return str(segment)

        elif strand == -1:
            return str(segment.reverse_complement())

        else:
            raise ValueError(
                "Cannot extract feature with undefined strand {!r}: {}".format(strand, feature))
        

## Test without offset, restricted to exons

In [2]:
# Build the extractor on the sample data, then write only the exon features
# (no offset argument is given, so the default offset is used).
E = GffFastaExtractor(
    seq="./test/sample.fa",
    gff="./test/sample.gff",
    out_name="./test/test_exons_no_offset.fa")
E(feature_type="exon")

Initialize GffFastaExtractor
  Parsing fasta file
  Parsing gff file
Extract features and write fasta output
  Sequence chr1
  Sequence chr2
  Sequence chr3
Done


## Test with a 50 bp offset, restricted to exons

In [3]:
# Build the extractor on the sample data, then write only the exon features,
# each one extended by an offset of 50 on both sides.
E = GffFastaExtractor(
    seq="./test/sample.fa",
    gff="./test/sample.gff",
    out_name="./test/test_exons_50pb_offset.fa")
E(offset=50, feature_type="exon")

Initialize GffFastaExtractor
  Parsing fasta file
  Parsing gff file
Extract features and write fasta output
  Sequence chr1
  Sequence chr2
  Sequence chr3
Done
