biopython · ivanerill · Jun 19, 2018 · Jun 19, 2018 · Jun 19, 2018 · Jun 19, 2018
diff --git a/Bio/SeqUtils/CodonUsage.py b/Bio/SeqUtils/CodonUsage.py
@@ -10,6 +10,8 @@
 
 import math
 from .CodonUsageIndices import SharpEcoliIndex
+from .CodonUsageIndices import ErillLabEcoliIndex
+
 from Bio import SeqIO  # To parse a FASTA file
 
 
@@ -53,6 +55,8 @@
     'GLU': ['GAG', 'GAA'],
     'TYR': ['TAT', 'TAC']}
 
+# DNA bases that can occupy each codon position
+CodonBases = {'A' : 0, 'C' : 0, 'G' : 0, 'T' : 0}
 
 class CodonAdaptationIndex(object):
     """A codon adaptation index (CAI) implementation.
@@ -175,3 +179,199 @@ def print_index(self):
         """
         for i in sorted(self.index):
             print("%s\t%.3f" % (i, self.index[i]))
+
+
+class normRelativeCodonAdaptationIndex(object): 
+    """A normalized Relative Codon Adaptation index implementation. 
+       Implements the normalized Relative Codon Adaptation index (nRCA) 
+       described by O'Neill, Or and Erill (PLoS One. 2013 Oct 7;8(10):e76177)
+       NOTE - This implementation does not currently cope with alternative 
+       genetic codes: only the synonymous codons in the standard table are 
+       considered.
+
+       The nRCA index works similarly to the Codon Adapation Index (CAI) from
+       Sharp and Li (1987). It uses a reference set of highly-expressed genes
+       provided by the user, and computes the alignment of any candidate coding
+       sequence with the codon usage patterns seen in that reference set.
+       A known problem of CAI is that it computes the weight (index) for each
+       codon as the ratio between the frequency of that codon in the reference 
+       set and the largest frequency among its synonymous codons. This 
+       implicitly assumes that the background nucleotide distribution is 
+       uniform. On mutationally-biased genomes (such as those found in many 
+       bacterial clades), this assumption will backfire, because CAI will
+       attribute to translational selection the patterns of genome-wide 
+       mutational bias observed in the reference set (i.e. a weakly used codon
+       will be overvalued by CAI if it is aligned with the overall GC% content
+       of the reference set, and vice versa). nRCA removes this bias by 
+       computing each codon index as the ration between the observed codon
+       frequency in the reference set and its expected frequency (inferred as
+       the product of the positional frequencies of each the codon bases).
+       This has been shown to improve the correlation of nRCA with gene 
+       expression values with respect to CAI (Fox and Erill, DNA Research, 
+       17:3, 185-196, 2010).
+       Based on the BioPython Module Bio.SeqUtils.CodonUsage code for CAI.
+    """
+
+    def __init__(self): 
+        """Initializes the class""" 
+        self.index = {} 
+        self.codon_count = {} 
+        self.first_pos_count = {}
+        self.second_pos_count = {}
+        self.third_pos_count = {}
+
+    # use this method with predefined CAI index 
+    def set_nrca_index(self, index): 
+        """Set up an index to be used when calculating nRCA for a gene. 
+            Just pass a dictionary similar to the ErillLabEcoliIndex in the 
+            CodonUsageIndices module. 
+        """ 
+        self.index = index 
+
+    def generate_index(self, fasta_file): 
+        """Generate a codon usage index from a FASTA file of CDS sequences. 
+           Takes a location of a FASTA file containing CDS sequences 
+           (which must all have a whole number of codons) and generates the 
+           nRCA codon usage index. 
+        """ 
+
+        # first make sure we're not overwriting an existing index: 
+        if self.index != {} or self.codon_count != {}: 
+            raise ValueError("an index has already been set or a codon count " 
+                             "has been done. Cannot overwrite either.") 
+
+        # count codon occurrences in the file, as well as codon position base 
+        # counts
+        self._codon_count(fasta_file) 
+
+        # compute total number of codons
+        total_codons = sum(self.codon_count.values()) + 0.0
+
+        #add pseudocoount    
+        for codon, count in self.codon_count.iteritems():
+            self.codon_count[codon] += 1 / total_codons
+
+        for base, count in self.first_pos_count.iteritems():
+            self.first_pos_count[base] += 0.25 / total_codons
+            self.second_pos_count[base] += 0.25 / total_codons
+            self.third_pos_count[base] += 0.25 / total_codons
+
+        # initialize dictionaries for frequencies
+        codon_freq = CodonsDict.copy()
+        first_pos_freq = CodonBases.copy()
+        second_pos_freq = CodonBases.copy()
+        third_pos_freq = CodonBases.copy()
+
+        #get relative frequencies for codons and codon positions
+        for cod, value in self.codon_count.iteritems():
+            codon_freq[cod] = self.codon_count[cod] / total_codons   
+
+        for base, value in self.first_pos_count.iteritems():
+            first_pos_freq[base] = self.first_pos_count[base] / total_codons
+
+        for base, value in self.second_pos_count.iteritems():
+            second_pos_freq[base] = self.second_pos_count[base] / total_codons
+
+        for base, value in self.third_pos_count.iteritems():
+            third_pos_freq[base] = self.third_pos_count[base] / total_codons        
+
+        #compute the unnormalized index value
+        codon_w = CodonsDict.copy()
+        for codon, w in codon_w.iteritems():
+            #compute the expected frequency for each codon
+            expected_freq = math.exp(math.log(first_pos_freq[codon[0]]) + \
+                            math.log(second_pos_freq[codon[1]]) + \
+                            math.log(third_pos_freq[codon[2]]))
+            #compute the index value
+            codon_w[codon] = codon_freq[codon] / expected_freq
+
+        # using the computed codon frequencies, we now normalize for amino 
+        # acid usage
+        # now to calculate normalized index we first need to sum the number of times 
+        # synonymous codons were used all together. 
+        for aa in SynonymousCodons:
+            codons = SynonymousCodons[aa]
+
+            # compute maximum index for codons encoding this amino acid            
+            max_codon_w = 0
+            for codon in codons:
+                if codon_w[codon] > max_codon_w:
+                    max_codon_w = codon_w[codon]
+
+            # compute the normalized index
+            for codon in codons:
+                self.index[codon] = codon_w[codon] / max_codon_w
+
+
+    def nrca_for_gene(self, dna_sequence): 
+        """Calculate the nRCA (float) for the provided DNA sequence (string). 
+           This method uses the index (either the one you set or the one you 
+           generated) and returns the nRCA value for the DNA sequence. 
+        """ 
+
+        # initialize nRCA value and length to compute geometric mean on
+        nrca_value = 0
+        nrca_length = 0
+
+        # if no index is set or generated, the default ErillLabEcoliIndex will 
+        # be used. 
+        if self.index == {}: 
+            self.set_nrca_index(ErillLabEcoliIndex) 
+
+        # uppercase DNA sequence
+        if dna_sequence.islower(): 
+            dna_sequence = dna_sequence.upper() 
+
+        # go through each codon in the sequence and add its contribution
+        # to the geometric mean (adding in log-space to get a product)
+        for i in range(0, len(dna_sequence), 3):
+            codon = dna_sequence[i:i + 3]
+            if codon in self.index:
+                # ATG and TGG codons are unique for their amino acids
+                # stop codons (TGA, TAA and TAG) are not used by nRCA,
+                # so we exclude them:
+                if codon not in ['ATG', 'TGG', 'TGA', 'TAA', 'TAG']:
+                    nrca_value += math.log(self.index[codon])
+                    nrca_length += 1
+                elif codon not in ['ATG', 'TGG', 'TGA', 'TAA', 'TAG']: 
+                    raise TypeError("Illegal codon in sequence: %s.\n%s"
+                    % (codon, self.index))
+
+        # compute nRCA for sequence (geometric mean) in log-space
+        return math.exp(nrca_value / (nrca_length - 1.0))
+
+    def _codon_count(self, fasta_file): 
+        with open(fasta_file, 'r') as handle: 
+
+            # make the codon dictionary and codon positions local 
+            self.codon_count = CodonsDict.copy()
+            self.first_pos_count = CodonBases.copy()
+            self.second_pos_count = CodonBases.copy()
+            self.third_pos_count = CodonBases.copy()
+
+            # iterate over sequence and count all the codons in the FASTA file
+            for cur_record in SeqIO.parse(handle, "fasta"):
+                # make sure the sequence is lower case
+                if str(cur_record.seq).islower():
+                    dna_sequence = str(cur_record.seq).upper()
+                else: 
+                    dna_sequence = str(cur_record.seq)
+                    for i in range(0, len(dna_sequence), 3):
+                        codon = dna_sequence[i:i + 3]
+                        if codon in self.codon_count:
+                            self.codon_count[codon] += 1
+                            self.first_pos_count[codon[0]] += 1
+                            self.second_pos_count[codon[1]] += 1
+                            self.third_pos_count[codon[2]] += 1
+                        else:
+                            raise TypeError("Illegal codon %s in gene: %s"
+                                            % (codon, cur_record.id))
+
+
+
+    def print_index(self):
+        """Print out the index used.
+            This just gives the index when the objects is printed.
+        """ 
+        for i in sorted(self.index):
+            print("%s\t%.3f" % (i, self.index[i]))
diff --git a/Bio/SeqUtils/CodonUsageIndices.py b/Bio/SeqUtils/CodonUsageIndices.py
@@ -1,13 +1,14 @@
-# Copyright Yair Benita Y.Benita@pharm.uu.nl
-# Biopython (http://biopython.org) license applies
+"""Codon adaption indices, including Sharp and Li (1987) E. coli index.
 
-"""Codon adaption indxes, including Sharp and Li (1987) E. coli index.
-
-Currently this module only defines a single codon adaption index from
-Sharp & Li, Nucleic Acids Res. 1987.
+This module defines the single codon adaption index from Sharp & Li, 
+Nucleic Acids Res. 1987 and the Normalized Relative Codon Adaptation index 
+values for Escherichia coli K-12, from O'Neill, Or and Erill 
+PLoS ONE. 2013 Oct 7;8(10):e76177.
 """
 
-
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+# Sharp & Li CAI for E. coli K-12
 SharpEcoliIndex = {
     'GCA': 0.586, 'GCC': 0.122, 'GCG': 0.424, 'GCT': 1, 'AGA': 0.004,
     'AGG': 0.002, 'CGA': 0.004, 'CGC': 0.356, 'CGG': 0.004, 'CGT': 1, 'AAC': 1,
@@ -20,3 +21,24 @@
     'AGT': 0.085, 'TCA': 0.077, 'TCC': 0.744, 'TCG': 0.017, 'TCT': 1,
     'ACA': 0.076, 'ACC': 1, 'ACG': 0.099, 'ACT': 0.965, 'TGG': 1, 'TAC': 1,
     'TAT': 0.239, 'GTA': 0.495, 'GTC': 0.066, 'GTG': 0.221, 'GTT': 1}
+
+# Copyright Ivan Erill (erill@umbc.edu)
+# Biopython (http://biopython.org) license applies 
+# ErillLab normalized Relative Codon Adaptation for E. coli K-12
+ErillLabEcoliIndex = { 
+        'AAA' : 1.000, 'AAC' : 1.000, 'AAG' : 0.224, 'AAT' : 0.059, 
+        'ACA' : 0.123, 'ACC' : 0.985, 'ACG' : 0.121, 'ACT' : 1.000, 
+        'AGA' : 0.004, 'AGC' : 0.233, 'AGG' : 0.004, 'AGT' : 0.052, 
+        'ATA' : 0.005, 'ATC' : 1.000, 'ATG' : 1.000, 'ATT' : 0.204, 
+        'CAA' : 0.138, 'CAC' : 1.000, 'CAG' : 1.000, 'CAT' : 0.295, 
+        'CCA' : 0.172, 'CCC' : 0.008, 'CCG' : 1.000, 'CCT' : 0.056, 
+        'CGA' : 0.006, 'CGC' : 0.358, 'CGG' : 0.005, 'CGT' : 1.000, 
+        'CTA' : 0.008, 'CTC' : 0.028, 'CTG' : 1.000, 'CTT' : 0.034, 
+        'GAA' : 1.000, 'GAC' : 1.000, 'GAG' : 0.219, 'GAT' : 0.452, 
+        'GCA' : 0.904, 'GCC' : 0.124, 'GCG' : 0.543, 'GCT' : 1.000, 
+        'GGA' : 0.014, 'GGC' : 0.661, 'GGG' : 0.024, 'GGT' : 1.000, 
+        'GTA' : 0.767, 'GTC' : 0.072, 'GTG' : 0.295, 'GTT' : 1.000, 
+        'TAA' : 1.000, 'TAC' : 1.000, 'TAG' : 0.033, 'TAT' : 0.266, 
+        'TCA' : 0.112, 'TCC' : 0.702, 'TCG' : 0.021, 'TCT' : 1.000, 
+        'TGA' : 0.078, 'TGC' : 1.000, 'TGG' : 1.000, 'TGT' : 0.525, 
+        'TTA' : 0.048, 'TTC' : 1.000, 'TTG' : 0.037, 'TTT' : 0.304} 
diff --git a/CONTRIB.rst b/CONTRIB.rst
@@ -108,6 +108,7 @@ please open an issue on GitHub or mention it on the mailing list.
 - Hongbo Zhu <https://github.com/hongbo-zhu-cn>
 - Hye-Shik Chang <perky at domain fallin.lv>
 - Iddo Friedberg <idoerg at domain burnham.org>
+- Ivan Erill <erill at domain umbc.edu>
 - Jacek Śmietański <https://github.com/dadoskawina>
 - Jack Twilley <https://github.com/mathuin>
 - James Casbon <j.a.casbon at domain qmul.ac.uk>

diff --git a/NEWS.rst b/NEWS.rst
@@ -46,6 +46,9 @@ warnings anymore when imported.
 A new pairwise sequence aligner is available in Bio.Align, as an alternative
 to the existing pairwise sequence aligner in Bio.pairwise2.
 
+The CodonUsage module now supports the normalized Relative Codon Adaptation
+index, in addition to the Codon Adaptation Index.
+
 Many thanks to the Biopython developers and community for making this release
 possible, especially the following contributors:
 
@@ -55,6 +58,7 @@ possible, especially the following contributors:
 - Chris Rands
 - Connor T. Skennerton
 - Francesco Gastaldello
+- Ivan Erill (first contribution)
 - Michiel de Hoon
 - Pamela Russell (first contribution)
 - Peter Cock