You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
The following code adds a class (normRelativeCodonAdaptationIndex) to CodonUsage.py and a variable (ErillLabEcoliIndex) to CodonUsageIndices.py in the CodonUsage module within SeqUtils.
CodonUsage.py
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
"""Methods for codon usage calculations."""

from __future__ import print_function

import math

from .CodonUsageIndices import SharpEcoliIndex
from .CodonUsageIndices import ErillLabEcoliIndex

from Bio import SeqIO  # To parse a FASTA file

# Zeroed counter for each of the 64 DNA codons. Used as a template: callers
# take a .copy() and increment the entries.
# The keys are generated block by block (second base, then first base, then
# third base) so that the insertion order matches the traditional codon-table
# layout: TTT, TTC, TTA, TTG, CTT, ... GGA, GGG.
CodonsDict = {
    first + second + third: 0
    for second in "TACG"
    for first in "TCAG"
    for third in "TCAG"
}

# This dictionary shows which codons encode the same amino acid (standard
# genetic code; the three stop codons are grouped under 'STOP').
SynonymousCodons = {
    'CYS': ['TGT', 'TGC'],
    'ASP': ['GAT', 'GAC'],
    'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
    'GLN': ['CAA', 'CAG'],
    'MET': ['ATG'],
    'ASN': ['AAC', 'AAT'],
    'PRO': ['CCT', 'CCG', 'CCA', 'CCC'],
    'LYS': ['AAG', 'AAA'],
    'STOP': ['TAG', 'TGA', 'TAA'],
    'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
    'PHE': ['TTT', 'TTC'],
    'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
    'GLY': ['GGT', 'GGG', 'GGA', 'GGC'],
    'ILE': ['ATC', 'ATA', 'ATT'],
    'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'],
    'HIS': ['CAT', 'CAC'],
    'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'],
    'TRP': ['TGG'],
    'VAL': ['GTA', 'GTC', 'GTG', 'GTT'],
    'GLU': ['GAG', 'GAA'],
    'TYR': ['TAT', 'TAC'],
}

# DNA bases that can occupy each codon position; a zeroed counter template
# in the same spirit as CodonsDict.
CodonBases = dict.fromkeys("ACGT", 0)
class CodonAdaptationIndex(object):
    """A codon adaptation index (CAI) implementation.

    Implements the codon adaptation index (CAI) described by Sharp and
    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).

    NOTE - This implementation does not currently cope with alternative
    genetic codes: only the synonymous codons in the standard table are
    considered.

    Attributes:
     - index - dict mapping codon (upper-case string) to its relative
       adaptiveness weight w.
     - codon_count - dict mapping codon to the number of times it was seen
       in the reference FASTA file (empty until generate_index is called).
    """

    def __init__(self):
        """Initialize the class."""
        self.index = {}
        self.codon_count = {}

    # use this method with predefined CAI index
    def set_cai_index(self, index):
        """Set up an index to be used when calculating CAI for a gene.

        Just pass a dictionary similar to the SharpEcoliIndex in the
        CodonUsageIndices module.
        """
        self.index = index

    def generate_index(self, fasta_file):
        """Generate a codon usage index from a FASTA file of CDS sequences.

        Takes a location of a FASTA file containing CDS sequences
        (which must all have a whole number of codons) and generates a
        codon usage index.

        For every amino acid the relative synonymous codon usage (RSCU) of
        each codon is computed and divided by the largest RSCU among its
        synonyms, giving the weight w = RSCU_i / RSCU_max.

        Amino acids (or the stop signal) that never occur in the reference
        set carry no usage information, so their codons are left out of the
        index instead of triggering a division by zero. A codon that is
        never seen while one of its synonyms is gets the weight 0.0.

        Raises ValueError if an index has already been set or a codon count
        has already been done, and TypeError if the file contains a codon
        that is not one of the 64 standard DNA codons.
        """
        # first make sure we're not overwriting an existing index:
        if self.index or self.codon_count:
            raise ValueError("an index has already been set or a codon count "
                             "has been done. Cannot overwrite either.")

        # count codon occurrences in the file.
        self._count_codons(fasta_file)

        for codons in SynonymousCodons.values():
            # total number of times this amino acid was encoded
            total = float(sum(self.codon_count[codon] for codon in codons))
            if total == 0:
                # e.g. CDS files stored without their stop codons
                continue

            # RSCU = codon count / (mean count over the synonymous codons)
            denominator = total / len(codons)
            rscu = [self.codon_count[codon] / denominator for codon in codons]

            # now generate the index W = RSCU_i / RSCU_max
            # (rscu_max > 0 because total > 0)
            rscu_max = max(rscu)
            for codon, value in zip(codons, rscu):
                self.index[codon] = value / rscu_max

    def cai_for_gene(self, dna_sequence):
        """Calculate the CAI (float) for the provided DNA sequence (string).

        This method uses the index (either the one you set or the one you
        generated) and returns the CAI for the DNA sequence, i.e. the
        geometric mean of the weights of its scored codons. If no index has
        been set or generated, the default SharpEcoliIndex is installed and
        used.

        The sequence may be in any mix of upper and lower case. ATG and TGG
        are never scored (their weight is always one), and stop codons
        missing from the index are skipped. A sequence without any scored
        codon yields 1.0.

        Raises TypeError if a codon is neither in the index nor a stop
        codon (this includes a trailing partial codon). math.log raises
        ValueError if a scored codon has a weight of zero.
        """
        cai_value, cai_length = 0.0, 0

        # if no index is set or generated, the default SharpEcoliIndex will
        # be used.
        if not self.index:
            self.set_cai_index(SharpEcoliIndex)

        # normalise unconditionally so mixed-case input is accepted too
        dna_sequence = dna_sequence.upper()

        for i in range(0, len(dna_sequence), 3):
            codon = dna_sequence[i:i + 3]
            if codon in self.index:
                # these two codons are always one, exclude them:
                if codon not in ('ATG', 'TGG'):
                    cai_value += math.log(self.index[codon])
                    cai_length += 1
            # some indices may not include stop codons:
            elif codon not in ('TGA', 'TAA', 'TAG'):
                raise TypeError("illegal codon in sequence: %s.\n%s"
                                % (codon, self.index))

        if cai_length == 0:
            # empty product: nothing lowered the score
            return 1.0
        # geometric mean over the L scored codons (exponent 1/L, not 1/(L-1))
        return math.exp(cai_value / cai_length)

    def _count_codons(self, fasta_file):
        """Count every codon in the FASTA file and store it in codon_count.

        The counts are accumulated locally and only stored once the whole
        file has been read, so a failure leaves the object untouched.
        Raises TypeError on a codon that is not a standard DNA codon.
        """
        with open(fasta_file, 'r') as handle:
            # work on a private copy of the zeroed codon dictionary
            counts = CodonsDict.copy()

            # iterate over sequence and count all the codons in the FASTA file.
            for cur_record in SeqIO.parse(handle, "fasta"):
                # make sure the sequence is upper case
                dna_sequence = str(cur_record.seq).upper()
                for i in range(0, len(dna_sequence), 3):
                    codon = dna_sequence[i:i + 3]
                    if codon in counts:
                        counts[codon] += 1
                    else:
                        raise TypeError("illegal codon %s in gene: %s"
                                        % (codon, cur_record.id))
        self.codon_count = counts

    def print_index(self):
        """Print out the index used.

        Writes one line per codon, sorted by codon: the codon, a tab and
        its weight with three decimals.
        """
        for i in sorted(self.index):
            print("%s\t%.3f" % (i, self.index[i]))
class normRelativeCodonAdaptationIndex(object):
    """A normalized Relative Codon Adaptation index implementation.

    Implements the normalized Relative Codon Adaptation index (nRCA)
    described by O'Neill, Or and Erill (PLoS One. 2013 Oct 7;8(10):e76177).

    NOTE - This implementation does not currently cope with alternative
    genetic codes: only the synonymous codons in the standard table are
    considered.

    The nRCA index works similarly to the Codon Adaptation Index (CAI)
    from Sharp and Li (1987). It uses a reference set of highly-expressed
    genes provided by the user, and computes the alignment of any candidate
    coding sequence with the codon usage patterns seen in that reference set.

    A known problem of CAI is that it computes the weight (index) for each
    codon as the ratio between the frequency of that codon in the reference
    set and the largest frequency among its synonymous codons. This
    implicitly assumes that the background nucleotide distribution is
    uniform. On mutationally-biased genomes (such as those found in many
    bacterial clades), this assumption will backfire, because CAI will
    attribute to translational selection the patterns of genome-wide
    mutational bias observed in the reference set (i.e. a weakly used codon
    will be overvalued by CAI if it is aligned with the overall GC% content
    of the reference set, and vice versa).

    nRCA removes this bias by computing each codon index as the ratio
    between the observed codon frequency in the reference set and its
    expected frequency (inferred as the product of the positional
    frequencies of each of the codon's bases). This has been shown to
    improve the correlation of nRCA with gene expression values with respect
    to CAI (Fox and Erill, DNA Research, 17:3, 185-196, 2010).

    Based on the BioPython Module Bio.SeqUtils.CodonUsage code for CAI.
    """

    # Codons that never contribute to the score: ATG and TGG are unique for
    # their amino acids, and stop codons are not used by nRCA.
    _UNSCORED_CODONS = ('ATG', 'TGG', 'TGA', 'TAA', 'TAG')

    def __init__(self):
        """Initialize the class."""
        self.index = {}
        self.codon_count = {}
        self.first_pos_count = {}
        self.second_pos_count = {}
        self.third_pos_count = {}

    # use this method with predefined nRCA index
    def set_nrca_index(self, index):
        """Set up an index to be used when calculating nRCA for a gene.

        Just pass a dictionary similar to the ErillLabEcoliIndex in the
        CodonUsageIndices module.
        """
        self.index = index

    def generate_index(self, fasta_file):
        """Generate a codon usage index from a FASTA file of CDS sequences.

        Takes a location of a FASTA file containing CDS sequences
        (which must all have a whole number of codons) and generates the
        nRCA codon usage index.

        The codon and positional base counts kept on the object are
        adjusted in place by a small pseudocount, so after this call they
        are no longer whole numbers.

        Raises ValueError if an index has already been set or a codon count
        has already been done, or if the file contains no codons at all.
        Raises TypeError if the file contains a codon that is not one of
        the 64 standard DNA codons.
        """
        # first make sure we're not overwriting an existing index:
        if self.index or self.codon_count:
            raise ValueError("an index has already been set or a codon count "
                             "has been done. Cannot overwrite either.")

        # count codon occurrences in the file, as well as codon position base
        # counts
        self._codon_count(fasta_file)
        position_counts = (self.first_pos_count,
                           self.second_pos_count,
                           self.third_pos_count)

        # compute total number of codons (as a float, so that every division
        # below is a true division on Python 2 as well)
        total_codons = float(sum(self.codon_count.values()))
        if total_codons == 0:
            # Nothing to derive frequencies from. Reset the counters so the
            # object can be reused with another file.
            self.codon_count = {}
            self.first_pos_count = {}
            self.second_pos_count = {}
            self.third_pos_count = {}
            raise ValueError("no codons found in %s. Cannot generate an index."
                             % fasta_file)

        # add pseudocount, so that no observed or expected frequency is zero
        codon_pseudocount = 1 / total_codons
        base_pseudocount = 0.25 / total_codons
        for codon in self.codon_count:
            self.codon_count[codon] += codon_pseudocount
        for pos_count in position_counts:
            for base in pos_count:
                pos_count[base] += base_pseudocount

        # get relative frequencies for codons and codon positions
        codon_freq = {codon: count / total_codons
                      for codon, count in self.codon_count.items()}
        first_pos_freq, second_pos_freq, third_pos_freq = [
            {base: count / total_codons for base, count in pos_count.items()}
            for pos_count in position_counts
        ]

        # compute the unnormalized index value: observed codon frequency over
        # the frequency expected from the positional base frequencies
        codon_w = {}
        for codon, observed_freq in codon_freq.items():
            expected_freq = (first_pos_freq[codon[0]]
                             * second_pos_freq[codon[1]]
                             * third_pos_freq[codon[2]])
            codon_w[codon] = observed_freq / expected_freq

        # using the computed codon weights, we now normalize for amino
        # acid usage: each weight is divided by the largest weight among the
        # codons encoding the same amino acid (never zero, thanks to the
        # pseudocount)
        for codons in SynonymousCodons.values():
            max_codon_w = max(codon_w[codon] for codon in codons)
            for codon in codons:
                self.index[codon] = codon_w[codon] / max_codon_w

    def nrca_for_gene(self, dna_sequence):
        """Calculate the nRCA (float) for the provided DNA sequence (string).

        This method uses the index (either the one you set or the one you
        generated) and returns the nRCA value for the DNA sequence, i.e. the
        geometric mean of the weights of its scored codons. If no index has
        been set or generated, the default ErillLabEcoliIndex is installed
        and used.

        The sequence may be in any mix of upper and lower case. ATG, TGG
        and the stop codons are never scored. A sequence without any scored
        codon yields 1.0.

        Raises TypeError if a scored codon is missing from the index (this
        includes a trailing partial codon). math.log raises ValueError if a
        scored codon has a weight of zero.
        """
        # initialize nRCA value and length to compute geometric mean on
        nrca_value = 0.0
        nrca_length = 0

        # if no index is set or generated, the default ErillLabEcoliIndex will
        # be used.
        if not self.index:
            self.set_nrca_index(ErillLabEcoliIndex)

        # uppercase DNA sequence (unconditionally, so mixed case works too)
        dna_sequence = dna_sequence.upper()

        # go through each codon in the sequence and add its contribution
        # to the geometric mean (adding in log-space to get a product)
        for i in range(0, len(dna_sequence), 3):
            codon = dna_sequence[i:i + 3]
            if codon in self._UNSCORED_CODONS:
                continue
            if codon not in self.index:
                raise TypeError("Illegal codon in sequence: %s.\n%s"
                                % (codon, self.index))
            nrca_value += math.log(self.index[codon])
            nrca_length += 1

        if nrca_length == 0:
            # empty product: nothing lowered the score
            return 1.0
        # compute nRCA for sequence (geometric mean over the L scored
        # codons) in log-space; the exponent is 1/L, not 1/(L-1)
        return math.exp(nrca_value / nrca_length)

    def _codon_count(self, fasta_file):
        """Count codons and per-position bases in the FASTA file.

        Fills codon_count, first_pos_count, second_pos_count and
        third_pos_count. The counts are accumulated locally and only stored
        once the whole file has been read, so a failure leaves the object
        untouched. Raises TypeError on a codon that is not a standard DNA
        codon.
        """
        with open(fasta_file, 'r') as handle:
            # work on private copies of the zeroed template dictionaries
            codon_counts = CodonsDict.copy()
            position_counts = [CodonBases.copy() for _ in range(3)]

            # iterate over sequence and count all the codons in the FASTA file
            for cur_record in SeqIO.parse(handle, "fasta"):
                # make sure the sequence is upper case
                dna_sequence = str(cur_record.seq).upper()
                for i in range(0, len(dna_sequence), 3):
                    codon = dna_sequence[i:i + 3]
                    if codon not in codon_counts:
                        raise TypeError("Illegal codon %s in gene: %s"
                                        % (codon, cur_record.id))
                    codon_counts[codon] += 1
                    for pos_count, base in zip(position_counts, codon):
                        pos_count[base] += 1

        self.codon_count = codon_counts
        (self.first_pos_count,
         self.second_pos_count,
         self.third_pos_count) = position_counts

    def print_index(self):
        """Print out the index used.

        Writes one line per codon, sorted by codon: the codon, a tab and
        its weight with three decimals.
        """
        for i in sorted(self.index):
            print("%s\t%.3f" % (i, self.index[i]))
This looks interesting, thank you. Unfortunately as presented it is hard to see where your changes are. From a very quick inspection by eye, you add two pre-defined indices (SharpEcoliIndex and ErillLabEcoliIndex) and make the former the default.
On a practical level, are you familiar with making a pull request on GitHub? That would address the problem (and trigger automatic checks on things like coding style).
Would you be happy to dual license your contributions (see #898)?
Do you have some test cases for this code? We would want automated tests; there is a chapter in our tutorial about our framework.
The following code adds a class (normRelativeCodonAdaptationIndex) to CodonUsage.py and a variable (ErillLabEcoliIndex) to CodonUsageIndices.py in the CodonUsage module within SeqUtils.
CodonUsage.py
CodonUsageIndices.py
nRCA.zip
The text was updated successfully, but these errors were encountered: