In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

In [4]:
# Load the mouse codon usage table and index it by the CSV's first, header-less
# column (pandas names it 'Unnamed: 0'); compute_CAI looks codons up by index label.
codon_freq = pd.read_csv('../data/mouse_codon_freq.csv').set_index('Unnamed: 0')

In [90]:
def compute_CAI(sequence:str, codon_freq:pd.DataFrame):
    """Compute the Codon Adaptation Index (CAI) of a coding sequence.

    The CAI is the geometric mean, over every codon of the sequence, of the
    relative adaptiveness w = freq(codon) / max(freq of the codons that encode
    the same amino acid). Every codon present in the sequence contributes,
    including a stop codon if the sequence contains one.

    parameters:
    sequence: str,
        nucleotide sequence of the transcript of interest, read in frame from
        its first character. Every "T" is converted to "U" before the lookup;
        trailing nucleotides that do not form a complete codon are ignored.
    codon_freq: pd.DataFrame,
        species-specific codon usage table indexed by codon in the RNA
        alphabet (e.g. "AUG"), with an 'amino acid' column and a 'freq' column.
        The columns are looked up by name, so their order does not matter.

    returns:
    CAI : float,
        computed Codon Adaptation Index; lies in (0, 1] when all the
        frequencies involved are positive.

    raises:
    ValueError
        if the sequence does not contain at least one complete codon.
    KeyError
        if a codon of the sequence is missing from the index of codon_freq.
    """
    # Convert T to U: the codon table is keyed by RNA codons.
    sequence = sequence.translate(str.maketrans("T", "U"))

    n_codons = len(sequence) // 3
    if n_codons == 0:
        # Without this guard the geometric mean below divides by zero.
        raise ValueError("sequence must contain at least one complete codon")

    codons = [sequence[3 * i:3 * i + 3] for i in range(n_codons)]

    unknown = sorted(set(codons) - set(codon_freq.index))
    if unknown:
        raise KeyError(f"codon(s) not found in codon_freq: {unknown}")

    # Relative adaptiveness of every codon, computed once for the whole table
    # instead of transposing and filtering the frame for each codon.
    freq = codon_freq['freq']
    freq_max = freq.groupby(codon_freq['amino acid']).transform('max')
    log_w = np.log(freq / freq_max).to_dict()

    # Geometric mean of w, evaluated in log space: exp(mean(log w)).
    log_sum = 0
    for codon in codons:
        log_sum += log_w[codon]

    return np.exp(1 / n_codons * log_sum)

In [91]:
# Example input written in the DNA alphabet (starts with ATG, ends with TAA);
# presumably the mouse amylase 1 coding sequence -- TODO confirm source/accession.
amylase1 = 'ATGAAATTCTTCCTGCTGCTTTCCCTCATTGGATTCTGCTGGGCCCAATATGACCCACATACTCAATATGGACGAACTGCTATTGTCCACCTGTTTGAGTGGCGCTGGGTTGATATTGCTAAGGAATGTGAGAGATACTTAGCTCCTAATGGATTTGCAGGTGTGCAGGTCTCTCCACCCAATGAAAACATCGTAGTCCACAGCCCTTCAAGACCATGGTGGGAAAGATATCAACCAATTAGCTACAAAATATGTTCCAGGTCTGGAAATGAAGATGAATTCAGGGACATGGTGAACAGGTGCAACAATGTTGGTGTCCGTATTTATGTGGATGCTGTCATTAACCACATGTGTGGAGTGGGGGCTCAAGCTGGACAAAGCAGTACATGTGGAAGTTATTTCAACCCAAATAACAGGGACTTTCCTGGAGTTCCCTATTCTGGTTTTGACTTTAATGATGGAAAATGTAGAACTGCAAGTGGAGGTATCGAGAACTACCAAGATGCTGCTCAGGTCAGAGATTGTCGTCTGTCTGGCCTTCTGGATCTTGCACTTGAGAAAGATTATGTTCGAACCAAGGTGGCTGACTATATGAACCATCTCATTGACATTGGCGTAGCAGGGTTCAGACTTGATGCTTCTAAGCACATGTGGCCTGGAGACATAAAGGCAATTTTGGACAAACTGCATAATCTCAATACAAAATGGTTCTCCCAAGGAAGCAGACCTTTCATTTTCCAAGAGGTGATTGATCTGGGTGGTGAGGCAGTGTCAAGTAATGAGTATTTTGGAAATGGCCGTGTGACAGAATTCAAATATGGAGCAAAATTGGGCAAAGTTATGCGCAAGTGGGATGGAGAAAAGATGTCCTACTTAAAGAACTGGGGAGAAGGTTGGGGTTTGATGCCTTCTGACAGAGCCCTTGTGTTTGTGGACAACCATGACAATCAGCGAGGACATGGTGCTGGGGGAGCATCCATCTTGACATTCTGGGATGCTAGACTCTATAAAATGGCTGTTGGCTTTATGTTGGCTCATCCTTATGGTTTCACACGGGTGATGTCAAGTTACTATTGGCCAAGAAATTTCCAGAATGGAAAAGATGTCAATGACTGGGTTGGACCACCAAATAACAATGGAAAAACCAAAGAAGTGAGCATTAACCCAGACAGCACTTGTGGCAATGACTGGATCTGTGAACATCGATGGCGTCAAATAAGGAACATGGTTGCCTTCAGAAATGTCGTCAATGGTCAGCCTTTTGCAAACTGGTGGGATAATGACAGCAACCAGGTAGCTTTTGGCAGAGGAAACAAAGGATTCATTGTCTTTAACAATGATGACTGGGCTTTGTCAGAAACTTTACAGACTGGTCTTCCTGCTGGCACATACTGTGATGTCATTTCTGGAGATAAAGTCGATGGCAATTGCACTGGAATAAAAGTCTATGTTGGCAATGATGGCAAAGCTCACTTTTCTATTAGTAACTCTGCCGAAGACCCATTTATTGCAATCCATGCAGAGTCAAAAATATAA'
# Score the sequence against the codon_freq table; as the last expression of
# the cell, the returned CAI is displayed as the cell output.
compute_CAI(amylase1,codon_freq)

0.7578865327134019