In [None]:
#run this only if you do not already have scikit-bio and/or pandas and/or numpy
!pip install scikit-bio
!pip install pandas
!pip install numpy

In [4]:
import pandas as pd
from skbio.stats.composition import perturb, power
import numpy as np

In [5]:
# Load the codon-usage table. Several header names repeat across the
# per-species column groups (Fraction, Frequency/ Thousand, Number); pandas
# keeps the first occurrence as-is and suffixes later repeats with .1, .2, .3.
# The weighting cell further down selects columns by those suffixed names.
df = pd.read_csv('codon_data.csv')

In [6]:
# Show the loaded table as this cell's output.
df

Unnamed: 0,Triplet,Amino acid,Fraction_combined),Frequency/ Thousand_combined,Number_combined,Saccharomyces_cerevisiae,Fraction,Frequency/ Thousand,Number,E_coli,...,Frequency/ Thousand.1,Number.1,Oryza_sativa,Fraction.2,Frequency/ Thousand.2,Number.2,N_benthamiana,Fraction.3,Frequency/ Thousand.3,Number.3
0,TAA,*,,,,TAA,0.47,1.1,6913,TAA,...,1.8,9,TAA,0.24,0.7,22360,TAA,0.32,0.7,32
1,TAG,*,,,,TAG,0.23,0.5,3312,TAG,...,0.0,0,TAG,0.31,0.8,28508,TAG,0.29,0.7,29
2,TGA,*,,,,TGA,0.30,0.7,4447,TGA,...,1.0,5,TGA,0.45,1.2,41361,TGA,0.39,0.9,39
3,GCA,A,,,,GCA,0.29,16.2,105910,GCA,...,21.1,108,GCA,0.18,17.3,591267,GCA,0.31,23.5,1029
4,GCC,A,,,,GCC,0.22,12.6,82357,GCC,...,31.6,162,GCC,0.33,30.8,1050723,GCC,0.17,12.6,550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,GTG,V,,,,GTG,0.19,10.8,70337,GTG,...,26.4,135,GTG,0.36,24.3,828681,GTG,0.25,15.6,684
60,GTT,V,,,,GTT,0.39,22.1,144243,GTT,...,16.8,86,GTT,0.23,15.5,529509,GTT,0.42,26.1,1142
61,TGG,W,,,,TGG,1.00,10.4,67789,TGG,...,10.7,55,TGG,1.00,13.8,472543,TGG,1.00,12.4,542
62,TAC,Y,,,,TAC,0.44,14.8,96596,TAC,...,14.6,75,TAC,0.60,15.1,517042,TAC,0.45,12.8,562


In [7]:
# Tunable parameters. Each weight is the exponent applied to one species'
# codon fractions before the four compositions are multiplied together, so a
# larger weight gives that species more influence. Here yeast counts twice as
# much as each of the other three.
# NOTE(review): the weights are used as given and are not normalised (they sum
# to 5 here), so the combined fractions are more peaked than a weighted
# geometric mean would be -- confirm this is intended.
pseudocount = .01  # added to every fraction so a zero cannot wipe out a codon
sc_wt = 2.0
ec_wt = 1.0
os_wt = 1.0
nb_wt = 1.0

# Distinct 'Amino acid' labels, in order of first appearance in the table.
aa_l = list(dict.fromkeys(df['Amino acid']))

# Rows of the table belonging to each label, in the same order as aa_l.
rows_by_aa = [df[df['Amino acid'] == label] for label in aa_l]

# Per-label fraction vectors (plus pseudocount) for each species column.
padded = {
    column: [np.array(rows[column]) + pseudocount for rows in rows_by_aa]
    for column in ('Fraction', 'Fraction.1', 'Fraction.2', 'Fraction.3')
}
sc_l = padded['Fraction']
ec_l = padded['Fraction.1']
os_l = padded['Fraction.2']
nb_l = padded['Fraction.3']

# Combine the four species per label: `power` applies the weight as an
# exponent, `perturb` folds the next species into the running composition.
comp_d = {}
for aa, sc, ec, os_, nb in zip(aa_l, sc_l, ec_l, os_l, nb_l):
    combined = power(sc, sc_wt)
    for fractions, weight in ((ec, ec_wt), (os_, os_wt), (nb, nb_wt)):
        combined = perturb(combined, power(fractions, weight))
    comp_d[aa] = combined

comp_d

{'*': array([0.6518868 , 0.00291754, 0.34519567]),
 'A': array([0.28597182, 0.24609996, 0.03868829, 0.42923993]),
 'C': array([0.43303031, 0.56696969]),
 'D': array([0.07382833, 0.92617167]),
 'E': array([0.88497315, 0.11502685]),
 'F': array([0.32200786, 0.67799214]),
 'G': array([0.0785311 , 0.18205086, 0.01254666, 0.72687138]),
 'H': array([0.17290266, 0.82709734]),
 'I': array([0.01240118, 0.12471382, 0.862885  ]),
 'K': array([0.65365282, 0.34634718]),
 'L': array([0.01427432, 0.02176041, 0.20297567, 0.11903228, 0.14486486,
        0.49709246]),
 'M': array(1.),
 'N': array([0.28492662, 0.71507338]),
 'P': array([0.52981214, 0.02252872, 0.08653078, 0.36112835]),
 'Q': array([0.58365402, 0.41634598]),
 'R': array([0.52326591, 0.17693644, 0.00899867, 0.06975731, 0.00644436,
        0.2145973 ]),
 'S': array([0.13103875, 0.08997771, 0.26366225, 0.09723907, 0.02546738,
        0.39261483]),
 'T': array([0.22493882, 0.36226687, 0.03283024, 0.37996407]),
 'V': array([0.02488439, 0.07837

In [8]:
# Build the output table and write it to optim.csv.
#
# Each combined fraction is attached to its codon by amino-acid label rather
# than by position: flattening comp_d in insertion order only lines up with
# df's rows when every amino acid's codons form one contiguous run in the
# table, and silently mislabels codons otherwise.
uncovered = ~df['Amino acid'].isin(list(comp_d))
if uncovered.any():
    missing = sorted(set(df.loc[uncovered, 'Amino acid'].astype(str)))
    raise ValueError('comp_d has no entry for: ' + ', '.join(missing))

fraction_col = pd.Series(np.nan, index=df.index, dtype=float)
for aa, comp in comp_d.items():
    # comp's components follow the row order of df[df['Amino acid'] == aa].
    # A single-codon group can come back as a 0-d array, so promote it to 1-d
    # before assigning. A length mismatch raises instead of misaligning.
    fraction_col.loc[df['Amino acid'] == aa] = np.atleast_1d(comp)

frac_d = {'Triplet': df['Triplet'], 'Amino acid': df['Amino acid'], 'Fraction': fraction_col}

optim_df = pd.DataFrame(data=frac_d)

# The row index is written as the first (unnamed) CSV column, as before.
optim_df.to_csv('optim.csv')