Example code for encoding protein sequences probabilistically based on codon usage

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Input CSV of library peptides; encode() reads its '15mers' column.
fname_in = 'nCov_proteome_15mers+SARS-CoV-1 N+S.csv'

In [3]:
#Codon usage of yeast: https://www.genscript.com/tools/codon-frequency-table

#Selecting codons based on amino acid.  Each sampler below follows the format of:
########
#def aa_X():
#   codon = np.random.choice(['dna1','dna2',etc], p=[prob_dna1, prob_dna2, etc])
#   return codon
########
#(the probabilities given for one amino acid must sum to 1)

def aa_A():
    """Randomly pick one alanine (A) codon, weighted by yeast codon usage."""
    codons = ['GCT', 'GCC', 'GCA', 'GCG']
    weights = [0.38, 0.22, 0.29, 0.11]
    return np.random.choice(codons, p=weights)

def aa_C():
    """Randomly pick one cysteine (C) codon, weighted by yeast codon usage."""
    usage = {'TGT': 0.63, 'TGC': 0.37}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_D():
    """Randomly pick one aspartic acid (D) codon, weighted by yeast codon usage."""
    codons = ['GAT', 'GAC']
    weights = [0.65, 0.35]
    return np.random.choice(codons, p=weights)

def aa_E():
    """Randomly pick one glutamic acid (E) codon, weighted by yeast codon usage."""
    usage = {'GAA': 0.71, 'GAG': 0.29}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_F():
    """Randomly pick one phenylalanine (F) codon, weighted by yeast codon usage."""
    codons = ['TTT', 'TTC']
    weights = [0.59, 0.41]
    return np.random.choice(codons, p=weights)

def aa_G():
    """Randomly pick one glycine (G) codon, weighted by yeast codon usage."""
    usage = {'GGT': 0.47, 'GGC': 0.19, 'GGA': 0.22, 'GGG': 0.12}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_H():
    """Randomly pick one histidine (H) codon, weighted by yeast codon usage."""
    codons = ['CAT', 'CAC']
    weights = [0.64, 0.36]
    return np.random.choice(codons, p=weights)

def aa_I():
    """Randomly pick one isoleucine (I) codon, weighted by yeast codon usage."""
    usage = {'ATT': 0.464, 'ATC': 0.263, 'ATA': 0.273}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_K():
    """Randomly pick one lysine (K) codon, weighted by yeast codon usage."""
    codons = ['AAA', 'AAG']
    weights = [0.58, 0.42]
    return np.random.choice(codons, p=weights)

def aa_L():
    """Randomly pick one leucine (L) codon, weighted by yeast codon usage."""
    codons = ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']
    weights = [0.2782, 0.2856, 0.1286, 0.0569, 0.1412, 0.1095]
    return np.random.choice(codons, p=weights)

def aa_M():
    """Return the methionine (M) codon; 'ATG' is the only option listed."""
    # Still routed through np.random.choice, like the other samplers, so the
    # return type and any use of the global random generator stay the same.
    codons = ['ATG']
    weights = [1]
    return np.random.choice(codons, p=weights)

def aa_N():
    """Randomly pick one asparagine (N) codon, weighted by yeast codon usage."""
    usage = {'AAT': 0.59, 'AAC': 0.41}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_P():
    """Randomly pick one proline (P) codon, weighted by yeast codon usage."""
    codons = ['CCT', 'CCC', 'CCA', 'CCG']
    weights = [0.3098, 0.1549, 0.4146, 0.1207]
    return np.random.choice(codons, p=weights)

def aa_Q():
    """Randomly pick one glutamine (Q) codon, weighted by yeast codon usage."""
    usage = {'CAA': 0.69, 'CAG': 0.31}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_R():
    """Randomly pick one arginine (R) codon, weighted by yeast codon usage."""
    codons = ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
    weights = [0.1467, 0.0587, 0.0677, 0.0384, 0.4808, 0.2077]
    return np.random.choice(codons, p=weights)

def aa_S():
    """Randomly pick one serine (S) codon, weighted by yeast codon usage."""
    usage = {  # codon -> relative frequency
        'TCT': 0.26,
        'TCC': 0.16,
        'TCA': 0.21,
        'TCG': 0.1,
        'AGT': 0.16,
        'AGC': 0.11,
    }
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_T():
    """Randomly pick one threonine (T) codon, weighted by yeast codon usage."""
    codons = ['ACT', 'ACC', 'ACA', 'ACG']
    weights = [0.35, 0.22, 0.3, 0.13]
    return np.random.choice(codons, p=weights)

def aa_V():
    """Randomly pick one valine (V) codon, weighted by yeast codon usage."""
    usage = {'GTT': 0.39, 'GTC': 0.21, 'GTA': 0.21, 'GTG': 0.19}  # codon -> relative frequency
    return np.random.choice(list(usage), p=list(usage.values()))

def aa_W():
    """Return the tryptophan (W) codon; 'TGG' is the only option listed."""
    # Still routed through np.random.choice, like the other samplers, so the
    # return type and any use of the global random generator stay the same.
    codons = ['TGG']
    weights = [1]
    return np.random.choice(codons, p=weights)

def aa_Y():
    """Randomly pick one tyrosine (Y) codon, weighted by yeast codon usage."""
    codons = ['TAT', 'TAC']
    weights = [0.56, 0.44]
    return np.random.choice(codons, p=weights)

In [4]:
# Dispatch table: one-letter amino-acid code -> zero-argument function that
# returns one randomly chosen codon for that amino acid.
aa_2_dna = dict(
    A=aa_A, C=aa_C, D=aa_D, E=aa_E, F=aa_F,
    G=aa_G, H=aa_H, I=aa_I, K=aa_K, L=aa_L,
    M=aa_M, N=aa_N, P=aa_P, Q=aa_Q, R=aa_R,
    S=aa_S, T=aa_T, V=aa_V, W=aa_W, Y=aa_Y,
)

In [5]:
# Seed NumPy's global random generator so the randomly chosen codons are
# reproducible from run to run.
np.random.seed(100)
def revtrans(seq):
    """Reverse-translate an amino-acid sequence into a DNA string.

    Each residue of ``seq`` is looked up in ``aa_2_dna`` and the function
    stored there is called once to obtain a codon. The codons are joined in
    residue order.

    Raises:
        KeyError: if a residue has no entry in ``aa_2_dna``.
    """
    codons = []
    for residue in seq:
        pick_codon = aa_2_dna[residue]
        codons.append(pick_codon())
    return ''.join(codons)

In [6]:
def encode(fname_in):
    """Reverse-translate every peptide in a CSV file and save the result.

    Reads the '15mers' column of ``fname_in``, strips surrounding whitespace
    from each peptide, reverse-translates it with ``revtrans`` and wraps the
    resulting DNA in two fixed flanking sequences. The peptides and their
    encoded DNA are written to ``<fname_in without '.csv'>_encoded.csv`` with
    the columns 'aa' and 'dna'.

    Args:
        fname_in: path of the input CSV; must contain a '15mers' column.

    Returns:
        None. The output is written to disk.

    Raises:
        KeyError: if the '15mers' column is missing. Errors raised by
            ``revtrans`` are propagated unchanged.
    """
    # Remove the '.csv' suffix explicitly. str.strip('.csv') would instead
    # strip any of the characters '.', 'c', 's', 'v' from BOTH ends of the
    # name, e.g. turning 'peptides.csv' into 'peptide'.
    suffix = '.csv'
    if fname_in.endswith(suffix):
        stem = fname_in[:-len(suffix)]
    else:
        stem = fname_in
    fname_out = stem + '_encoded.csv'

    # Fixed sequences placed before and after every encoded peptide.
    upstream = 'GTTATTGCTAGCGTTTTGGCAGCT'
    downstream = 'GGTGGATCCGGTGGCGGA'

    library = pd.read_csv(fname_in)  # library peptides
    aa_seqs = [peptide.strip() for peptide in library['15mers'].values.tolist()]

    # One revtrans call per peptide, in file order, so the sequence of random
    # draws is the same as when looping over the peptides one by one.
    dna_seqs = [upstream + revtrans(seq) + downstream for seq in aa_seqs]

    # Save encoded peptides.
    encoded = pd.DataFrame({'aa': aa_seqs, 'dna': dna_seqs})
    encoded.to_csv(fname_out, index=False)

In [7]:
# Encode the peptides in fname_in; the result is written to a new
# '..._encoded.csv' file derived from the input filename.
encode(fname_in)