In [1]:
%matplotlib inline

# Imports

In [14]:
###Standard imports
import pandas as pd
from collections import Counter
from scipy import stats
import numpy as np

###Slightly specialized
from Bio import SeqIO
from Bio.Data import CodonTable

###Really specialized
import sys
sys.path.append('../../iCUB/')
import iCUB

# Define constants for genome extraction

In [3]:
###Parse the GenBank file; the assert below requires exactly one record in it
genbank_records = list(SeqIO.parse('../Data/ecoli_info/NC_000913.3.gb', 'genbank'))
assert len(genbank_records) == 1
genome = genbank_records[0]

###Number of nucleotides taken immediately upstream of each coding sequence start
length_of_upstream_seq = 30

# Establish a general dataframe describing various coding sequence features from the genome

In [4]:
###Parallel lists, one entry per retained coding sequence
loci = []
gene_ids = []
cds_seqs = []
us_seqs = []
starts = []
stops = []
strands = []

###The genome length never changes inside the loop, so measure it once
nt_seq_len = len(genome.seq)
###Translate the numeric strand codes into the symbols stored in the table
strand_symbols = {1: '+', -1: '-'}

for feature in genome.features:
    if feature.type != 'CDS': ###Caring only about protein coding sequences aka CDS
        continue
    ###A missing (or empty) qualifier is recorded as an empty string
    locus_id = (feature.qualifiers.get('locus_tag') or [''])[0]
    gene_id = (feature.qualifiers.get('gene') or [''])[0]
    start = feature.location.start
    stop = feature.location.end
    assert stop > start
    strand = strand_symbols.get(feature.location.strand)
    if strand is None:
        ###Stopping quietly here would leave a silently truncated table, so fail loudly
        raise ValueError('ERROR in strand: {!r} for locus {!r}'.format(feature.location.strand, locus_id))
    if strand == '-':
        ###On the reverse strand the upstream region lies past the feature end
        if stop + length_of_upstream_seq > nt_seq_len: ###Case that may occur at chromosome end
            continue
        cds_seq = str(genome.seq[start:stop].reverse_complement())
        us_seq = str(genome.seq[stop:stop+length_of_upstream_seq].reverse_complement())
    else:
        if start < length_of_upstream_seq: ###Case that may occur at chromosome beginning
            continue
        cds_seq = str(genome.seq[start:stop])
        us_seq = str(genome.seq[start-length_of_upstream_seq:start])
    ###Add everyone to growing lists because it's a faster way to set up dataframes
    loci.append(locus_id)
    gene_ids.append(gene_id)
    starts.append(start)
    stops.append(stop)
    strands.append(strand)
    cds_seqs.append(cds_seq)
    us_seqs.append(us_seq)

In [5]:
###Assemble the per-gene lists into one table with a row per coding sequence
column_names = ['locus_tag', 'gene', 'start_loc', 'stop_loc', 'strand', 'cds_seq', 'us_seq']
gene_records = zip(loci, gene_ids, starts, stops, strands, cds_seqs, us_seqs)
df = pd.DataFrame(gene_records)
df.columns = column_names
print(df.shape)
df.head()

(4357, 7)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq
0,b0001,thrL,189,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC
1,b0002,thrA,336,2799,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC
2,b0003,thrB,2800,3733,+,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC
3,b0004,thrC,3733,5020,+,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA
4,b0005,yaaX,5233,5530,+,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT


In [6]:
###Sanity check: tally the upstream sequence lengths next to the configured length
print(length_of_upstream_seq)
us_seq_lengths = df['us_seq'].str.len()
print(Counter(list(us_seq_lengths)))

30
Counter({30: 4357})


In [7]:
###The coordinate-derived length must agree with the extracted sequence length
df['cds_len'] = df['stop_loc'] - df['start_loc']
extracted_lengths = df['cds_seq'].str.len()
assert (df['cds_len'] == extracted_lengths).all()
print('Max and min cds lengths:', df['cds_len'].max(), '\t', df['cds_len'].min())

Max and min cds lengths: 8622 	 42


**Keep track of genes that will and won't be useful for downstream analyses**

The only addition at the end should be a `well_behaved` column

In [8]:
print(df.shape)
###First ensure that locus tags are unique (the first row of any repeated tag is kept).
###The copy makes the result an independent frame before new columns are added to it.
df = df.drop_duplicates('locus_tag').copy()

###Next set up a simple column that tracks the suitability of each gene
df['well_behaved'] = True

canonical_bases = ['A', 'T', 'G', 'C']

###Ensure that the coding sequence is a multiple of 3 (otherwise, what is happening?)
###Rows are selected with a boolean mask through .loc; .at only addresses single cells
df.loc[df['cds_seq'].str.len() % 3 != 0, 'well_behaved'] = False

###Count the number of canonical bases in each coding sequence
cds_canonical_counts = sum(df['cds_seq'].str.count(letter) for letter in canonical_bases)
###And ensure that this number equals the coding sequence length
df.loc[(df['stop_loc'] - df['start_loc']) != cds_canonical_counts, 'well_behaved'] = False

###Repeat the above procedure for upstream sequence regions
us_canonical_counts = sum(df['us_seq'].str.count(letter) for letter in canonical_bases)
###And ensure that this number equals the predefined upstream sequence length
df.loc[us_canonical_counts != length_of_upstream_seq, 'well_behaved'] = False

###The counts live in local variables, so the only column added is `well_behaved`
print(df.shape)

(4357, 8)
(4355, 9)


**Write a set of well-behaved coding sequences to run through ROC-SEMPPR (associated R code)**

In [9]:
###Export only the genes that passed every sanity check, one FASTA record per gene
clean_genes = df[df['well_behaved']==True]
with open('../Data/ecoli_info/NC_000913.3.CLEAN.CDS.fasta', 'w') as outfile:
    for locus_tag, cds_seq in zip(clean_genes['locus_tag'], clean_genes['cds_seq']):
        outfile.write('>{}\n'.format(locus_tag))
        outfile.write('{}\n'.format(cds_seq))

# Add in GC percent data

In [10]:
###Tally each canonical base per coding sequence in a scratch table (same row index as df)
base_counts = pd.DataFrame({letter: df['cds_seq'].str.count(letter) for letter in ['A', 'T', 'G', 'C']})
###Stored as a fraction between 0 and 1: (G + C) over all canonical bases
df['GC_percent_cds'] = base_counts[['G', 'C']].sum(axis=1)/base_counts[['A', 'T', 'G', 'C']].sum(axis=1)
print(df.shape)

(4355, 10)


# Add the `ROC-SEMPPR` data 

(After running the `R` code offline)

`ROC-SEMPPR` is defined here:
https://academic.oup.com/gbe/article/7/6/1559/2465799

It is a pretty novel, and in my opinion under-cited / under-used, metric of codon usage bias that is rooted in population genetic theory. The calculations take some time to run, but require only the **set** of all coding sequences in question from a particular genome, with no other *a priori* information necessary. In this sense it's slightly more complex than `iCUB` but uses far less information than many other models that purport to predict gene expression from sequence information.

In [11]:
###Load the offline ROC-SEMPPR output; only its first two columns are used
rocsemppr_raw = pd.read_csv('../Data/roc_semppr_first_pass.csv')
rocsemppr_df = rocsemppr_raw.iloc[:, :2].copy()
rocsemppr_df.columns = ['locus_tag', 'roc_semppr_mean']
###Outer join: locus tags present in only one of the two tables are still kept
df = pd.merge(df, rocsemppr_df, on='locus_tag', how='outer')
print(df.shape)
df.head()

(4355, 11)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean
0,b0001,thrL,189,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC,66,True,0.515152,1.244106
1,b0002,thrA,336,2799,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,2463,True,0.530654,1.034078
2,b0003,thrB,2800,3733,+,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,933,True,0.562701,0.994168
3,b0004,thrC,3733,5020,+,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,1287,True,0.528361,1.17675
4,b0005,yaaX,5233,5530,+,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,297,True,0.538721,0.837528


# Add in `iCUB`

`iCUB` is a simple codon usage bias metric that was originally designed/implemented here:
https://royalsocietypublishing.org/doi/full/10.1098/rsif.2017.0667

It is of a class of codon usage bias metrics that simply calculate "disorder" from a single coding sequence with no other *a priori* information given (including knowledge of any other values in the genome in question). This class of metrics is best known for the `Effective Number of Codons`, but many subsequent works have illustrated the limitations of this basic metric and have proposed various improvements and variants. `iCUB` appears to out-perform all known metrics of this simple class.

In [12]:
###Raw iCUB value for every well-behaved coding sequence, in table order
well_behaved_cds = df[df['well_behaved']==True]['cds_seq']
icub_vals = [iCUB.iCUB_Calculator(str(sequence)).get_iCUB() for sequence in well_behaved_cds]


In [25]:
###Linear rescaling that maps a raw value of 20 to 1 and a raw value of 61 to 0
###NOTE(review): 20 and 61 are presumably the bounds of the raw iCUB metric - confirm against the iCUB package
icub_offset = 20.
icub_span = 41.
scaled_icub_vals = 1 - (np.array(icub_vals) - icub_offset)/icub_span
print(max(scaled_icub_vals), min(scaled_icub_vals))

###Rows that are not well behaved keep NaN; the rest are filled through a boolean mask
###(.loc is used because .at only addresses a single cell)
well_behaved_mask = df['well_behaved']==True
df['iCUB'] = np.nan
df.loc[well_behaved_mask, 'iCUB'] = scaled_icub_vals

1.0 0.04044184838990261


# Add in the Codon Adaptation Index (`CAI`)

The `Codon Adaptation Index` is one of the most famous codon usage bias metrics, originally proposed here:
https://academic.oup.com/nar/article-abstract/15/3/1281/1166844

It has a crazy number of citations and generally performs well at the task of predicting gene expression from sequence information. However, it requires some arbitrary and extraneous information. At its heart, the metric is a **distance** metric. It measures the distance of a given coding sequence (in terms of its codon usage bias) to a **pre-defined reference set of coding sequences** that are presumed to be highly expressed, and from which codon-level weights are calculated. Some work has been done to try to choose that reference set in an automated fashion from only knowledge in the genome, but to my knowledge nothing seems to outperform the pre-defined reference sets proposed in the paper (used below). And of course, this is to be expected since that reference set is a bit of a cheat since it already requires some outside gene expression knowledge and folds this knowledge into predictions of gene expression.

In [26]:
def get_codon_dicts(n=11):
    """
    Gets the codon to amino acid look up dictionary and the amino acid to 
    (list of) codons dictionary using Biopython's CodonTable module

    n is the key used to pick a table out of CodonTable.unambiguous_dna_by_id.
    Returns the tuple (codon_to_aa, aa_to_codons).
    """
    codon_to_aa = CodonTable.unambiguous_dna_by_id[n].forward_table
    aa_to_codons = {}
    for codon, aa in codon_to_aa.items():
        ###setdefault starts the list the first time an amino acid is seen,
        ###without a catch-all except that could hide unrelated errors
        aa_to_codons.setdefault(aa, []).append(codon)
    return codon_to_aa, aa_to_codons
    
def flatten_lol(lol):
    """
    Concatenates the items of every inner list, in order, into one flat list
    """
    flattened = []
    for sublist in lol:
        flattened.extend(sublist)
    return flattened

def tripletize(cds_string):
    """
    Cuts a coding sequence into consecutive, non-overlapping pieces of length
    three (codons). The input length must be a multiple of three.
    """
    total_length = len(cds_string)
    assert total_length % 3 == 0
    codons = []
    for codon_start in range(0, total_length, 3):
        codons.append(cds_string[codon_start:codon_start+3])
    return codons

def calculate_RSCUs(codon_count_dict):
    """
    Relative synonymous codon usage for the codons of *ONE* amino acid.

    Takes a single codon:count dictionary and returns codon:RSCU, where each
    RSCU is the codon's count divided by the mean count over the dictionary.
    When the counts sum to zero every codon gets an RSCU of 0.
    """
    n_total = sum(codon_count_dict.values())
    if n_total == 0:
        return {codon: 0 for codon in codon_count_dict}
    ###Count each codon would have if all synonymous codons were used equally
    expected_count = float(n_total) / len(codon_count_dict.keys())
    return {codon: observed / expected_count for codon, observed in codon_count_dict.items()}

def calculate_codon_weights(list_of_cds_strings, zero_val=0.001):
    """
    Calculates the weight of each codon from a reference set of genes

    list_of_cds_strings is a list of coding sequences (a single bare string is
    also accepted and treated as a one-gene set). Every sequence must have a
    length that is a multiple of three, otherwise tripletize's assert fails.

    For each amino acid in the table from get_codon_dicts, a codon's weight is
    its RSCU divided by the largest RSCU among that amino acid's codons. Codons
    with an RSCU of zero receive zero_val instead. Only codons listed in that
    table appear in the returned codon:weight dictionary.
    """
    ###Get housekeeping dictionaries (only the amino acid -> codons map is needed)
    _, aa_to_codons = get_codon_dicts()

    ###A bare string would otherwise be iterated character by character
    if isinstance(list_of_cds_strings, str):
        list_of_cds_strings = [list_of_cds_strings]

    ###Pool the codons of every reference gene, whatever the size of the set
    all_codons = flatten_lol([tripletize(cds) for cds in list_of_cds_strings])
    all_codon_counts = Counter(all_codons) #create dictionary of codon counts in the set

    ###Workhorse of the code
    weights_dict = {}
    for aa, codons in aa_to_codons.items(): #iterates through single amino acids
        ###Counter reports 0 for codons never seen, so the counts can always be summed
        single_aa_codon_counts = {codon: all_codon_counts[codon] for codon in codons}
        single_rscu_dict = calculate_RSCUs(single_aa_codon_counts) #get RSCU dictionary for the aa
        max_rscu = max(single_rscu_dict.values()) #weights are normalized by the max RSCU
        for codon, rscu_val in single_rscu_dict.items():
            if rscu_val == 0:
                weights_dict[codon] = zero_val
            else:
                weights_dict[codon] = rscu_val / max_rscu
    return weights_dict

def calculate_cai(single_cds_str, weights_dict):
    """
    Geometric mean of the codon weights along one coding sequence (string).

    Codons that are missing from weights_dict, or whose weight is falsy
    (None or 0), are left out of the mean.
    """
    looked_up = [weights_dict.get(codon, None) for codon in tripletize(single_cds_str)]
    usable_weights = [weight for weight in looked_up if weight]
    return stats.gmean(usable_weights)

**The CAI relies on a reference set of sequences, previously defined, to determine individual codon weight values. I have taken these from the original 1987 implementation as described for *E. coli***

In [27]:
###One locus tag per line, no header row
reference_set_loci = pd.read_csv('../Data/ecoli_info/original_CAI_refset.txt', header=None, names=['locus_tag'])
###Merging on the locus tag pulls out the table rows that belong to the reference set
ref_set_df = df.merge(reference_set_loci, on='locus_tag')
ref_set_seqs = list(ref_set_df['cds_seq'])
cai_weights_dict = calculate_codon_weights(ref_set_seqs)

In [28]:
###CAI is only computed for well-behaved genes; every other row keeps NaN
well_behaved_mask = df['well_behaved']==True
cai_vals = [calculate_cai(cds_seq, cai_weights_dict)
            for cds_seq in df.loc[well_behaved_mask, 'cds_seq']]

###Fill through a boolean mask with .loc (.at only addresses a single cell)
df['CAI'] = np.nan
df.loc[well_behaved_mask, 'CAI'] = cai_vals

# Add in knowledge of the `tAI` 

The so-called `tRNA adaptation index` is a very orthogonal metric that uses knowledge of the tRNA genes contained within the genome of interest to calculate a coding sequence specific metric of adaptation to this hypothetical tRNA pool. To my knowledge it was originally proposed here:
https://academic.oup.com/nar/article/32/17/5036/1333956

And I am taking the original codon weight values for *E. coli* from here:
https://www.pnas.org/content/107/8/3645

There was a subsequent re-implementation / variation that I'm also taking data from:
https://academic.oup.com/bioinformatics/article/33/4/589/2593585

But since that method is woefully inadequate in terms of its accessibility and usability (matlab?!), I'm taking the actual codon-level weights for this variant from yet another publication that (as best as I can tell) simply re-implemented the same basic model with more transparency:
https://genominfo.org/journal/view.php?doi=10.5808/GI.2018.16.4.e28

This is all a long wind up to say that I should probably re-implement the weight calculations from scratch one day. 

In [29]:
###Original tAI weights: headerless file; rows with nothing in the second column are skipped
weights_df1 = pd.read_csv('../Data/ecoli_info/NC_000913.3.original_tAI_weights.csv', header=None)
has_weight = weights_df1[1].notnull()
weights_df1 = weights_df1[has_weight]
weights_df1.columns = ['codon', 'value']
weights_dict1 = weights_df1.set_index('codon')['value'].to_dict()
###stAIcalc weights: the first column of the file is read as the row index
weights_df2 = pd.read_csv('../Data/ecoli_info/NC_000913.3.stAIcalc_weights.csv', index_col=0)
weights_df2.columns = ['codon', 'value']
weights_dict2 = weights_df2.set_index('codon')['value'].to_dict()

In [30]:
###Both tAI variants are the geometric mean of per-codon weights along a gene;
###they differ only in the weight table used
well_behaved_mask = df['well_behaved']==True
tai_vals1 = []
tai_vals2 = []
for cds_seq in df.loc[well_behaved_mask, 'cds_seq']:
    codons = tripletize(str(cds_seq))
    ###Codons with no weight (or a zero weight) in a table are left out of that table's mean
    tempy1 = [weights_dict1[codon] for codon in codons if weights_dict1.get(codon, None)]
    tempy2 = [weights_dict2[codon] for codon in codons if weights_dict2.get(codon, None)]
    tai_vals1.append(stats.gmean(tempy1))
    tai_vals2.append(stats.gmean(tempy2))

###Rows that are not well behaved keep NaN; fill the rest through a boolean mask
###(.loc is used because .at only addresses a single cell)
df['tAI'] = np.nan
df['stAIcalc'] = np.nan
df.loc[well_behaved_mask, 'tAI'] = tai_vals1
df.loc[well_behaved_mask, 'stAIcalc'] = tai_vals2

# Write data to a file

In [31]:
###Save the full table as tab-separated text, without the row index
master_table_path = '../Data/ecoli_info/current_ecoli_master_table.tsv'
df.to_csv(master_table_path, sep='\t', index=False)

# Scratch