In [1]:
%matplotlib inline

In [43]:
from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd

In [3]:
# Parse the GenBank file and insist that it holds exactly one record,
# which is then bound to `genome`.
genome_records = list(SeqIO.parse('../Data/ecoli_info/NC_000913.3.gb', 'genbank'))
assert len(genome_records) == 1
genome = genome_records[0]

In [4]:
# Walk every CDS feature of `genome` and collect, in parallel lists:
# locus tag, gene name, start/end coordinates, strand ('+'/'-'), the coding
# sequence (5'->3' on the coding strand) and the 30 nt immediately upstream
# of the CDS start (also on the coding strand).
loci = []
gene_ids = []
cds_seqs = []
us_seqs = []
starts = []
stops = []
strands = []

upstream_len = 30  # number of nucleotides kept upstream of each CDS start
nt_seq_len = len(genome.seq)  # loop-invariant, so computed once

for feature in genome.features:
    if feature.type != 'CDS':
        continue
    # A missing (or empty) qualifier falls back to the empty string.
    locus_id = (feature.qualifiers.get('locus_tag') or [''])[0]
    gene_id = (feature.qualifiers.get('gene') or [''])[0]
    start = feature.location.start
    stop = feature.location.end
    strand = feature.location.strand
    if strand == 1:
        strand = '+'
    elif strand == -1:
        strand = '-'
    else:
        # Strand is neither 1 nor -1: report and stop processing features.
        print('ERROR')
        break
    if strand == '-':
        # On the reverse strand the upstream region lies past `stop`;
        # skip features whose upstream window would run off the sequence end.
        if stop + upstream_len > nt_seq_len:
            continue
        us_seq = str(genome.seq[stop:stop + upstream_len].reverse_complement())
    else:
        # Skip features too close to the sequence start for a full window.
        if start < upstream_len:
            continue
        us_seq = str(genome.seq[start - upstream_len:start])
    # feature.extract() reverse-complements minus-strand features and joins
    # the parts of compound locations, unlike a plain start:stop slice.
    cds_seq = str(feature.extract(genome.seq))
    loci.append(locus_id)
    gene_ids.append(gene_id)
    starts.append(start)
    stops.append(stop)
    strands.append(strand)
    cds_seqs.append(cds_seq)
    us_seqs.append(us_seq)

In [108]:
# Assemble the per-CDS lists into one table. Column names are passed to the
# constructor so that an empty result still yields a frame with the expected
# columns instead of failing on the column assignment.
column_names = ['locus_tag', 'gene', 'start_loc', 'stop_loc', 'strand', 'cds_seq', 'us_seq']
df = pd.DataFrame(
    list(zip(loci, gene_ids, starts, stops, strands, cds_seqs, us_seqs)),
    columns=column_names,
)
print(df.shape)
df.head()

(4357, 7)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq
0,b0001,thrL,189,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC
1,b0002,thrA,336,2799,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC
2,b0003,thrB,2800,3733,+,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC
3,b0004,thrC,3733,5020,+,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA
4,b0005,yaaX,5233,5530,+,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT


In [110]:
# Keep only the first row encountered for each locus tag.
df = df.drop_duplicates(subset='locus_tag', keep='first')

# Codon Adaptation Index (CAI) calculation

In [80]:
def get_codon_dicts(n=11):
    """
    Build codon <-> amino-acid lookup dictionaries for a genetic code table.

    Parameters
    ----------
    n : int
        Identifier of the translation table looked up in
        ``CodonTable.unambiguous_dna_by_id`` (default 11).

    Returns
    -------
    (codon_to_aa, aa_to_codons)
        ``codon_to_aa`` is the table's forward table (codon -> amino acid);
        ``aa_to_codons`` maps each amino acid to the list of codons
        encoding it.
    """
    # Honour the requested table id rather than always using table 11.
    codon_to_aa = CodonTable.unambiguous_dna_by_id[n].forward_table
    aa_to_codons = {}
    for codon, aa in codon_to_aa.items():
        aa_to_codons.setdefault(aa, []).append(codon)
    return codon_to_aa, aa_to_codons
            
# def get_single_codon_counts(AminoAcid, listofCodons):
#     """Counts instances of each alternative codon for an amino acid in a list"""
#     dictOfCodonCounts = {}  
#     for codonOption in AminoAcid[1]:
#         count = listofCodons.count(codonOption)
#         dictOfCodonCounts[codonOption] = count
#     return dictOfCodonCounts
    
def flatten_lol(lol):
    """
    Concatenate the items of every inner iterable into a single flat list,
    preserving order.
    """
    flat = []
    for sublist in lol:
        flat.extend(sublist)
    return flat

def tripletize(cds_string):
    """
    Split a sequence into consecutive, non-overlapping triplets (codons).

    The sequence length must be a multiple of three; otherwise an
    AssertionError is raised.
    """
    n_nt = len(cds_string)
    assert n_nt % 3 == 0
    codons = []
    for offset in range(0, n_nt, 3):
        codons.append(cds_string[offset:offset + 3])
    return codons

def calculate_RSCUs(codon_count_dict):
    """
    Compute a relative usage value for each codon in ``codon_count_dict``.

    Each codon's count is divided by the count expected if the total were
    spread evenly over all codons in the dict (total / number of codons).
    When the total count is zero, every codon is assigned 0.

    Returns a dict mapping the same codons to their values.
    """
    n_total = sum(codon_count_dict.values())
    if n_total == 0:
        return {codon: 0 for codon in codon_count_dict}
    # Count each codon would have under perfectly uniform usage.
    expected_count = float(n_total) / len(codon_count_dict)
    return {
        codon: codon_count / expected_count
        for codon, codon_count in codon_count_dict.items()
    }

def calculate_weights(list_of_cds_strings, zero_val=0.001):
    """
    Calculate the weight of each codon from a reference set of genes.

    Codons from all reference sequences are pooled and counted. For each
    amino acid, every synonymous codon's RSCU value is divided by the
    largest RSCU value among that amino acid's codons.

    Parameters
    ----------
    list_of_cds_strings : list of str, or str
        Reference coding sequences; a single sequence string is also
        accepted. Every sequence length must be a multiple of three
        (enforced by ``tripletize``).
    zero_val : float
        Weight assigned to codons whose RSCU is 0, i.e. codons never
        observed in the reference set.

    Returns
    -------
    dict
        Maps each codon in the table from ``get_codon_dicts()`` to its weight.
    """
    from collections import Counter
    _, aa_to_codons = get_codon_dicts()

    # Accept a bare sequence string by treating it as a one-gene reference set.
    if isinstance(list_of_cds_strings, str):
        list_of_cds_strings = [list_of_cds_strings]

    # Pool the codons of every reference gene; this handles zero, one or
    # many sequences uniformly.
    all_codon_counts = Counter()
    for cds in list_of_cds_strings:
        all_codon_counts.update(tripletize(cds))

    weights_dict = {}
    for aa, codons in aa_to_codons.items():
        # Codons absent from the reference set count as 0 (not None), so
        # the totals in calculate_RSCUs can be summed.
        single_aa_codon_counts = {k: all_codon_counts.get(k, 0) for k in codons}
        single_rscu_dict = calculate_RSCUs(single_aa_codon_counts)
        max_rscu = max(single_rscu_dict.values())
        for codon, rscu_val in single_rscu_dict.items():
            if rscu_val == 0:
                # Also covers max_rscu == 0, so no division by zero occurs.
                weights_dict[codon] = zero_val
            else:
                weights_dict[codon] = rscu_val / max_rscu
    return weights_dict

In [111]:
# Read the locus tags of the reference gene set, pull their coding sequences
# out of `df` via an inner merge, and derive the codon weights from them.
ref_set_path = '../Data/ecoli_info/original_CAI_refset.txt'
reference_set_loci = pd.read_csv(ref_set_path, header=None, names=['locus_tag'])
ref_set_df = df.merge(reference_set_loci, on='locus_tag')
ref_set_seqs = list(ref_set_df['cds_seq'])
cai_weights_dict = calculate_weights(ref_set_seqs)

In [112]:
from scipy.stats import gmean
def calculate_cai(cds_seq, weights_dict):
    """
    Return the geometric mean of the codon weights of ``cds_seq``.

    The sequence is split into codons with ``tripletize``; each codon is
    looked up in ``weights_dict``. Codons with no entry (or a falsy weight)
    are left out of the mean.
    """
    looked_up = (weights_dict.get(codon) for codon in tripletize(cds_seq))
    # Drop codons that have no usable weight before averaging.
    usable_weights = [weight for weight in looked_up if weight]
    return gmean(usable_weights)

In [113]:
# Compute one CAI value per row of `df`; sequences whose length is not a
# multiple of three cannot be split into codons and get None instead.
cai_vals = []
for index, cds_seq in df['cds_seq'].items():
    in_frame = len(cds_seq) % 3 == 0
    cai_vals.append(calculate_cai(cds_seq, cai_weights_dict) if in_frame else None)

In [114]:
# Store the computed CAI values as a new 'CAI' column of the table.
df['CAI'] = cai_vals

In [115]:
# Write the table as a tab-separated file, without the index column.
master_table_path = '../Data/ecoli_info/temp_ecoli_master_table.tsv'
df.to_csv(master_table_path, sep='\t', index=False)