In [1]:
%matplotlib inline

# Imports

In [2]:
###Standard imports
import pandas as pd
from collections import Counter
from scipy import stats
import numpy as np
import re

###Slightly specialized
from Bio import SeqIO
from Bio.Data import CodonTable

###Really specialized
import sys
sys.path.append('../../iCUB/')
import iCUB

# Define constants for genome extraction

In [3]:
# Organism switch read by the cells below; the values they recognise are 'ecoli' and 'scer'
# (anything else makes them raise 'Incorrect organism identifier').
# organism = 'ecoli'
organism = 'scer'

In [4]:
###Load the raw sequence records for the selected organism
if organism not in ('ecoli', 'scer'):
    raise Exception('Incorrect organism identifier')

if organism == 'ecoli':
    ###The GenBank file must hold exactly one record, which becomes `genome`
    genbank_records = list(SeqIO.parse('../Data/ecoli_info/NC_000913.3.gb', 'genbank'))
    assert len(genbank_records) == 1
    genome = genbank_records[0]
else:
    ###One FASTA record per coding sequence; report how many were read
    features = list(SeqIO.parse('../Data/scer_info/GCA_000269885.1_ASM26988v1_cds_from_genomic.fna', 'fasta'))
    n_features = len(features)
    assert n_features > 0
    print(n_features)

5451


# Establish a general dataframe describing various coding sequence features from the genome

In [5]:
###Parallel lists with one entry per coding sequence; building the dataframe
###once from lists is faster than growing it row by row
loci = []
gene_ids = []
cds_seqs = []
starts = []
stops = []
strands = []

if organism == 'ecoli':
    strand_symbols = {1: '+', -1: '-'}
    for feature in genome.features:
        if feature.type != 'CDS': ###Caring only about protein coding sequences aka CDS
            continue
        ###Missing (or empty) qualifiers are recorded as empty strings
        locus_id = (feature.qualifiers.get('locus_tag') or [''])[0]
        gene_id = (feature.qualifiers.get('gene') or [''])[0]
        ###NOTE(review): start/stop here are Biopython location values, whereas the scer
        ###branch stores the integers written in the FASTA header -- confirm both follow
        ###the coordinate convention that downstream code expects
        start = feature.location.start
        stop = feature.location.end
        assert stop > start
        strand = strand_symbols.get(feature.location.strand)
        if strand is None:
            ###Fail loudly: breaking out of the loop would silently drop every remaining gene
            raise Exception('ERROR in strand')
        ###Slice the genome; minus-strand genes are reverse complemented
        cds_seq = genome.seq[start:stop]
        if strand == '-':
            cds_seq = cds_seq.reverse_complement()
        cds_seq = str(cds_seq)
        loci.append(locus_id)
        gene_ids.append(gene_id)
        starts.append(start)
        stops.append(stop)
        strands.append(strand)
        cds_seqs.append(cds_seq)

elif organism == 'scer':
    for feature in features:
        ###The FASTA description carries "[key=value]" tags. Split on the first '='
        ###only, so values that themselves contain '=' are kept whole; bracketed
        ###text without any '=' is ignored.
        desc_dict = {}
        for desc in re.findall(r"\[(.*?)\]", feature.description):
            key, separator, value = desc.partition('=')
            if separator:
                desc_dict[key] = value
        if desc_dict['gbkey'] != 'CDS':
            raise Exception('Somehow found a non-cds entry')
        locus_id = desc_dict['locus_tag']
        gene_id = desc_dict['protein']
        location_str = desc_dict['location']
        if ('join' in location_str) or ('>' in location_str) or ('<' in location_str):
            ###Joined or partial locations are not parsed; coordinates stay unknown
            start = None
            stop = None
            strand = None
        elif 'complement' in location_str:
            ###e.g. "complement(START..STOP)"
            location = location_str[location_str.index('(')+1:].split('..')
            assert len(location) == 2
            start = int(location[0])
            stop = int(location[1].rstrip(')'))
            strand = '-'
            assert stop > start
        else:
            ###e.g. "START..STOP"
            location = location_str.split('..')
            assert len(location) == 2
            start = int(location[0])
            stop = int(location[1])
            strand = '+'
            assert stop > start

        ###The FASTA record already is the coding sequence
        cds_seq = str(feature.seq)

        loci.append(locus_id)
        gene_ids.append(gene_id)
        starts.append(start)
        stops.append(stop)
        strands.append(strand)
        cds_seqs.append(cds_seq)
else:
    raise Exception('Incorrect organism identifier')

In [6]:
###Assemble the per-gene lists into one dataframe (one row per coding sequence)
column_names = ['locus_tag', 'gene', 'start_loc', 'stop_loc', 'strand', 'cds_seq']
gene_records = zip(loci, gene_ids, starts, stops, strands, cds_seqs)
df = pd.DataFrame(gene_records)
df.columns = column_names
print(df.shape)
df.head()

(5451, 6)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq
0,CENPK1137D_4927,Gdh3p,31625.0,32998.0,+,ATGACAAGCGAACCAGAGTTTCAGCAGGCTTACGATGAGATCGTTT...
1,CENPK1137D_4938,Bdh2p,33506.0,34759.0,+,ATGAGAGCCTTAGCGTATTTCGGTAAAGGTAACATCAGATTCACCA...
2,CENPK1137D_4949,Bdh1p,35211.0,36359.0,+,ATGAGAGCTTTGGCATATTTCAAGAAGGGTGATATTCACTTCACTA...
3,CENPK1137D_4960,Ecm1p,36565.0,37203.0,+,ATGTGGGAACAAAGACGACAAAAGGTAGTTTTTTCCTTGACTATAC...
4,CENPK1137D_4971,Cne1p,37520.0,39028.0,+,ATGAAATTTTCTGCGTATTTATGGTGGCTGTTTTTGAATCTAGCGT...


In [7]:
###Add the coding sequence length. For E. coli it comes from the coordinates and is
###cross-checked against the sequence; for S. cerevisiae it is the sequence length itself.
if organism not in ('ecoli', 'scer'):
    raise Exception('Incorrect organism identifier')

seq_lengths = df['cds_seq'].str.len()
if organism == 'ecoli':
    df['cds_len'] = df['stop_loc'] - df['start_loc']
    assert all(df['cds_len'] == seq_lengths)
else:
    df['cds_len'] = seq_lengths
print('Max and min cds lengths:', df['cds_len'].max(), '\t', df['cds_len'].min())

Max and min cds lengths: 14733 	 78


**Keep track of genes that will and won't be useful for downstream analyses**

The only addition at the end should be a `well_behaved` column

In [8]:
print(df.shape)
###First ensure that locus tags are unique (the first occurrence of each tag is kept).
###The explicit copy makes the column assignments below write to an independent frame.
###NOTE(review): if repeated tags should be removed entirely, pass keep=False instead.
df = df.drop_duplicates('locus_tag').copy()

###Next set up a simple column that tracks the suitability of each gene
df['well_behaved'] = True

###Ensure that the coding sequence is a multiple of 3 (otherwise, what is happening?)
###(.loc with a boolean mask is used because .at only addresses a single cell)
df.loc[df['cds_len'] % 3 != 0, 'well_behaved'] = False

###Count the number of canonical bases (no temporary columns needed)
canonical_base_count = sum(df['cds_seq'].str.count(letter) for letter in ['A', 'T', 'G', 'C'])
###And ensure that this number equals the coding sequence length
df.loc[df['cds_len'] != canonical_base_count, 'well_behaved'] = False
print(df.shape)

###Remove suspiciously(?) long genes: anything above five times the median length
len_to_cut = df['cds_len'].median()*5
df.loc[df['cds_len'] > len_to_cut, 'well_behaved'] = False

print()
print('Well behaved?')
print(df['well_behaved'].value_counts())

(5451, 7)
(5451, 8)

Well behaved?
True     5398
False      53
Name: well_behaved, dtype: int64


In [9]:
###Display the unusually short genes: CDS shorter than one tenth of the median length
df[df['cds_len'] < df['cds_len'].median()/10]

Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,cds_len,well_behaved
613,CENPK1137D_3732,Ost4p,37455.0,37565.0,+,ATGATCTCTGATGAACAGCTGAACTCCTTGGCCATCACCTTCGGTA...,111,True
1238,CENPK1137D_4287,Mfa1p,1383872.0,1383982.0,+,ATGCAACCATCTACCGCTACCGCCGCTCCAAAAGAAAAGACCAGCA...,111,True
2654,CENPK1137D_1406,Rpl39p,58821.0,58940.0,+,ATGGCTAAGGCTAAGAAGCAAAACAGACCATTGCCACAATGGATCA...,120,True
3479,CENPK1137D_564,Rpl37ap,507862.0,507939.0,+,ATGAGATACTTGAAACACGTTTCAAGAAGATTCAAGAACGGTTTCC...,78,True
3721,CENPK1137D_832,Yrf1-2p,1050972.0,1051073.0,+,ATGCGTACTTTCACTGACTTTGTTTCCGGCGCACCTATTGTAAGGA...,102,True
4315,CENPK1137D_2535,Mfa2p,346260.0,346376.0,+,ATGCAACCGATCACCACTGCTTCCACACAAGCCACTCAGAAGGATA...,117,True


**Write a set of well-behaved coding sequences to run through ROC-SEMPPR (associated R code)**

In [10]:
###Write every well-behaved gene as a two-line FASTA entry (">locus_tag" then the sequence)
clean_fasta_paths = {
    'ecoli': '../Data/ecoli_info/NC_000913.3.CLEAN.CDS.fasta',
    'scer': '../Data/scer_info/SCER.CLEAN.CDS.fasta',
}
if organism not in clean_fasta_paths:
    raise Exception('Incorrect organism identifier')

well_behaved_rows = df[df['well_behaved']==True]
with open(clean_fasta_paths[organism], 'w') as outfile:
    for tag, seq in zip(well_behaved_rows['locus_tag'], well_behaved_rows['cds_seq']):
        outfile.write('>{}\n'.format(tag))
        outfile.write('{}\n'.format(seq))

# Add in GC percent data

In [11]:
###Per-gene counts of each canonical base, kept out of the dataframe
base_counts = {base: df['cds_seq'].str.count(base) for base in ['A', 'T', 'G', 'C']}
gc_count = base_counts['G'] + base_counts['C']
canonical_count = gc_count + base_counts['A'] + base_counts['T']
###G+C over A+T+G+C; the column therefore holds a ratio, not a value scaled by 100
df['GC_percent_cds'] = gc_count / canonical_count
print(df.shape)

(5451, 9)


# Add the `ROC-SEMPPR` data 

(After running the `R` code offline)

`ROC-SEMPPR` is defined here:
https://academic.oup.com/gbe/article/7/6/1559/2465799

It is a pretty novel, and in my opinion under-cited / under-used, metric of codon usage bias that is rooted in population genetic theory. The calculations take some time to run, but require only the **set** of all coding sequences in question from a particular genome, with no other *a priori* information necessary. In this sense it's slightly more complex than `iCUB` but uses far less information than many other models that purport to predict gene expression from sequence information.

In [12]:
###Load the per-gene ROC-SEMPPR table for the selected organism
if organism == 'ecoli':
    rocsemppr_df = pd.read_csv('../Data/ecoli_info/roc_semppr_ecoli.csv')
elif organism == 'scer':
    rocsemppr_df = pd.read_csv('../Data/scer_info/roc_semppr_scer.csv')
else:
    ###Same guard as the other organism switches; without it a bad identifier
    ###surfaces as a NameError below (or silently reuses a frame from an earlier run)
    raise Exception('Incorrect organism identifier')
print(rocsemppr_df.shape)
rocsemppr_df.head()

(5398, 9)


Unnamed: 0,GeneID,Mean,Mean.log10,Std.Dev,log10.Std.Dev,0.025,0.975,log10.0.025,log10.0.975
0,CENPK1137D_4927,1.077532,0.022556,0.219183,0.096088,0.646828,1.519269,-0.189211,0.181635
1,CENPK1137D_4938,0.559819,-0.288175,0.205555,0.193109,0.172446,0.969412,-0.763346,-0.013492
2,CENPK1137D_4949,1.741424,0.235808,0.262108,0.067413,1.237648,2.264724,0.092597,0.355015
3,CENPK1137D_4960,0.997815,-0.030071,0.333965,0.171363,0.35916,1.674934,-0.444713,0.223998
4,CENPK1137D_4971,0.164382,-0.877871,0.101657,0.307771,0.026633,0.414207,-1.574574,-0.382784


In [13]:
###Keep only the first two columns (gene identifier and mean), rename them to match
###the master table, then attach them with an outer join on the locus tag
unused_roc_columns = rocsemppr_df.columns[2:]
rocsemppr_df = rocsemppr_df.drop(columns=unused_roc_columns)
rocsemppr_df.columns = ['locus_tag', 'roc_semppr_mean']
df = pd.merge(df, rocsemppr_df, on='locus_tag', how='outer')
print(df.shape)
df.head()

(5451, 10)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean
0,CENPK1137D_4927,Gdh3p,31625.0,32998.0,+,ATGACAAGCGAACCAGAGTTTCAGCAGGCTTACGATGAGATCGTTT...,1374,True,0.485444,1.077532
1,CENPK1137D_4938,Bdh2p,33506.0,34759.0,+,ATGAGAGCCTTAGCGTATTTCGGTAAAGGTAACATCAGATTCACCA...,1254,True,0.484848,0.559819
2,CENPK1137D_4949,Bdh1p,35211.0,36359.0,+,ATGAGAGCTTTGGCATATTTCAAGAAGGGTGATATTCACTTCACTA...,1149,True,0.449956,1.741424
3,CENPK1137D_4960,Ecm1p,36565.0,37203.0,+,ATGTGGGAACAAAGACGACAAAAGGTAGTTTTTTCCTTGACTATAC...,639,True,0.403756,0.997815
4,CENPK1137D_4971,Cne1p,37520.0,39028.0,+,ATGAAATTTTCTGCGTATTTATGGTGGCTGTTTTTGAATCTAGCGT...,1509,True,0.4334,0.164382


# Add in `iCUB`

`iCUB` is a simple codon usage bias metric that was originally designed/implemented here:
https://royalsocietypublishing.org/doi/full/10.1098/rsif.2017.0667

It is of a class of codon usage bias metrics that simply calculate "disorder" from a single coding sequence with no other *a priori* information given (including knowledge of any other values in the genome in question). This class of metrics is best known for the `Effective Number of Codons`, but many subsequent works have illustrated the limitations of this basic metric and have proposed various improvements and variants. `iCUB` appears to out-perform all known metrics of this simple class.

In [14]:
###Raw iCUB value for every well-behaved gene, in dataframe row order
well_behaved_seqs = df.loc[df['well_behaved']==True, 'cds_seq']
icub_vals = [iCUB.iCUB_Calculator(str(seq)).get_iCUB() for seq in well_behaved_seqs]


In [15]:
###Linear rescaling that maps a raw value of 20 to 1 and a raw value of 61 to 0
###(presumably the range of the raw iCUB score -- TODO confirm against the iCUB package)
scaled_icub_vals = 1 - (np.array(icub_vals) - 20.)/41.
print(max(scaled_icub_vals), min(scaled_icub_vals))
###Write the values back to the rows they were computed from; .loc with a boolean
###mask is used because .at only addresses a single cell
well_behaved_mask = df['well_behaved'] == True
df.loc[well_behaved_mask, 'iCUB'] = scaled_icub_vals

0.8810444628772327 0.01332116017728835


# Add in the Codon Adaptation Index (`CAI`)

The `Codon Adaptation Index` is one of the most famous codon usage bias metrics, originally proposed here:
https://academic.oup.com/nar/article-abstract/15/3/1281/1166844

It has a crazy number of citations and generally performs well at the task of predicting gene expression from sequence information. However, it requires some arbitrary and extraneous information. At its heart, the metric is a **distance** metric. It measures the distance of a given coding sequence (in terms of its codon usage bias) to a **pre-defined reference set of coding sequences** that are presumed to be highly expressed, and from which codon-level weights are calculated. Some work has been done to try to choose that reference set in an automated fashion from only knowledge in the genome, but to my knowledge nothing seems to outperform the pre-defined reference sets proposed in the paper (used below). And of course, this is to be expected since that reference set is a bit of a cheat since it already requires some outside gene expression knowledge and folds this knowledge into predictions of gene expression.

So I wrote some code to calculate weights starting from the reference set gene names (see below, commented). In the end, to ensure that I was getting things right or mostly right for *S. cerevisiae* I skipped this step and just copy/pasted the table of codon weights from the original paper into a `.txt` (tab-separated) file. 

In [16]:
###The commented-out code below computes the codon weights from a reference set, if you want to do it
###yourself; I'm just going to load the dictionaries from the original paper to keep things consistent
# def get_codon_dicts(n=11):
#     """
#     Gets the codon to amino acid look up dictionary and the amino acid to 
#     (list of) codons dictionary using Biopython's CodonTable module
#     """
#     codon_to_aa = CodonTable.unambiguous_dna_by_id[n].forward_table
#     aa_to_codons = {}
#     for codon, aa in codon_to_aa.items():
#         try:
#             aa_to_codons[aa].append(codon)
#         except:
#             aa_to_codons[aa] = [codon]
#     return codon_to_aa, aa_to_codons 
    
# def flatten_lol(lol):
#     """
#     Merges a list of lists into a single large list
#     """
#     return [inner for outer in lol for inner in outer]

# def tripletize(cds_string):
#     """
#     Split a coding sequence into triplets (codons)
#     """
#     assert len(cds_string) % 3 == 0
#     return [cds_string[i:i+3] for i in range(0, len(cds_string), 3)]

# def calculate_RSCUs(codon_count_dict):
#     """
#     Calculates the relative synonymous codon usage values for a *SINGLE* dictionary
#     containing codon counts (codon:count) for *ONE* amino acid
#     """
#     n_total = sum(codon_count_dict.values())
#     unique_codons = len(codon_count_dict.keys())
#     rscu_dict = {}  
#     if n_total != 0:
#         for codon, codon_count in codon_count_dict.items():
#             rscu_dict[codon] = codon_count / (float(n_total) / unique_codons)
#     else:
#         for codon, codon_count in codon_count_dict.items():
#             rscu_dict[codon] = 0 
#     return rscu_dict

# def calculate_codon_weights(list_of_cds_strings, zero_val=0.001):
#     """
#     Calculates the weight of each codon from a reference set of genes
#     """
#     ###Get housekeeping dictionaries
#     codon_to_aa, aa_to_codons = get_codon_dicts()
    
#     if len(list_of_cds_strings) > 1: #when given a list of strings, make it one long list of codons
#         all_codons = []
#         for cds in list_of_cds_strings:
#             all_codons.append(tripletize(cds))
#         all_codons = flatten_lol(all_codons)
#     else:
#         all_codons = tripletize(list_of_cds_strings)

#     all_codon_counts = Counter(all_codons) #create dictionary of codon counts in the set
    
#     ###Workhorse of the code
#     weights_dict = {}
#     for aa, codons in aa_to_codons.items(): #iterates through single amino acids
#         single_aa_codon_counts = {k: all_codon_counts.get(k, None) for k in codons}
#         single_rscu_dict = calculate_RSCUs(single_aa_codon_counts) #get RSCU dictionary for the aa
#         max_rscu = max(single_rscu_dict.values()) #weights are normalized by the max RSCU
#         for codon, rscu_val in single_rscu_dict.items():
#             if single_rscu_dict[codon] == 0:
#                 weights_dict[codon] = zero_val
#             else:
#                 weights_dict[codon] = rscu_val / max_rscu
#     return weights_dict

# def calculate_cai(single_cds_str, weights_dict):
#     """
#     Calculates the CAI from a gene (string) and a dictionary of codon weight values.
#     """
#     codon_list = tripletize(single_cds_str)
#     weights_list = []
#     for codon in codon_list:
#         weights_list.append(weights_dict.get(codon, None))
#     weights_list =  [i for i in weights_list if i] 
#     cai = stats.gmean(weights_list)
#     return cai

# reference_set_loci = pd.read_csv('../Data/ecoli_info/original_CAI_refset.txt', header=None, names=['locus_tag'])
# ref_set_seqs = list(df.merge(reference_set_loci, on='locus_tag')['cds_seq'])
# cai_weights_dict = calculate_codon_weights(ref_set_seqs)

# cai_vals = []
# for index in df[df['well_behaved']==True].index:
#     cds_seq = df.loc[index]['cds_seq']
#     cai_vals.append(calculate_cai(cds_seq, cai_weights_dict))
    
# df.at[df[df['well_behaved']==True].index, 'CAI'] = cai_vals

**Defining a few simple functions to help out**

In [17]:
def tripletize(cds_string):
    """
    Cut a coding sequence into consecutive, non-overlapping codons.

    Returns the list of 3-character slices in order. The length of
    `cds_string` must be a multiple of three (enforced with an assert).
    """
    n_bases = len(cds_string)
    assert n_bases % 3 == 0
    codons = []
    for offset in range(0, n_bases, 3):
        codons.append(cds_string[offset:offset + 3])
    return codons

def calculate_cai(single_cds_str, weights_dict):
    """
    Geometric mean of the codon weights of one coding sequence.

    `single_cds_str` is split into codons with `tripletize`; each codon is
    looked up in `weights_dict`. Codons that are absent from the dictionary,
    or whose weight is falsy (None or 0), are left out of the mean.
    """
    usable_weights = []
    for codon in tripletize(single_cds_str):
        weight = weights_dict.get(codon)
        if weight:
            usable_weights.append(weight)
    return stats.gmean(usable_weights)

In [18]:
###Read the tab-separated codon weight table (codons as the index), discard the
###three stop codons and make sure every weight is a float
stop_codons = ['UAA', 'UGA', 'UAG']
cai_weights_df = (
    pd.read_csv('../Data/original_CAI_data.txt', header=0, index_col=0, sep='\t')
    .drop(index=stop_codons)
    .astype(float)
)

###Pick the weight column that belongs to the selected organism
cai_weight_columns = {'ecoli': 'ecoli_w', 'scer': 'scer_w'}
if organism not in cai_weight_columns:
    raise Exception('Incorrect organism identifier')
cai_weights_dict = cai_weights_df[cai_weight_columns[organism]].to_dict()

In [19]:
###CAI for every well-behaved gene. The weight table is keyed by RNA codons,
###so T is replaced by U before the lookup.
well_behaved_mask = df['well_behaved'] == True
cai_vals = [
    calculate_cai(seq.replace('T', 'U'), cai_weights_dict)
    for seq in df.loc[well_behaved_mask, 'cds_seq']
]

###.loc with a boolean mask is used because .at only addresses a single cell
df.loc[well_behaved_mask, 'CAI'] = cai_vals

# Add in knowledge of the `tAI` 

The so-called `tRNA adaptation index` is a very orthogonal metric that uses knowledge of the tRNA genes contained within the genome of interest to calculate a coding sequence specific metric of adaptation to this hypothetical tRNA pool. To my knowledge it was originally proposed here:
https://academic.oup.com/nar/article/32/17/5036/1333956

And I am taking the original codon weight values for *E. coli* from here:
https://www.pnas.org/content/107/8/3645

There was a subsequent re-implementation / variation that I thought about taking from:
https://academic.oup.com/bioinformatics/article/33/4/589/2593585

But since that method is woefully inadequate in terms of its accessibility and usability (matlab?!), I'd have to take the actual codon-level weights for this variant from yet another publication that (as best as I can tell) simply re-implemented the same basic model with more transparency:
https://genominfo.org/journal/view.php?doi=10.5808/GI.2018.16.4.e28

This is all a long wind up to say that I should probably re-implement the weight calculations from scratch one day. 

In [20]:
###Two-column, header-less csv of codon weights; the file depends on the organism
tai_weight_files = {
    'ecoli': '../Data/ecoli_info/NC_000913.3.original_tAI_weights.csv',
    'scer': '../Data/scer_info/scer.original_tAI_weights.csv',
}
if organism not in tai_weight_files:
    raise Exception('Incorrect organism identifier')

weights_df = pd.read_csv(tai_weight_files[organism], header=None)
###Rows without a value in the second column are discarded
weights_df = weights_df[weights_df[1].notnull()]
weights_df.columns = ['codon', 'value']
weights_dict = weights_df.set_index('codon')['value'].to_dict()
print(weights_dict)

{'TTT': 0.27032, 'TTC': 0.615764, 'TTA': 0.43103400000000003, 'TTG': 0.753695, 'TCT': 0.6773399999999999, 'TCC': 0.48768500000000004, 'TCA': 0.184797, 'TCG': 0.12069, 'TAT': 0.216256, 'TAC': 0.49261099999999997, 'TGT': 0.108128, 'TGC': 0.24630500000000002, 'TGG': 0.369458, 'CTT': 0.027032, 'CTC': 0.061576, 'CTA': 0.184729, 'CTG': 0.059113, 'CCT': 0.123153, 'CCC': 0.08867, 'CCA': 0.615776, 'CCG': 0.197044, 'CAT': 0.189224, 'CAC': 0.43103400000000003, 'CAA': 0.554187, 'CAG': 0.238916, 'CGT': 0.369458, 'CGC': 0.26600999999999997, 'CGA': 3.7e-05, 'CGG': 0.061576, 'ATT': 0.800493, 'ATC': 0.5763550000000001, 'ATA': 0.123233, 'ATG': 0.615764, 'ACT': 0.6773399999999999, 'ACC': 0.48768500000000004, 'ACA': 0.246373, 'ACG': 0.14039400000000002, 'AAT': 0.27032, 'AAC': 0.615764, 'AAA': 0.43103400000000003, 'AAG': 1.0, 'AGT': 0.054064, 'AGC': 0.123153, 'AGA': 0.6773399999999999, 'AGG': 0.278325, 'GTT': 0.8620690000000001, 'GTC': 0.62069, 'GTA': 0.12323900000000002, 'GTG': 0.16256199999999998, 'GCT':

In [21]:
###tAI for every well-behaved gene. The computation is the same as for the CAI
###(geometric mean of the per-codon weights, skipping codons without a usable
###weight), so the shared helper is reused with the tAI weight dictionary.
well_behaved_mask = df['well_behaved'] == True
tai_vals = [
    calculate_cai(str(seq), weights_dict)
    for seq in df.loc[well_behaved_mask, 'cds_seq']
]

###.loc with a boolean mask is used because .at only addresses a single cell
df.loc[well_behaved_mask, 'tAI'] = tai_vals

In [22]:
###Pairwise Spearman rank correlations between the metrics, well-behaved genes only
metric_columns = ['roc_semppr_mean', 'CAI', 'tAI', 'iCUB']
temp_df = df.loc[df['well_behaved']==True]
for row_metric in metric_columns:
    print(row_metric)
    for col_metric in metric_columns:
        correlation = stats.spearmanr(temp_df[row_metric], temp_df[col_metric])
        print(col_metric, correlation)
    print()


roc_semppr_mean
roc_semppr_mean SpearmanrResult(correlation=1.0, pvalue=0.0)
CAI SpearmanrResult(correlation=0.9250172747260335, pvalue=0.0)
tAI SpearmanrResult(correlation=0.8875837399616455, pvalue=0.0)
iCUB SpearmanrResult(correlation=0.5072941634891676, pvalue=0.0)

CAI
roc_semppr_mean SpearmanrResult(correlation=0.9250172747260335, pvalue=0.0)
CAI SpearmanrResult(correlation=1.0, pvalue=0.0)
tAI SpearmanrResult(correlation=0.9075184803486147, pvalue=0.0)
iCUB SpearmanrResult(correlation=0.4340810273671703, pvalue=5.817828829969477e-247)

tAI
roc_semppr_mean SpearmanrResult(correlation=0.8875837399616455, pvalue=0.0)
CAI SpearmanrResult(correlation=0.9075184803486147, pvalue=0.0)
tAI SpearmanrResult(correlation=1.0, pvalue=0.0)
iCUB SpearmanrResult(correlation=0.4774859983827323, pvalue=1.4419036001928668e-305)

iCUB
roc_semppr_mean SpearmanrResult(correlation=0.5072941634891676, pvalue=0.0)
CAI SpearmanrResult(correlation=0.4340810273671703, pvalue=5.817828829969477e-247)
tAI Spea

# Write data to a file

In [23]:
###Save the full table (all genes, index omitted) as a tab-separated file
master_table_paths = {
    'ecoli': '../Data/ecoli_info/current_ecoli_master_table.tsv',
    'scer': '../Data/scer_info/current_scer_master_table.tsv',
}
if organism not in master_table_paths:
    raise Exception('Incorrect organism identifier')
df.to_csv(master_table_paths[organism], sep='\t', index=False)

# Scratch

In [24]:
###Scratch: final shape and first rows of the master table
print(df.shape)
df.head()

(5451, 13)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean,iCUB,CAI,tAI
0,CENPK1137D_4927,Gdh3p,31625.0,32998.0,+,ATGACAAGCGAACCAGAGTTTCAGCAGGCTTACGATGAGATCGTTT...,1374,True,0.485444,1.077532,0.089819,0.166896,0.414135
1,CENPK1137D_4938,Bdh2p,33506.0,34759.0,+,ATGAGAGCCTTAGCGTATTTCGGTAAAGGTAACATCAGATTCACCA...,1254,True,0.484848,0.559819,0.101853,0.146111,0.374852
2,CENPK1137D_4949,Bdh1p,35211.0,36359.0,+,ATGAGAGCTTTGGCATATTTCAAGAAGGGTGATATTCACTTCACTA...,1149,True,0.449956,1.741424,0.105991,0.2393,0.4089
3,CENPK1137D_4960,Ecm1p,36565.0,37203.0,+,ATGTGGGAACAAAGACGACAAAAGGTAGTTTTTTCCTTGACTATAC...,639,True,0.403756,0.997815,0.136281,0.180668,0.399201
4,CENPK1137D_4971,Cne1p,37520.0,39028.0,+,ATGAAATTTTCTGCGTATTTATGGTGGCTGTTTTTGAATCTAGCGT...,1509,True,0.4334,0.164382,0.050166,0.126947,0.323906


In [25]:
###Scratch: how many genes ended up flagged as well behaved vs. not
df['well_behaved'].value_counts()

True     5398
False      53
Name: well_behaved, dtype: int64