In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

# Making use of Representative Genomes to limit species to include in OMA orthologs

See: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0018910

This step was performed manually, using the "Export marker genes" functionality of OMA (Jan. 2021, see: https://omabrowser.org/oma/export_markers).

Briefly, I selected three datasets: 

1. From *all* gammaproteobacteria, I used the rgg-15 database
2. From Enterobacterales I used the rp75 database
3. From Enterobacteriacaea I used the rp75 database

In each case, I attempted to manually include every genome listed in the associated rgg database below the indicated taxonomy level. This wasn't always possible due to some database discrepancies but resulted in fairly broad coverage.

In each case I exported all marker genes (Maximum nr of markers = -1) and ensured that each ortholog occured in 50% of coverd species (Minimum fraction of covered species: 0.5)

In [185]:
###Went through and tried to find all of the genomes present from this set
# df = pd.read_csv('../Data/ecoli_info/evolutionary_analysis/rgg-15.txt', sep='\t',
#                  encoding = 'ISO-8859-1', header=None)

##For this set I used OMA to first limit taxonomy then for each genus tried to find
##species in this dataframe (given that the numbers were so much larger and include
##lots of species outside of my planned for taxonomy)
df = pd.read_csv('../Data/ecoli_info/evolutionary_analysis/rgg-75.txt', sep='\t',
                 encoding = 'ISO-8859-1', header=None)

print(df.shape)
df = df[df[0].str[0]=='>']
print(df.shape)
df = df[df[8] == 'Bac/Gamma-proteo']
print(df.shape)
df = df[df[6] != '9GAMM']
print(df.shape)
df.head()

(3596, 12)
(2564, 12)
(326, 12)
(294, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
5,>UP000000225,16723,58631.0,382245.0,Aeromonas salmonicida subsp. salmonicida A449,382245,AERS4,Aeromonas salmonicida (strain A449),Bac/Gamma-proteo,"27112.20073(PPS:0,1,3,12.20,4154)",75(CUTOFF),
6,>UP000000229,17457,58723.0,399739.0,Pseudomonas mendocina ymp,399739,PSEMY,Pseudomonas mendocina (strain ymp),Bac/Gamma-proteo,"37111.25085(PPS:1,1,0,11.22,4529)",75(CUTOFF),RefP
7,>UP000000230,17461,58727.0,399742.0,Enterobacter sp. 638,399742,ENT38,Enterobacter sp (strain 638).,Bac/Gamma-proteo,"37113.95578(PPS:1,1,2,14.12,4218)",75(CUTOFF),RefP
11,>UP000000233,16817,58641.0,379731.0,Pseudomonas stutzeri A1501,379731,PSEU5,Pseudomonas stutzeri (strain A1501),Bac/Gamma-proteo,"37112.05181(PPS:1,1,1,12.08,4077)",75(CUTOFF),RefP
18,>UP000000238,16064,58483.0,349521.0,Hahella chejuensis KCTC 2396,349521,HAHCH,Hahella chejuensis (strain KCTC 2396),Bac/Gamma-proteo,"37108.46491(PPS:1,1,2,8.16,6731)",75(CUTOFF),RefP


**Manually looking for certain names in the rgg database, used when finding species to include in the Enterobacterales and Enterobactericaea clades**

In [None]:
for i in df.index:
    if 'Wigglesworthia' in df.loc[i][4]:
        print(df.loc[i][4], '*****', df.loc[i][6])
#     print(df.loc[i][4], '*****', df.loc[i][6])

# Selecting group and species sets to move forward with

Parse through the downloaded orthologs and try to find a more limited set of: 

1. species and 
2. ortholog families

to move forward with that would ensure relatively similar trees

In [16]:
from Bio import SeqIO
import glob
from collections import Counter, defaultdict
import gzip
import subprocess

In [210]:
# data_loc = '../Data/ecoli_info/evolutionary_analysis/marker_genes_rp15_gammaproteobacteria/'
# data_loc = '../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacterales/'
data_loc = '../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/'
fasta_files = glob.glob(data_loc + '*.fa')
print(len(fasta_files))

2787


**First getting the number of genes per genome, and removing genomes with too few hits**

In [211]:
###Dictionary to hold hit counts per species
species_count_dict = defaultdict(int)

for fasta_file in fasta_files[:]:
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    ids = [record.id[:5] for record in records]
    id_counter = Counter(ids)
    for single_id in id_counter:
        species_count_dict[single_id]+=1
        
temp_items = species_count_dict.items()
listy = sorted(temp_items, key=lambda x: x[1])
max_val = listy[-1][1] ###Identify the largest number of hits
to_delete = [i[0] for i in listy if i[1] < max_val*0.8] ###Remove species below this threshold
print('Species to remove:', to_delete)
valid_species = [i[0] for i in listy if i[0] not in to_delete]
valid_species_count = len(valid_species)
print('Number of species to keep:', valid_species_count)

Species to remove: ['RIEPU', 'MOREP', 'BLOFL', 'BLOVB', 'BLOPB', 'HAMD5']
Number of species to keep: 23


**Given the newly limited set of species, identify high coverage ortholog families (i.e. those with representatives in most genomes**

In [212]:
###What I ultimately wish to keep
valid_fasta_files = []

###Dictionary to hold hit counts per genome set
og_count_dict = defaultdict(int)
for fasta_file in fasta_files[:]:
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    ids = [record.id[:5] for record in records]
    
    ###Select only from the valid species list
    ids = sorted([single_id for single_id in ids if single_id not in to_delete])
    if len(ids) < valid_species_count * 0.8: ###Remove orthologs below this threshold
        continue
    valid_fasta_files.append(fasta_file)
    
    ###Putting together the genome set for the ortholog
    long_id = '_%_'.join(ids)
    og_count_dict[long_id] += 1
    
print(np.sum(list(og_count_dict.values())))
print(len(valid_fasta_files))

2324
2324


**Inspecting the genome set hits (solely as a check / out of curiosity)**

In [213]:
temp_items = og_count_dict.items()
listy = sorted(temp_items, key=lambda x: x[1], reverse=True)
for i in listy[:5]:
    print(i)
    print()

('CITK8_%_CITRI_%_CROS8_%_ECOK1_%_ECOL6_%_ECOLI_%_ENT38_%_ENTAL_%_ENTBF_%_ENTCC_%_ENTLS_%_ESCF3_%_KLEOK_%_KLEP3_%_KLEP7_%_SALAR_%_SALBC_%_SALHS_%_SALTY_%_SHIB3_%_SHIBC_%_SHIDS_%_SHIFL', 1361)

('CITK8_%_CITRI_%_CROS8_%_ECOL6_%_ECOLI_%_ENT38_%_ENTAL_%_ENTBF_%_ENTCC_%_ENTLS_%_ESCF3_%_KLEOK_%_KLEP3_%_KLEP7_%_SALAR_%_SALBC_%_SALHS_%_SALTY_%_SHIB3_%_SHIBC_%_SHIDS_%_SHIFL', 85)

('CITK8_%_CITRI_%_CROS8_%_ECOK1_%_ECOL6_%_ECOLI_%_ENT38_%_ENTAL_%_ENTBF_%_ENTCC_%_ENTLS_%_ESCF3_%_KLEOK_%_KLEP3_%_KLEP7_%_SALAR_%_SALBC_%_SALHS_%_SALTY_%_SHIB3_%_SHIBC_%_SHIFL', 70)

('CITK8_%_CITRI_%_CROS8_%_ECOK1_%_ECOL6_%_ECOLI_%_ENT38_%_ENTAL_%_ENTBF_%_ENTCC_%_ENTLS_%_ESCF3_%_KLEOK_%_KLEP3_%_KLEP7_%_SALAR_%_SALBC_%_SALHS_%_SALTY_%_SHIB3_%_SHIDS_%_SHIFL', 50)

('CITK8_%_CITRI_%_CROS8_%_ECOK1_%_ECOL6_%_ECOLI_%_ENT38_%_ENTAL_%_ENTBF_%_ENTCC_%_ENTLS_%_ESCF3_%_KLEOK_%_KLEP3_%_KLEP7_%_SALAR_%_SALBC_%_SALHS_%_SALTY_%_SHIBC_%_SHIDS_%_SHIFL', 46)



**Write \*new\* ortholog fasta files for the ortholog families that passed the test and limit these files to include only the valid set of genomes. Place in a new folder**

In [214]:
for fasta_file in valid_fasta_files:
    print(fasta_file)
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    records = [record for record in records if record.id[:5] in valid_species]
    with open(fasta_file.replace('marker_genes_', ''), 'w') as outfile:
        SeqIO.write(records, outfile, 'fasta')

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593658.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_945534.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_918072.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_909097.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961790.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593598.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593609.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_920059.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_770899.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_959608.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593639.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_816074.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_958414.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_112366.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_890693.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_902971.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_816087.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_940152.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_838769.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961850.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_937775.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_865138.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_888093.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_842023.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_828922.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961840.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_918221.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_926617.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_750335.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_923172.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_887575.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_957247.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_928278.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_957096.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_806772.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_867804.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594747.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_531616.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_876523.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_808730.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961944.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_887571.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_933771.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_889572.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_959122.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_845289.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_880340.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_955135.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594421.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_894573.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_111688.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_834228.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_922498.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_601197.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_913141.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_866510.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_952548.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_840642.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_939195.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_854313.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961177.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_733950.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_788618.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_750723.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_918874.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_932750.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_936359.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_949121.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_755201.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_899571.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593575.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_121405.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_951199.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_955710.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_955813.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_892538.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_950135.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_596241.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_955650.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_871686.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593663.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_929902.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_938575.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_123616.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593871.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_894262.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_946974.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_781906.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_943283.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593632.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_763098.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_933579.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_956045.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594629.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_883321.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_916771.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_954111.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_803039.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_950478.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_939382.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593940.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_901643.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594201.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_925554.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_551288.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_728500.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_848880.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594250.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_754804.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_955847.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_705123.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_901846.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_588099.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_600983.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593567.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_941193.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_910613.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594247.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_960452.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593536.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_842872.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_926145.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_932414.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_798865.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_879843.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_722410.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_883242.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_881040.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_941548.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_137709.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593580.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593611.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_909893.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_909401.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_958010.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_935072.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_775578.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_818651.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_909681.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_935163.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_941262.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593696.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593985.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_948691.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_873965.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_126347.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_912925.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_857801.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_126256.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_925473.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_927952.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_929900.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_915052.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_929421.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_766215.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_865337.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593552.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_901552.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_787129.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593956.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_897790.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_902111.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_122061.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_947280.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_951897.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594549.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_894779.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_821923.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_703961.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_939432.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_799742.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_933752.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_850586.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_940939.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_801206.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_874773.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_865895.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_123217.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_876736.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_552751.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_894752.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_829184.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_786444.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_111647.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_810383.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_887572.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_901182.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_950593.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_121363.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_571150.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_958714.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_890694.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_703875.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_124349.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_754968.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_727911.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_961614.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_595001.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_945661.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_734701.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_122913.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_920308.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_957669.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_940380.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_915009.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593789.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_593558.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_937430.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_594338.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_125203.fa
../Data/ecoli_info/evolutionar

../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_530253.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_771933.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_888090.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_942192.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_759421.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_945641.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_570906.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_734692.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_750213.fa
../Data/ecoli_info/evolutionary_analysis/marker_genes_rp75_enterobacteriacaea/OMAGroup_122725.fa
../Data/ecoli_info/evolutionar

**Make sure that I have the coding sequences that I need on hand. These are going to be extracted from the humongous OMA groups cdna file for the species that I care about**

In [216]:
out_dir = '../Data/ecoli_info/evolutionary_analysis/CODING_SEQUENCES/'

###Make sure that I don't repeat species since this takes some time to run
finished_species = [i.split('/')[-1][:-10] for i in glob.glob(out_dir+'*')]
print('Finished species:\n', finished_species)
print()
species_to_write = [i for i in valid_species if i not in finished_species]
print('New species to write:\n', species_to_write)

Finished species:
 ['CELJU', 'ESCF3', 'XANCP', 'KLEP3', 'KLEOK', 'SHIB3', 'HALNC', 'CYCSP', 'ERWBE', 'NITOC', 'CROS8', 'ECOL6', 'LEGPH', 'IDILO', 'ECOK1', 'ERWT9', 'ALKEH', 'ALCBS', 'ENTCC', 'METCA', 'PANVC', 'YERPE', 'PANAA', 'METFJ', 'SALHS', 'EDWTF', 'DICD3', 'SHIBC', 'RAHAC', 'MARMS', 'SHIFL', 'PSEAE', 'SHIDS', 'SERP5', 'SALTY', 'SERFO', 'SERSA', 'DICZ5', 'PECAS', 'SHEON', 'COLP3', 'ENTBF', 'ENTLS', 'SALAR', 'FRAAD', 'KLEP7', 'ENT38', 'KANKD', 'ACIAD', 'CITK8', 'ERWPE', 'HAHCH', 'HALED', 'CITRI', 'YERE8', 'ERWAC', 'VIBCH', 'SALBC', 'ECOLI', 'ENTAL', 'PANSA']

New species to write:
 []


**Slowly iterate through the huge cDNA file and for every record see if it's in the species list that I care about, if so... save it in a dictionary and at the end write each new file**

In [217]:
if species_to_write != []:
    cds_dicty = defaultdict(dict)
    with gzip.open('../Data/ecoli_info/evolutionary_analysis/prokaryotes.cdna.fa.gz', 'rt') as infile:
        for record in SeqIO.parse(infile, 'fasta'):
            if record.id[:5] not in species_to_write:
                continue
            cds_dicty[record.id[:5]][record.id] = record
    for key, seq_dict in cds_dicty.items():
        fname = '../Data/ecoli_info/evolutionary_analysis/CODING_SEQUENCES/{}.cds.fasta'.format(key)
        with open(fname, 'w') as outfile:
            SeqIO.write(list(seq_dict.values()), outfile, 'fasta')

# Create corresponding CDS files for each OMA aa file

In [218]:
all_cds_seqs = {}
for genome_file in glob.glob('../Data/ecoli_info/evolutionary_analysis/CODING_SEQUENCES/*.cds.fasta'):
    records = SeqIO.parse(genome_file, 'fasta')
    for record in records:
        all_cds_seqs[record.id] = record
print(len(all_cds_seqs.keys()))

256891


In [219]:
good_files = glob.glob(data_loc.replace('marker_genes_', '')+'*.fa')
print('Number of amino acid fasta files to work with:', len(good_files))
for fa_file in good_files[:]:
    aa_records = list(SeqIO.parse(fa_file, 'fasta'))
    cds_records = [all_cds_seqs[record.id] for record in aa_records]
    assert len(aa_records) == len(cds_records) ###Make sure no errors popped up
    with open(fa_file.replace('.fa', '.cds.fna'), 'w') as outfile:
        for i, record in enumerate(aa_records):
            outfile.write('>{}\n'.format(record.description))
            outfile.write('{}\n'.format(str(cds_records[i].seq)))

Number of amino acid fasta files to work with: 2324


# Run MAFFT to align amino acid sequences

Just doing a basic MAFFT analysis, could consider some more intense parameters at some point

In [220]:
for fa_file in good_files:
    aligned_file = fa_file.replace('.fa', '.aln')
    subprocess.call('mafft {} > {}'.format(fa_file, aligned_file), shell=True)

# Get nucleotide alignments from aligned amino acid sequences and raw nucleotide sequences using `pal2nal`.

I thought about doing this manually before, but seems like this program is pretty standard?

In [221]:
print(len(good_files))
for fa_file in good_files[:]:
    aln_file = fa_file.replace('.fa', '.aln')
    cds_file = fa_file.replace('.fa', '.cds.fna')
    cds_align = cds_file.replace('.fna', '.aln')
    with open(cds_align, 'w') as outfile:
        subprocess.call('~/workspace/pal2nal/pal2nal.pl -output fasta -nostderr {} {}'.format(aln_file, cds_file),
                        stdout=outfile,
                        shell=True)

2324


# Run FastTree to make some trees from the amino acid sequences

Note the `-nosupport` flag.

I think it's better to make / use amino acid trees rather than the nucleotide level trees. Also note that I construct gene trees rather than doing a concatenated analysis and creating a large species tree (which is something that I could consider given the large number of orthologs that appear in all genomes that I selected, regardless of dataset). 

In [222]:
for fa_file in good_files[:]:
    print(fa_file)
    aln_file = fa_file.replace('.fa', '.aln')
    tree_file = fa_file.replace('.fa', '.newick')
    with open(tree_file, 'w') as outfile:
        subprocess.call('~/workspace/FastTree/FastTree -nosupport -lg {}'.format(aln_file),
                        stdout=outfile,
                        shell=True)

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593658.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_945534.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_918072.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_909097.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961790.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593598.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593609.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_920059.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_770899.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_959608.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_125043.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_55

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_936394.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961058.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_808797.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_712411.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_958540.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_861960.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_816302.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961716.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_947965.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_834929.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_951263.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_89

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_755118.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_867920.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_960076.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_919493.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_941461.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_948253.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_935229.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_550790.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_759319.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_585641.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_764406.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_93

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_902551.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594158.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_957558.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594149.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_911731.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_950338.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_122570.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_759625.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_920068.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_929959.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_816075.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_55

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_894745.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_566469.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_595588.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_784696.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_942588.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_957323.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_806496.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_962022.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_833517.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_949338.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_893333.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_90

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_923988.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_950423.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_792274.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_763041.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_887582.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_943784.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_959116.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_944275.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_919830.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_713387.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_865690.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_87

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_923540.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_745089.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_930981.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_122013.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_111688.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_834228.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_922498.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_601197.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_913141.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_866510.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_952548.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_84

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_704208.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_901642.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_845989.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_891916.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594340.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_936827.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_551289.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_934730.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_903534.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_835529.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_836740.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_84

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_946801.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593607.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_531126.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_944844.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594127.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_810894.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_917253.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593955.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_958659.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593656.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593815.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_79

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_873239.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_930700.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_796804.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961080.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_887899.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_813242.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_949291.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_923493.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_954263.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_847167.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_888624.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_95

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_596007.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_912513.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_902888.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_960905.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_904056.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593733.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_957923.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_876294.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_754677.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594320.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_834977.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_59

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_944785.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_946511.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_714046.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593785.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593554.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_126244.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_943287.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_808031.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_799196.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_862870.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_950227.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_94

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_927018.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_937051.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_957208.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_947380.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961021.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_734087.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_923942.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_940620.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_894739.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_901229.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_814561.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_94

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_955470.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_925771.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_874355.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_751187.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_571591.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_922356.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_836772.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593543.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_124356.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_894113.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_939479.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_83

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593953.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_923269.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_882817.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_903916.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593532.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_808275.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_801507.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_857776.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_929550.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_920362.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593563.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_72

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_830844.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_958164.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_828250.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_888474.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_848398.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_959348.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_958024.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_954211.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_842833.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_909586.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_957138.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_95

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_817245.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593867.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_959407.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_795313.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_944976.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_946873.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_901976.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_869147.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593735.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_943252.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_934733.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_95

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_781904.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_937618.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593873.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594110.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593933.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_862876.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_941155.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_887040.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_836681.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_594141.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593822.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_92

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_784659.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_899923.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_814940.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_935166.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_940990.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_796816.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_918873.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_949003.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_933588.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_897790.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_902111.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_12

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_933752.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_850586.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_940939.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_801206.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_874773.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_865895.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_123217.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_876736.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_552751.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_918878.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_865674.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_87

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_829184.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_786444.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_111647.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_810383.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_887572.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_901182.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_950593.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_121363.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_571150.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_788740.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_806710.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_59

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_593909.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_956657.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_760410.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_918071.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_958714.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_890694.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_703875.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_124349.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_754968.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_727911.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961614.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_59

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_125306.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_754408.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_816172.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_551307.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_712503.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_949637.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961956.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_554611.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_918226.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_961655.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_792949.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_58

../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_803044.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_721897.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_952801.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_595908.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_815820.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_127878.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_917900.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_898802.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_909618.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_550880.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_702200.fa
../Data/ecoli_info/evolutionary_analysis/rp75_enterobacteriacaea/OMAGroup_83

# Offline: run programs to get evolutionary rates!

This is the payoff but also a real treat, given that both programs are astonishingly opaque and difficult to work with. I separately considered both `hyphy` and `codeml` (from `paml`). 

For `hyphy`, this is a pretty nice resource: https://stevenweaver.github.io/hyphy-site/tutorials/current-release-tutorial/ but note that the actual terminal level commands end up looking like this:
`(echo 5; echo 1; echo 1; echo /ABSOLUTE/PATH/TO/ALIGNMENT.CDS.ALN; echo MG94CUSTOMCF3X4; echo 2; echo 012345; echo /ABSOLUTE/PATH/TO/TREE.NEWICK; echo 1) | hyphy | tail -n 11 | head -n 9 > output.txt`

All of the `echo`-ing is to avoid the interactive interface, and the `tail` to `head` to output is all just to get a simple and clean output file that I can parse. 


In [84]:
my_command = '(echo 5; echo 1; echo 1; '+\
'echo /Users/adamhockenberry/Projects/Growth_expression_translation/Data/OMAGroup_961791.cds.aln; '+\
'echo MG94CUSTOMCF3X4; echo 2; echo 012345; '+\
'echo /Users/adamhockenberry/Projects/Growth_expression_translation/Data/OMAGroup_961791.newick; '+\
'echo 1) | hyphy | tail -n 11 | head -n 9'
print(my_command)


(echo 5; echo 1; echo 1; echo /Users/adamhockenberry/Projects/Growth_expression_translation/Data/OMAGroup_961791.cds.aln; echo MG94CUSTOMCF3X4; echo 2; echo 012345; echo /Users/adamhockenberry/Projects/Growth_expression_translation/Data/OMAGroup_961791.newick; echo 1) | hyphy | tail -n 11 | head -n 9
(echo 5; echo 1; echo 1; echo hi; echo MG94CUSTOMCF3X4; echo 2; echo 012345; echo hi; echo 1) | hyphy | tail -n 11 | head -n 9


In [88]:
base_dir = '/Users/adamhockenberry/Projects/Growth_expression_translation/Data/ecoli_info/evolutionary_analysis/rp15_gammaproteobacteria/'

for cds_aln_file in glob.glob(base_dir + '*.cds.aln')[:1]:
    print(cds_aln_file)
    tree_file = cds_aln_file.replace('.cds.aln', '.newick')
    output_file = cds_aln_file.replace('.cds.aln', '.Basic.Results')
    my_command = '(echo 5; echo 1; echo 1; '+\
                'echo {}; '.format(cds_aln_file)+\
                'echo MG94CUSTOMCF3X4; echo 2; echo 012345; '+\
                'echo {}; '.format(tree_file)+\
                'echo 1) | hyphy | tail -n 11 | head -n 9'
    with open(output_file, 'w') as outfile:
        subprocess.call(my_command, stdout=outfile, shell=True)

/Users/adamhockenberry/Projects/Growth_expression_translation/Data/ecoli_info/evolutionary_analysis/rp15_gammaproteobacteria/OMAGroup_799197.cds.aln


# Read HYPHY output files and construct rates column

In [156]:
master_df = pd.read_csv('../Data/ecoli_info/current_ecoli_master_table.tsv', sep='\t')
master_df.shape
master_df.head()

Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean,iCUB,CAI,tAI,stAIcalc,aSD_binding,sec_struct,sec_struct_bound,Start_accessibility
0,b0001,thrL,189,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC,66,True,0.515152,1.244106,32.046035,0.617266,0.262286,0.258417,-2.45,,,
1,b0002,thrA,336,2799,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,2463,True,0.530654,1.034078,55.949262,0.353246,0.230564,0.216862,-5.42,-21.31,-12.81,-8.5
2,b0003,thrB,2800,3733,+,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,933,True,0.562701,0.994168,56.062386,0.357812,0.216292,0.21042,-6.51,-21.87,-14.05,-7.82
3,b0004,thrC,3733,5020,+,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,1287,True,0.528361,1.17675,53.052776,0.394675,0.231407,0.209784,-3.4,-24.44,-20.71,-3.73
4,b0005,yaaX,5233,5530,+,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,297,True,0.538721,0.837528,50.70553,0.374371,0.197715,0.208944,-6.51,-17.15,-7.38,-9.77


In [157]:
ecoli_seq_dict = {}     
base_dir = '../Data/ecoli_info/evolutionary_analysis/rp15_gammaproteobacteria'
for infile in glob.glob(base_dir+ '/*.cds.fna')[:]:
    oma_name = infile.split('/')[-1].split('.')[0]
    records = list(SeqIO.parse(infile, 'fasta'))
    for record in records:
        if record.id[:5] == 'ECOLI':
            ecoli_seq_dict[oma_name] = str(record.seq)
print(len(oma_to_ecoli.keys()), len(ecoli_seq_dict.keys()))

557 557


In [158]:
ecoli_oma_to_bnumber = {}
for name, seq in ecoli_seq_dict.items():
    temp = master_df[master_df['cds_seq'] == seq]
    if temp.shape[0] == 1:
        ecoli_oma_to_bnumber[name] = temp.iloc[0]['locus_tag']
print(len(ecoli_oma_to_bnumber.keys()))

556


In [159]:
names = []
rates = []
base_dir = '../Data/ecoli_info/evolutionary_analysis/rp15_gammaproteobacteria'
for infile in glob.glob(base_dir+ '/*.Basic.Results')[:]:
    oma_name = infile.split('/')[-1].split('.')[0]
    try:
        eco_name = ecoli_oma_to_bnumber[oma_name]
    except KeyError:
        continue
        
    df = pd.read_csv(infile, skiprows=3, sep='=', header=None, index_col=0)
    rate = float(df.loc['R'][1].strip(';'))
    rates.append(rate)
    names.append(eco_name)

In [162]:
res_df = pd.DataFrame(zip(names, rates))
res_df.columns = ['locus_tag', 'dnds_rp15_gammaproteobacteria']
print(res_df.shape)
res_df.head()

(204, 2)


Unnamed: 0,locus_tag,dnds_rp15_gammaproteobacteria
0,b3313,0.08248
1,b3177,0.04314
2,b3608,0.062806
3,b1215,0.080233
4,b3390,0.011858


In [164]:
df = master_df.merge(res_df, on='locus_tag')
print(df.shape)
df.head()

(204, 20)


Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean,iCUB,CAI,tAI,stAIcalc,aSD_binding,sec_struct,sec_struct_bound,Start_accessibility,dnds_rp15_gammaproteobacteria
0,b0023,rpsT,20814,21078,-,TTGGCTAATATCAAATCAGCTAAGAAGCGCGCCATTCAGTCTGAAA...,TCCATATAGAACACATTTGGGAGTTGGACC,264,True,0.465909,2.121792,40.736573,0.679842,0.292095,0.275094,-6.51,-19.96,-10.13,-9.83,0.138181
1,b0029,ispH,26276,27227,+,ATGCAGATCCTGTTGGCCAACCCGCGTGGTTTTTGTGCCGGGGTAG...,TGGAAATCGATCCGGCACTGGAGGCGTAAC,951,True,0.550999,1.524996,49.60524,0.454708,0.267957,0.235874,-8.62,-27.29,-18.15,-9.14,0.010913
2,b0032,carA,29650,30799,+,TTGATTAAGTCAGCGCTATTGGTTCTGGAAGACGGAACCCAGTTTC...,AAAGTGAGTGAATATTCTCTGGAGGGTGTT,1149,True,0.533507,1.240223,54.47554,0.425532,0.238314,0.219563,-9.51,-22.5,-13.18,-9.32,0.105528
3,b0048,folA,49822,50302,+,ATGATCAGTCTGATTGCGGCGTTAGCGGTAGATCGCGTTATCGGCA...,GGCGACAATTTTTTTTATCGGGAAATCTCA,480,True,0.527083,1.128435,51.023328,0.410548,0.239275,0.225726,-4.34,-21.1,-14.13,-6.97,0.044775
4,b0054,lptD,54754,57109,-,ATGAAAAAACGTATCCCCACTCTCCTGGCCACCATGATTGCCACCG...,TTACCGATGATGGAACAATAAAATCAACGT,2355,True,0.5138,1.337872,55.726845,0.449006,0.250086,0.226762,-0.01,-10.36,-5.91,-4.45,0.039936


In [180]:
stats.spearmanr(df['dnds_rp15_gammaproteobacteria'], df['CAI'])

SpearmanrResult(correlation=0.4115851891639751, pvalue=9.612722522632346e-10)

Unnamed: 0,locus_tag,gene,start_loc,stop_loc,strand,cds_seq,us_seq,cds_len,well_behaved,GC_percent_cds,roc_semppr_mean,iCUB,CAI,tAI,stAIcalc,aSD_binding,sec_struct,sec_struct_bound,Start_accessibility
0,b0001,thrL,189,255,+,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC,66,True,0.515152,1.244106,32.046035,0.617266,0.262286,0.258417,-2.45,,,
1,b0002,thrA,336,2799,+,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,2463,True,0.530654,1.034078,55.949262,0.353246,0.230564,0.216862,-5.42,-21.31,-12.81,-8.5
2,b0003,thrB,2800,3733,+,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,933,True,0.562701,0.994168,56.062386,0.357812,0.216292,0.21042,-6.51,-21.87,-14.05,-7.82
3,b0004,thrC,3733,5020,+,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,1287,True,0.528361,1.17675,53.052776,0.394675,0.231407,0.209784,-3.4,-24.44,-20.71,-3.73
4,b0005,yaaX,5233,5530,+,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,297,True,0.538721,0.837528,50.70553,0.374371,0.197715,0.208944,-6.51,-17.15,-7.38,-9.77


'ATGGAAAAACCAAGAGTACTCGTACTGACAGGGGCAGGAATTTCTGCGGAATCAGGTATTCGTACCTTTCGCGCCGCAGATGGCCTGTGGGAAGAACATCGGGTTGAAGATGTGGCAACGCCGGAAGGTTTCGATCGCGATCCTGAACTGGTGCAAGCGTTTTATAACGCCCGTCGTCGACAGCTGCAGCAGCCAGAAATTCAGCCTAACGCCGCGCATCTTGCGCTGGCTAAACTGCAAGACGCCCTCGGCGATCGCTTTTTGCTGGTGACGCAGAATATCGACAACCTGCATGAACGCGCAGGTAATACCAATGTGATTCATATGCATGGGGAACTGCTGAAAGTGCGTTGTTCACAAAGTGGTCAGGTTCTCGACTGGACAGGAGACGTTACCCCAGAAGATAAATGCCATTGTTGCCAGTTTCCGGCACCCTTGCGCCCGCACGTAGTGTGGTTTGGCGAAATGCCACTCGGCATGGATGAAATTTATATGGCGTTGTCGATGGCCGATATTTTCATTGCCATTGGCACATCCGGGCATGTTTATCCGGCGGCTGGGTTTGTTCACGAAGCGAAACTGCATGGCGCGCACACCGTGGAACTGAATCTTGAACCGAGTCAGGTTGGTAATGAATTTGCCGAGAAATATTACGGCCCGGCAAGCCAGGTGGTGCCTGAGTTTGTTGAAAAGTTGCTGAAGGGATTAAAAGCGGGAAGCATTGCCTGA'

In [199]:
np.mean(dn)/np.mean(ds)

0.06115782388512575

In [200]:
np.array(dn)/np.array(ds)

array([0.06117647, 0.06080526, 0.06106243, 0.06119163, 0.06117559,
       0.06117633, 0.06114106, 0.06110777, 0.06135693, 0.06115214,
       0.06117261, 0.0611777 , 0.06137931, 0.061163  , 0.06089744,
       0.06118158, 0.0611357 , 0.06123182, 0.06090652, 0.06119852,
       0.06115047, 0.06112469, 0.06119486, 0.06109944, 0.06112584,
       0.06111878, 0.06107515, 0.06129329, 0.06114692, 0.06121495,
       0.0611484 , 0.06114415, 0.06111011, 0.06114925, 0.06096794,
       0.06118757, 0.06094364, 0.06115978, 0.06118267, 0.06118472,
       0.06118721])

In [201]:
np.mean(np.array(dn)/np.array(ds))

0.06113901606478997