In [1]:
%matplotlib inline

In [2]:
import glob
from collections import Counter
from Bio import SeqIO
import pandas as pd
from Bio.Data import IUPACData 
import os.path; os.rename
from matplotlib import pyplot as plt

# Get relevant genomes and write FAA and FNA files

In [3]:
def genbank_to_faa_and_fna(genbank_filename, ignore_plasmid=True):
    """Write protein (FAA) and nucleotide (FNA) FASTA files for the CDSs of one GenBank file.

    The output paths are derived from ``genbank_filename`` by replacing
    ``gbf`` with ``faa``/``fna`` and ``GBFs`` with ``FAAs``/``FNAs``.
    Each written record is named ``<locus_tag>|<genome_name>``.

    A CDS is written only if:
      * its strand is +1 or -1 (anything else is reported and skipped),
      * its nucleotide length is > 90 and a multiple of 3,
      * after dropping the final codon / final residue, the protein contains
        only the 20 standard amino acids and the DNA only unambiguous bases.

    Parameters
    ----------
    genbank_filename : str
        Path to the GenBank file, expected to end in ``.PATRIC.gbf``.
    ignore_plasmid : bool, default True
        Skip every record whose description contains "plasmid" (case-insensitive).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a CDS that passes all filters has no ``locus_tag`` qualifier.
    """
    # Remove the literal suffix; str.rstrip() would strip a *set of characters*
    # and can eat into the genome name itself.
    suffix = '.PATRIC.gbf'
    genome_name = genbank_filename.split('/')[-1]
    if genome_name.endswith(suffix):
        genome_name = genome_name[:-len(suffix)]
    faa_filename = genbank_filename.replace('gbf', 'faa').replace('GBFs', 'FAAs')
    fna_filename = genbank_filename.replace('gbf', 'fna').replace('GBFs', 'FNAs')

    standard_amino_acids = set(IUPACData.protein_letters)
    standard_bases = set(IUPACData.unambiguous_dna_letters)

    # `with` guarantees both handles are closed even if parsing raises.
    with open(faa_filename, "w") as output_handle_faa, \
            open(fna_filename, "w") as output_handle_fna:
        for seq_record in SeqIO.parse(genbank_filename, "genbank"):
            if ignore_plasmid and 'plasmid' in seq_record.description.lower():
                continue
            for seq_feature in seq_record.features:
                if seq_feature.type != "CDS":
                    continue
                strand = seq_feature.strand
                beg = seq_feature.location.start
                end = seq_feature.location.end
                # NOTE(review): start:end spans the whole feature, so a multi-part
                # (joined) location would include the gaps between parts - confirm
                # this is acceptable for the genomes being processed.
                if strand == 1:
                    nt_seq = seq_record.seq[beg:end]
                elif strand == -1:
                    nt_seq = seq_record.seq[beg:end].reverse_complement()
                else:
                    # Previously execution fell through and reused the sequence
                    # of the preceding CDS (or hit a NameError on the first one).
                    print('catastrophic error')
                    continue
                if len(nt_seq) <= 90 or len(nt_seq) % 3 != 0:
                    continue
                # Drop the last residue / last codon (presumably the stop codon).
                aa_seq = nt_seq.translate()[:-1]
                nt_seq = nt_seq[:-3]
                # Reject internal stops ('*') and any ambiguous symbols.
                if not set(str(aa_seq)) <= standard_amino_acids:
                    continue
                if not set(str(nt_seq)) <= standard_bases:
                    continue
                header = ">{}|{}".format(seq_feature.qualifiers['locus_tag'][0], genome_name)
                output_handle_faa.write("{}\n{}\n".format(header, str(aa_seq)))
                output_handle_fna.write("{}\n{}\n".format(header, str(nt_seq)))
    return

In [4]:
# Load the PATRIC "representative" genome table and keep only Enterobacterales
# genomes that have more than 2000 PATRIC-annotated CDSs.
genome_grouping = 'representative'
taxon_csv = '../../Genome_database/Data/Dataframes/PATRIC_genome_complete_bacteria_{}_taxon.csv'.format(genome_grouping)
df_rep = pd.read_csv(taxon_csv, index_col='Genome ID')
in_order = df_rep['order'] == 'Enterobacterales'
enough_cds = df_rep['PATRIC CDS'] > 2000
df_rep = df_rep[in_order & enough_cds]
print(len(df_rep.index))

87


In [5]:
# Same filter as for the representative set, applied to the PATRIC "reference" genome table.
genome_grouping = 'reference'
taxon_csv = '../../Genome_database/Data/Dataframes/PATRIC_genome_complete_bacteria_{}_taxon.csv'.format(genome_grouping)
df_ref = pd.read_csv(taxon_csv, index_col='Genome ID')
in_order = df_ref['order'] == 'Enterobacterales'
enough_cds = df_ref['PATRIC CDS'] > 2000
df_ref = df_ref[in_order & enough_cds]
print(len(df_ref.index))

15


In [6]:
# Stack both tables; the two printed numbers are equal when no Genome ID occurs twice.
df = pd.concat([df_rep, df_ref])
total_rows = len(df.index)
unique_genome_ids = len(set(df.index))
print(total_rows, unique_genome_ids)

102 102


In [7]:
# Distinct species names among the rows that have a species assigned.
named_species = df.loc[df['species'].notnull(), 'species']
species_list = list(set(named_species))
print(len(species_list))

54


In [8]:
# Drop every genome whose GenBank file is not present on disk.
# NOTE(review): the path is built by formatting the index value; if 'Genome ID'
# was parsed as a float, IDs ending in 0 would lose that digit - confirm the dtype.
gbf_template = '../../Genome_database/Data/GBFs/{}.PATRIC.gbf'
rows_to_drop = [
    genome_id for genome_id in df.index
    if not os.path.isfile(gbf_template.format(genome_id))
]
df = df.drop(rows_to_drop)

In [9]:
# Thin the table down to (at most) one genome per species.
rows_to_drop = []

# E. coli and S. enterica: keep only the two hand-picked genomes listed below.
kept_model_genomes = [511145.12, 99287.12]
model_species = ['Escherichia coli', 'Salmonella enterica']
for genome_id in df.index:
    if genome_id in kept_model_genomes:
        continue
    if df.loc[genome_id]['species'] in model_species:
        rows_to_drop.append(genome_id)

# Every other species: if any of its genomes is in the reference table, keep the
# reference genome(s) and drop the rest; otherwise keep only the first genome.
# Rows without a species are never considered here, so they are all kept.
with_species = df[df['species'].notnull()]
for species in species_list:
    if species in model_species:
        continue
    hits = [genome_id for genome_id in with_species.index
            if df.loc[genome_id]['species'] == species]
    if any(hit in df_ref.index for hit in hits):
        rows_to_drop.extend(hit for hit in hits if hit not in df_ref.index)
    else:
        rows_to_drop.extend(hits[1:])
df = df.drop(rows_to_drop)

In [10]:
len(df.index)

62

In [None]:
# Convert every remaining genome that has a GenBank file on disk.
# (Slice df.index here to run on a small subset while testing.)
gbf_template = '../../Genome_database/Data/GBFs/{}.PATRIC.gbf'
for genome_id in df.index:
    print(genome_id)
    genbank_filename = gbf_template.format(genome_id)
    if not os.path.isfile(genbank_filename):
        continue
    genbank_to_faa_and_fna(genbank_filename)

# Offline: Run reciprocal blast on the amino acid files to find orthologs

This code is currently in Code/BASH in reciprocal_blast_all.sh
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />

# Push some files around and do some clean-up

In [None]:
# Move the freshly generated FAA / FNA / ortholog files into a per-analysis
# sub-directory of the directory they currently live in.
target_subdir = 'Order_Enterobacterales_Representative'
for pattern in ('../Data/FAAs/*.faa', '../Data/FNAs/*.fna', '../Data/Orthologs/*.u8'):
    for infile in glob.glob(pattern):
        source_dir, file_name = os.path.split(infile)
        target_dir = os.path.join(source_dir, target_subdir)
        # os.rename fails if the destination directory is missing, so create it first.
        os.makedirs(target_dir, exist_ok=True)
        os.rename(infile, os.path.join(target_dir, file_name))

In [None]:
# Sanity check: number of files of each kind now in the sub-directory.
for moved_pattern in ('../Data/FAAs/Order_Enterobacterales_Representative/*.faa',
                      '../Data/FNAs/Order_Enterobacterales_Representative/*.fna',
                      '../Data/Orthologs/Order_Enterobacterales_Representative/*.u8'):
    print(len(glob.glob(moved_pattern)))

## Parse through all ortholog files

In [11]:
# Genome that every ortholog comparison is anchored on.
base_genome = '511145.12'
# Directory holding the reciprocal-BLAST (.u8) result files. Switch to
# '../Data/Orthologs/Order_Enterobacterales_Representative/' for the representative-only set.
directory_to_read = '../Data/Orthologs/Order_Enterobacterales_Mixed/'
u8_files = glob.glob(directory_to_read + '*.u8')
print(len(u8_files))

120


In [12]:
# For every BLAST result file involving the base genome, record the best hit of
# each base-genome gene. ecoli_dict maps base-genome gene id -> list of best-hit
# ids, one entry appended per result file in which the gene appears.
ecoli_dict = {}
comparison_genome_names = []
for infile in glob.glob('{}*{}*.u8'.format(directory_to_read, base_genome)):
    file_name = infile.split('/')[-1]
    comparison_genome = file_name.replace(base_genome, '').replace('_vs_', '').replace('.u8', '')
    print('##########')
    print(base_genome, comparison_genome)
    comparison_genome_names.append(comparison_genome)

    # Whichever genome is named before '_vs_' fills column 0 of the table.
    base_pos = infile.find(base_genome)
    separator_pos = infile.find('_vs_')
    if base_pos == separator_pos:
        print('Error, investigate')
        break
    if base_pos < separator_pos:
        base_entry, comparison_entry = 0, 1
    else:
        base_entry, comparison_entry = 1, 0

    # Collect all (hit id, column-2 score) pairs per base-genome gene.
    temp_dict = {}
    with open(infile) as blastdata:
        for line in blastdata:
            split_line = line.split('\t')
            hit = (split_line[comparison_entry], float(split_line[2]))
            temp_dict.setdefault(split_line[base_entry], []).append(hit)

    # Keep the highest-scoring hit; the stable sort means the last-listed hit wins a tie.
    for base_gene, hits in temp_dict.items():
        best_hit = sorted(hits, key=lambda pair: pair[1])[-1][0]
        ecoli_dict.setdefault(base_gene, []).append(best_hit)

##########
511145.12 1006551.4
##########
511145.12 1028307.3
##########
511145.12 1076550.3
##########
511145.12 1125630.4
##########
511145.12 1157951.4
##########
511145.12 1166016.3
##########
511145.12 1235834.6
##########
511145.12 1239307.3
##########
511145.12 1249634.3
##########
511145.12 1286170.3
##########
511145.12 1333848.3
##########
511145.12 1441930.4
##########
511145.12 1484157.3
##########
511145.12 158822.6
##########
511145.12 158822.7
##########
511145.12 198214.7
##########
511145.12 198628.6
##########
511145.12 214092.21
##########
511145.12 215689.3
##########
511145.12 218491.5
##########
511145.12 218493.5
##########
511145.12 243265.5
##########
511145.12 290338.8
##########
511145.12 290339.8
##########
511145.12 291112.3
##########
511145.12 300267.13
##########
511145.12 300268.11
##########
511145.12 300269.12
##########
511145.12 343509.12
##########
511145.12 393305.7
##########
511145.12 399741.7
##########
511145.12 406818.4
##########
511145.12 4

In [13]:
print(len(comparison_genome_names), len(set(comparison_genome_names)))

120 60


In [14]:
# How many best hits each comparison genome contributed
# (the genome name is the part of the hit id after the last '|').
listy = [record.split('|')[-1]
         for best_hits in ecoli_dict.values()
         for record in best_hits]
hits_per_genome = Counter(listy)
print(hits_per_genome)
print(len(hits_per_genome))

Counter({'300269.12': 7661, '198214.7': 7527, '300268.11': 7297, '300267.13': 6917, '585054.5': 6638, '1333848.3': 6277, '290338.8': 6030, '637910.3': 5986, '99287.12': 5896, '218493.5': 5770, '716541.4': 5548, '693444.3': 5541, '701347.4': 5527, '640513.3': 5412, '1006551.4': 5374, '1125630.4': 5300, '1028307.3': 5287, '640131.3': 5251, '1286170.3': 5248, '1235834.6': 5102, '158822.6': 4694, '158822.7': 4627, '290339.8': 4612, '693216.3': 4593, '630626.3': 4148, '399741.7': 3523, '1484157.3': 3515, '768490.3': 3407, '768492.3': 3405, '634500.5': 3364, '1249634.3': 3339, '1441930.4': 3255, '592316.4': 3252, '393305.7': 3219, '1076550.3': 3219, '745277.3': 3193, '741091.4': 3191, '553.3': 3174, '561230.3': 3107, '218491.5': 3093, '1166016.3': 3080, '561231.5': 3076, '502801.6': 3052, '214092.21': 3046, '198628.6': 2910, '561229.3': 2892, '465817.9': 2892, '215689.3': 2863, '665029.3': 2841, '634499.3': 2835, '579405.3': 2811, '1239307.3': 2703, '498217.4': 2694, '634503.3': 2605, '34350

## Use the above information to delete genomes with an unusually low number of orthologs before proceeding

You MUST go back up and re-run the ecoli_dict code in order to proceed after you delete any genomes

In [None]:
# Genomes to remove entirely: their FNA/FAA files and every ortholog table
# in which they are one of the two compared genomes.
bad_apples = ['572265.5']
for bad_apple in bad_apples:
    for sequence_file in ('../Data/FNAs/{}.PATRIC.fna'.format(bad_apple),
                          '../Data/FAAs/{}.PATRIC.faa'.format(bad_apple)):
        # Guarded so the cell can be re-run after the files are already gone.
        if os.path.exists(sequence_file):
            os.remove(sequence_file)
    for trash_file in glob.glob('../Data/Orthologs/*{}*.u8'.format(bad_apple)):
        # The glob is a substring match and would also hit other genome IDs that
        # merely contain this one, so require an exact match on either side.
        # Assumes files are named '<genomeA>_vs_<genomeB>.u8' - TODO confirm.
        genome_pair = os.path.basename(trash_file)[:-len('.u8')].split('_vs_')
        if bad_apple in genome_pair:
            os.remove(trash_file)

## Figure out how many orthologs we'll have

In [15]:
# Count base-genome genes usable as ortholog families: every best hit must have
# been recorded exactly twice, and enough distinct hits must be present.
min_hits = int(0.70 * len(set(comparison_genome_names)))  # Tunable parameter HERE
looking_good = 0
for best_hits in ecoli_dict.values():
    temp_counter = Counter(best_hits)
    if set(temp_counter.values()) != {2}:
        continue
    if len(temp_counter) >= min_hits:
        looking_good += 1
print(looking_good)

1394


## Read all amino acid sequences into a giant dictionary

In [16]:
# Map every FASTA description line found in the FAA files to its protein sequence string.
aa_sequence_dict = {}
for faa_path in glob.glob('../Data/FAAs/*.faa'):
    print(faa_path)
    aa_sequence_dict.update(
        (record.description, str(record.seq))
        for record in SeqIO.parse(faa_path, 'fasta')
    )

../Data/FAAs/1006551.4.PATRIC.faa
../Data/FAAs/1028307.3.PATRIC.faa
../Data/FAAs/1076550.3.PATRIC.faa
../Data/FAAs/1125630.4.PATRIC.faa
../Data/FAAs/1157951.4.PATRIC.faa
../Data/FAAs/1166016.3.PATRIC.faa
../Data/FAAs/1235834.6.PATRIC.faa
../Data/FAAs/1239307.3.PATRIC.faa
../Data/FAAs/1249634.3.PATRIC.faa
../Data/FAAs/1286170.3.PATRIC.faa
../Data/FAAs/1333848.3.PATRIC.faa
../Data/FAAs/1441930.4.PATRIC.faa
../Data/FAAs/1484157.3.PATRIC.faa
../Data/FAAs/158822.6.PATRIC.faa
../Data/FAAs/158822.7.PATRIC.faa
../Data/FAAs/198214.7.PATRIC.faa
../Data/FAAs/198628.6.PATRIC.faa
../Data/FAAs/214092.21.PATRIC.faa
../Data/FAAs/215689.3.PATRIC.faa
../Data/FAAs/218491.5.PATRIC.faa
../Data/FAAs/218493.5.PATRIC.faa
../Data/FAAs/243265.5.PATRIC.faa
../Data/FAAs/290338.8.PATRIC.faa
../Data/FAAs/290339.8.PATRIC.faa
../Data/FAAs/291112.3.PATRIC.faa
../Data/FAAs/300267.13.PATRIC.faa
../Data/FAAs/300268.11.PATRIC.faa
../Data/FAAs/300269.12.PATRIC.faa
../Data/FAAs/343509.12.PATRIC.faa
../Data/FAAs/393305.7.PAT

## And create new .fasta files that can be used by MUSCLE

In [None]:
# Write one unaligned FASTA per accepted ortholog family: the base-genome protein
# first, then each distinct best hit. The acceptance rule is the same as in the
# counting cell (every hit seen exactly twice, enough distinct hits).
min_hits = int(0.70 * len(set(comparison_genome_names)))
for base_gene, best_hits in ecoli_dict.items():
    temp_counter = Counter(best_hits)
    if set(temp_counter.values()) != {2} or len(temp_counter) < min_hits:
        continue
    fasta_path = '../Data/Orthologs/Order_Enterobacterales_Mixed/fastas/{}.fasta'.format(base_gene.split('|')[0].strip())
    with open(fasta_path, 'w') as outfile:
        for gene_id in [base_gene] + list(temp_counter.keys()):
            outfile.write('>{}\n{}\n'.format(gene_id, aa_sequence_dict[gene_id]))

# Offline: Run MUSCLE on the amino acid ortholog files to get .mfastas

Look into MAFFT

<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />


# Read and parse (some of) the .mfastas to write concatenated, aligned aa seqs for use in making an amino acid tree for overall topology purposes

This is kind of limited by the power of my computer right now. Would love to submit them all to RAxML but it'll take too long

In [None]:
all_files = glob.glob('../Data/FAAs/*.faa')
len(all_files)

In [None]:
# Select alignments that contain every genome and in which gap characters make
# up at most 1% of all alignment cells (despite the name, a few gaps are allowed).
no_indels = []
for mfasta_path in glob.glob('../Data/Orthologs/Order_Enterobacterales_Mixed/mfastas/*.mfasta'):
    records = list(SeqIO.parse(mfasta_path, 'fasta'))
    gap_count = sum(str(record.seq).count('-') for record in records)
    max_gaps = 0.01*len(records)*len(records[0].seq)
    if gap_count <= max_gaps and len(records) == len(all_files):
        no_indels.append(mfasta_path)
print(len(no_indels))

In [None]:
# Concatenate, per genome, the aligned protein sequences of all selected alignments.
# (To use every alignment instead, iterate over the mfastas glob rather than no_indels.)
full_sequence_dict = {}
for mfasta_path in no_indels:
    print(mfasta_path)
    for record in SeqIO.parse(mfasta_path, 'fasta'):
        genome_key = record.description.split('|')[-1].strip()
        full_sequence_dict[genome_key] = full_sequence_dict.get(genome_key, '') + str(record.seq)

In [None]:
len(full_sequence_dict.keys())

In [None]:
# Write the concatenated protein alignment, one FASTA record per genome.
with open('../Data/Tree_files/Order_Enterobacterales_Mixed/full_aligned_aas_108_seqs.mfasta', 'w') as outfile:
    outfile.writelines(
        '>{}\n{}\n'.format(genome_key, concatenated)
        for genome_key, concatenated in full_sequence_dict.items()
    )

# Offline: Run RAxML on aligned and concatenated aa seqs to make a tree

<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />

# To run phyloFit to get neutral rates, this will remove branch lengths from tree topology

In [None]:
from Bio import Phylo

In [None]:
# Re-write the RAxML best tree in "plain" newick form (per the section header,
# this is what removes the branch lengths). Earlier runs did the same for
# '../Data/Tree_files/RAxML_bestTree.FIRST100CDS' and
# '../Data/Tree_files/Order_Enterobacterales_Reference/RAxML_bestTree.ReferenceSet232seqs'.
raxml_tree_path = '../Data/Tree_files/Order_Enterobacterales_Mixed/RAxML_bestTree.MixedSet108seqs'
flat_tree_path = '../Data/Tree_files/Order_Enterobacterales_Mixed/RAxML_bestTree.MixedSet108seqsFLAT'
tree = Phylo.read(raxml_tree_path, 'newick')
Phylo.write(tree, flat_tree_path, 'newick', plain=True);

# Next up, take aligned .mfasta files in amino acid form and write their equivalents in nucleotide sequence form

### I wrote this pretty quickly and it's important, so go back and test this to make sure the behavior is as expected
### In particular, see how hard it would be to add S, L and R 4-fold redundant codons into this

In [None]:
# nt_seq_dict[genome_name][record_id] -> nucleotide SeqRecord, for every FNA file.
nt_seq_dict = {}
fna_suffix = '.PATRIC.fna'
for genome_file in glob.glob('../Data/FNAs/*.PATRIC.fna'):
    # Slice the literal suffix off (the glob guarantees it is present).
    # str.strip('.PATRIC.fna') would instead strip any of those *characters*
    # from both ends of the name.
    genome_name = genome_file.split('/')[-1][:-len(fna_suffix)]
    nt_seq_dict[genome_name] = {
        record.id: record for record in SeqIO.parse(genome_file, 'fasta')
    }

In [None]:
len(list(nt_seq_dict.keys()))

In [None]:
# Back-translate every protein alignment into a codon alignment (written to the
# parallel 'mfastas_nt' path) and, for alignments that contain every genome,
# collect the third codon position of columns where all genomes share the same
# amino acid and that amino acid is P, T, A, V or G.
# redundant_concat_dict maps genome name -> concatenated third-position bases.
redundant_concat_dict = {}
for i in nt_seq_dict.keys():
    redundant_concat_dict[i] = ''
    
for infile in glob.glob('../Data/Orthologs/Order_Enterobacterales_Mixed/mfastas/*.mfasta')[:]:
    print(infile)
    aln_pro_seq_list = list(SeqIO.parse(infile, format='fasta'))
    # Genomes still unaccounted for in this alignment; names are removed as they are seen.
    all_genomes = list(nt_seq_dict.keys())
    with open(infile.replace('mfastas', 'mfastas_nt'), 'w') as outfile:
        # genome name -> list of codons ('---' for gap columns), one per alignment column
        codon_alns = {}
        for i in aln_pro_seq_list:
            # NOTE(review): no .strip() here, unlike the other cells that parse
            # the genome name out of the description - confirm this is intentional.
            genome_name = i.description.split('|')[-1]
            seq_str = str(nt_seq_dict[genome_name][i.id].seq)
            codon_list = [seq_str[nt:nt+3] for nt in range(0, len(seq_str), 3)]
            aa_str = str(nt_seq_dict[genome_name][i.id].seq.translate())
            codon_list_new = []
            # Number of gap columns emitted so far, i.e. the offset between the
            # alignment column and the position in the ungapped sequence.
            minus = 0
            # Trailing gaps are stripped here and re-added as '---' below.
            for pos,aa in enumerate(str(i.seq).rstrip('-')):
                if aa_str[pos-minus] == aa:
                    codon_list_new.append(codon_list[pos-minus])
                else:
                    # NOTE(review): any mismatch is treated as a gap column, not
                    # only '-' characters; a genuine disagreement between the
                    # aligned protein and the translated CDS is silently hidden.
                    codon_list_new.append('---')
                    minus += 1
            for terminal in range(len(str(i.seq)) - len(str(i.seq).rstrip('-'))):
                codon_list_new.append('---')
            new_nt_seq = ''.join(codon_list_new)
            outfile.write('>{}\n{}\n'.format(genome_name, new_nt_seq))
            # Raises ValueError if the same genome appears twice in one alignment.
            all_genomes.remove(genome_name)
            codon_alns[genome_name] = codon_list_new
        
        # Genomes absent from this alignment get all-gap rows. The row length is
        # taken from the last sequence processed above (new_nt_seq / codon_list_new).
        for genome_name in all_genomes:
            outfile.write('>{}\n{}\n'.format(genome_name, ''.join(['-' for i in range(len(new_nt_seq))])))
            codon_alns[genome_name] = ['---' for i in range(len(codon_list_new))]
            
        # Only alignments with at least as many sequences as there are genomes
        # contribute to the concatenated third-position alignment.
        if len(aln_pro_seq_list) >= 1 * len(list(nt_seq_dict.keys())): 
#             print('testing this one')
            hits = 0
            total_len = len(aln_pro_seq_list[0].seq)
            for i in range(total_len):
                # Residues of every sequence at alignment column i.
                idents = []
                for aa_align in aln_pro_seq_list:
                    idents.append(aa_align.seq[i])
#                 print(idents)
                # Column must be identical across all sequences and be one of P/T/A/V/G.
                if len(set(idents)) == 1 and idents[0] in ['P', 'T', 'A', 'V', 'G']:
#                 if idents[0] in ['P', 'T', 'A', 'V', 'G']:
                    for genome_name, codon_list in codon_alns.items():
                        # Last character of the codon = third codon position.
                        redundant_concat_dict[genome_name] += codon_list[i][-1]
#                         print(codon_list[i])
                    hits += 1
#             print(hits, total_len)

### Making sure they're all the same length

In [None]:
# Spot-check the first 50 codon alignments: print the number of rows and the set
# of row lengths (a single length means all rows line up).
for infile in glob.glob('../Data/Orthologs/Order_Enterobacterales_Mixed/mfastas_nt/*.mfasta')[:50]:
    print(infile)
    aln_nt_seq_list = list(SeqIO.parse(infile, format='fasta'))
    lens = {len(nt_record.seq) for nt_record in aln_nt_seq_list}
    print(len(aln_nt_seq_list), lens)

In [None]:
# Length of the concatenated third-position string of every genome.
for third_positions in redundant_concat_dict.values():
    print(len(third_positions))

In [None]:
# Write the concatenated third-position alignment, one FASTA record per genome.
with open('../Data/Tree_files/Order_Enterobacterales_Mixed/redundant_nts_align.fasta', 'w') as outfile:
    outfile.writelines(
        '>{}\n{}\n'.format(genome_key, third_positions)
        for genome_key, third_positions in redundant_concat_dict.items()
    )

# Test that this alignment worked

In [None]:
# for infile in glob.glob('../Data/Orthologs/Order_Enterobacterales_Mixed/mfastas/*.mfasta')[:]:
#     aln_pro_seq_list = list(SeqIO.parse(infile, format='fasta'))
#     aln_nt_seq_list = list(SeqIO.parse(infile.replace('mfastas', 'mfastas_nt'), format='fasta'))
#     aln_new_nt_seq_list = [i.seq.split('---') for i in aln_nt_seq_list]
#     aln_new_pro_seq_list = [i.seq.split('-') for i in aln_pro_seq_list]
#     for i, pro_seqs in enumerate(aln_new_pro_seq_list):
#         for j, segment in enumerate(pro_seqs):
#             assert segment == aln_new_nt_seq_list[i][j].translate()

# Offline: Run phyloFit to get a neutral model
<br/><br/><br/><br/><br/><br/><br/><br/><br/><br/><br/>

# Offline: Run phylogenetic models on nucleotide data (phyloP, HyPhy,etc)

# Scratch

In [None]:

#         print('#########')
#         print(i.seq[:15])
#         print(aa_str[:15])
#         print(codon_list_new[:15])
#         print(codon_list[:15])

#         print(i.seq)
#         print(nt_seq_dict[genome_name][i.id].seq)
#         print(nt_seq_dict[genome_name][i.id].seq.translate())

#     aln_nt_seq_list = []
#     for i in aln_pro_seq_list:
#         nt_seq_of_interest = nt_seq_dict[i.description.split('|')[-1].strip()][i.id]
#         nt_seq_of_interest.seq.alphabet = IUPAC.IUPACUnambiguousDNA()
#         aln_nt_seq_list.append(nt_seq_of_interest)

In [None]:
>>> from Bio.Seq import Seq
>>> from Bio.Alphabet import IUPAC
>>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
>>> coding_dna.translate()

In [None]:
coding_dna

In [None]:
nt_seq_dict[genome][i.id].seq.translate()

# testing feature.extract and why it's so terrible

In [None]:
# Compare SeqFeature.extract() with manual start:end slicing for every CDS of
# the base genome. The two lists are filled in lock-step so they can be compared
# element by element afterwards.
biopython_way = []
my_way = []
genbank_filename = ('../Data/GBFs/511145.12.PATRIC.gbf')
ignore_plasmid=True
for seq_record in SeqIO.parse(genbank_filename, "genbank"):
    if ignore_plasmid and 'plasmid' in seq_record.description.lower():
        continue
    for seq_feature in seq_record.features:
        if seq_feature.type != "CDS":
            continue
        strand = seq_feature.strand
        beg = seq_feature.location.start
        end = seq_feature.location.end
        if strand == 1:
            my_seq = seq_record.seq[beg:end]
        elif strand == -1:
            my_seq = seq_record.seq[beg:end].reverse_complement()
        else:
            # Skip the feature in BOTH lists. Previously the manual sequence of
            # the preceding CDS was appended again (or a NameError was raised on
            # the first CDS), which would corrupt the comparison.
            print('catastrophic error')
            continue
        biopython_way.append(seq_feature.extract(seq_record).seq)
        my_way.append(my_seq)
        




            
            

In [None]:
len(biopython_way), len(my_way)

In [None]:
biopython_way == my_way

In [None]:
my_way[0][:-3]

In [None]:
# Load the first record of every available GenBank file, then compare all pairs
# of genomes and report any two whose first-record sequences are identical.
genomes_dict = {}
for genome_id in df.index:
    gbf_path = '../Data/GBFs/{}.PATRIC.gbf'.format(genome_id)
    if os.path.exists(gbf_path):
        genomes_dict[genome_id] = list(SeqIO.parse(gbf_path, 'genbank'))[0]
print('done loading into memory')
listy = list(genomes_dict.items())
for position, (first_id, first_record) in enumerate(listy):
    print(first_id)
    for second_id, second_record in listy[position+1:]:
        if str(first_record.seq) == str(second_record.seq):
            print('well fuck me sideways', first_id, second_id)

In [31]:
# Load the full PATRIC complete-bacteria table and keep the first row of every
# duplicated Genome ID; the row count is printed before and after.
all_genomes_csv = '../../Genome_database/Data/Dataframes/PATRIC_genome_complete_bacteria.csv'
df_all = pd.read_csv(all_genomes_csv, index_col='Genome ID')
print(len(df_all.index))
is_repeat = df_all.index.duplicated(keep='first')
df_all = df_all[~is_repeat]
print(len(df_all.index))

8178
8175


In [32]:
# Names of all genomes whose 'Genome Name' contains "Escherichia coli".
ecs = [
    genome_label for genome_label in df_all['Genome Name']
    if genome_label.find('Escherichia coli') != -1
]

In [17]:
# Re-load both PATRIC tables, this time keeping only E. coli genomes with more
# than 2000 PATRIC-annotated CDSs.
# NOTE: this rebinds df_rep, df_ref and df, which the cells near the top of the
# notebook also define - re-run those cells before re-running the main analysis.
genome_grouping = 'representative'
taxon_csv = '../../Genome_database/Data/Dataframes/PATRIC_genome_complete_bacteria_{}_taxon.csv'.format(genome_grouping)
df_rep = pd.read_csv(taxon_csv, index_col='Genome ID')
df_rep = df_rep[(df_rep['species'] == 'Escherichia coli') & (df_rep['PATRIC CDS'] > 2000)]
print(len(df_rep.index))

genome_grouping = 'reference'
taxon_csv = '../../Genome_database/Data/Dataframes/PATRIC_genome_complete_bacteria_{}_taxon.csv'.format(genome_grouping)
df_ref = pd.read_csv(taxon_csv, index_col='Genome ID')
df_ref = df_ref[(df_ref['species'] == 'Escherichia coli') & (df_ref['PATRIC CDS'] > 2000)]
print(len(df_ref.index))

df = pd.concat([df_rep, df_ref])
print(len(df.index), len(set(df.index)))

0
6
6 6


In [16]:
# Print the Genome ID of every reference genome whose name contains "Escherichia coli".
for genome_id, genome_label in zip(df_ref.index, df_ref['Genome Name']):
    if genome_label.find('Escherichia coli') == -1:
        continue
    print(genome_id)

1133852.3
386585.9
511145.12
585056.7
585057.6
685038.3
