In [79]:
import pandas as pd
import re
import random
import requests
from time import sleep
import itertools
import io
from Bio.Nexus import Nexus
from Bio import SeqIO
from tqdm.notebook import tqdm
from pathlib import Path

In [4]:
annotation_data = pd.read_csv('recognized_table.csv', header=None)

In [7]:
annotation_data.shape

(77, 8)

In [21]:
annotation_data[0].value_counts()

Panaspis wahlbergi           15
Panaspis cabindae             8
Panaspis togoensis            6
Panaspis maculicollis         6
Panaspis sp. Mozambique 1     6
Panaspis sp. Katanga 2        5
Panaspis sp. Mozambique 4     5
Panaspis sp. Mozambique 3     4
Panaspis sp. Tanzania 2       2
Panaspis sp. Mozambique 2     2
Panaspis sp. Katanga 1        2
Panaspis sp. Malawi           2
Panaspis sp. Limpopo          2
Panaspis sp. Namibia          2
Panaspis sp. Tanzania 1       2
Tiliqua rugosa                1
Panaspis breviceps            1
Panaspis sp. Ethiopia         1
Broadleysaurus major          1
Xantusia vigilis              1
Plestiodon japonicus          1
Plestiodon inexpectatus       1
Panaspis sp. Mozambique 5     1
Name: 0, dtype: int64

In [8]:
species_index = dict()
for species in annotation_data[0].unique():
    species_index[species] = annotation_data.index[annotation_data[0] == species].tolist()

In [13]:
subset = list(itertools.chain(*(random.sample(value, max(len(value) // 3, 1)) for value in species_index.values())))

In [16]:
annotation_data_subset = annotation_data.loc[subset]

In [17]:
annotation_data_subset.shape

(31, 8)

In [18]:
annotation_data_subset[0].value_counts()

Panaspis wahlbergi           5
Panaspis cabindae            2
Panaspis maculicollis        2
Panaspis sp. Mozambique 1    2
Panaspis togoensis           2
Panaspis sp. Mozambique 3    1
Panaspis sp. Mozambique 4    1
Panaspis sp. Namibia         1
Panaspis sp. Tanzania 1      1
Panaspis breviceps           1
Panaspis sp. Ethiopia        1
Tiliqua rugosa               1
Panaspis sp. Limpopo         1
Panaspis sp. Malawi          1
Broadleysaurus major         1
Panaspis sp. Katanga 1       1
Panaspis sp. Mozambique 2    1
Xantusia vigilis             1
Plestiodon japonicus         1
Panaspis sp. Tanzania 2      1
Panaspis sp. Katanga 2       1
Plestiodon inexpectatus      1
Panaspis sp. Mozambique 5    1
Name: 0, dtype: int64

In [65]:
def retrieve_sequence_by_id(nucleotide_db_id):
    sleep(0.34)
    r = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
        f'db=nuccore&id={nucleotide_db_id}&rettype=gb'
    )
    r.raise_for_status()
    result_seqs = list(SeqIO.parse(io.StringIO(r.text), 'gb'))
    assert len(result_seqs) == 1
    return result_seqs[0]

In [70]:
sequences_data = dict()
for sample_index, sample_row in tqdm(annotation_data_subset.iterrows()):
    sample_data = dict()
    for key, db_id in zip(['16S', 'cyt b', 'PDC', 'RAG1'], sample_row[4:]):
        if db_id != '—':
            sample_data[key] = retrieve_sequence_by_id(re.sub(' +', '', db_id))
    sequences_data[sample_index] = sample_data

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [75]:
for gene in ['16S', 'cyt b', 'PDC', 'RAG1']:
    sequence_subset = {key: value[gene] for key, value in sequences_data.items() if gene in value.keys()}
    result_fasta_data = list()
    for key, value in sequence_subset.items():
        value.id = str(key)
        result_fasta_data.append(value)
    with open(f"{re.sub(' ', '_', gene)}.fasta", 'w') as output_d:
        SeqIO.write(result_fasta_data, output_d, 'fasta')

used [clustal omega](https://www.ebi.ac.uk/Tools/msa/clustalo/) for alignment in nexus format

merge .nex files for MrBayes

In [80]:
nexi = [(str(file), Nexus.Nexus(file)) for file in Path('.').glob('*.nex')]
combined = Nexus.combine(nexi)
with open('merged.nex', 'w') as output_d:
    combined.write_nexus_data(filename=output_d)

![](mr_bayes_params.png)

In [92]:
final_tree = """
    /- 3 (1)
   |                                                                               
   | 7 (18)
   |                                                                               
   |            /- 48 (2)
   |          /-+                                                                  
   |          | \-- 49 (26)
   |         /+                                                                    
   |         |\--- 44 (30)
   |         |                                                                     
   |         |/--- 16 (3)
   |         |+                                                                    
   |         |\-- 22 (5)
   |         |                                                                     
   |        /+  /- 62 (6)
   |        ||  |                                                                  
   |        ||  |- 65 (10)
   |        ||/-+                                                                  
   |        ||| |/- 69 (25)
   |        ||| \+                                                                 
   |        |\+  \ 71 (27)
   |       /+ |                                                                    
   +       || \--- 60 (7)
   |       ||                                                                      
   |       || /-- 40 (4)
   |       || |                                                                    
   |      /+| |-- 37 (28)
   |      ||\-+                                                                    
   |      ||  \ 26 (31)
   |      ||                                                                       
   |      |\--- 33 (29)
   |      |                                                                        
   |      |    /--- 45 (8)
   |    /-+  /-+                                                                   
   |    | |  | \-- 23 (9)
   |    | |  |                                                                     
   |    | |/-+   /- 13 (11)
   |    | || |  /+                                                                 
   |    | || |  |\- 11 (12)
   |    | || \--+                                                                  
   |    | \+    | / 29 (23)
   |    |  |    \-+                                                                
   |    |  |      \ 30 (24)
   |    |  |                                                                       
   \----+  \------- 15 (22)
        |                                                                          
        |                       /----------------------- 72 (13)
        |                       |                                                  
        |                    /--+   /----- 74 (14)
        |                    |  \---+                                              
        |                /---+      \--- 75 (15)
        |                |   |                                                     
        |   /------------+   \--------------------------------------------- 73 (16)
        |   |            |                                                         
        |   |            \-------- 76 (17)
        \---+                                                                      
            | /------- 0 (19)
            | |                                                                    
            \-+   /- 53 (20)
              \---+                                                                
                  \-- 51 (21)
"""

In [94]:
for index, species in annotation_data_subset[0].items():
    final_tree = re.sub(rf" {str(index)} ", f" {species} ", final_tree)

In [103]:
with open('mr_bayes_tree.txt', 'w') as output_d:
    output_d.write(final_tree)

In [95]:
print(final_tree)


    /- Panaspis cabindae (1)
   |                                                                               
   | Panaspis cabindae (18)
   |                                                                               
   |            /- Panaspis sp. Tanzania 1 (2)
   |          /-+                                                                  
   |          | \-- Panaspis sp. Tanzania 2 (26)
   |         /+                                                                    
   |         |\--- Panaspis sp. Mozambique 5 (30)
   |         |                                                                     
   |         |/--- Panaspis sp. Katanga 1 (3)
   |         |+                                                                    
   |         |\-- Panaspis sp. Katanga 2 (5)
   |         |                                                                     
   |        /+  /- Panaspis wahlbergi (6)
   |        ||  |                                                          

In [108]:
phy_ml_tree = """
+-------------------------------------------------------------------------------------------------n73
 |
 |                                                         +---------------------------------------------------------n72
 |                                                         |
 |                                                         |     +-------n76
 |                                                         |-----+
 |                                                         |     +----------------------------n74
 |                                                         |
 |                                                         |                          +-------n0
 |                                                         |                          |
 |                                                         |                      +---+
 |                                                         |                      |   |  +n53
 +---------------------------------------------------------+                      |   +--+
                                                           |                      |      +n51
                                                           |                      |
                                                           |                      |           +----n15
                                                           |                      |           |
                                                           |                      |           |        +n30
                                                           |                      |           |      +-+
                                                           |                      |        +--+      | +n29
                                                           +----------------------+        |  |  +---+
                                                                                  |        |  |  |   |  +-n11
                                                                                  |        |  |  |   +--+
                                                                                  |        |  +--+      +-n13
                                                                                  |        |     |
                                                                                  |        |     |  +--n45
                                                                                  |        |     +--+
                                                                                  |        |        +-n23
                                                                                  |        |
                                                                                  |        | +-----n33
                                                                                  +--------+ |
                                                                                           |++
                                                                                           |||      +n7
                                                                                           ||+------+
                                                                                           ||       +n3
                                                                                           ||
                                                                                           ||   +-n49
                                                                                           || +-+
                                                                                           || | +n48
                                                                                           || |
                                                                                           ++ | +--n44
                                                                                            | |++
                                                                                            | ||+-----n16
                                                                                            | |+
                                                                                            | |+---n22
                                                                                            | |
                                                                                            | |  +n26
                                                                                            | |  |
                                                                                            | |--+------n40
                                                                                            +-+  |
                                                                                              |  +--n37
                                                                                              |
                                                                                              |
                                                                                              | +n71
                                                                                              |++
                                                                                              ||+--n69
                                                                                              ||
                                                                                              ||-----n60
                                                                                              ++
                                                                                               |-n62
                                                                                               |
                                                                                               +n65
"""

In [114]:
for index, species in annotation_data_subset[0].items():
    phy_ml_tree = re.sub(rf"n{str(index)}\n", f"{species} ", phy_ml_tree)

In [115]:
with open('phy_ml_tree.txt', 'w') as output_d:
    output_d.write(phy_ml_tree)

In [116]:
print(phy_ml_tree)


+-------------------------------------------------------------------------------------------------Xantusia vigilis  |
 |                                                         +---------------------------------------------------------Broadleysaurus major  |                                                         |
 |                                                         |     +-------Tiliqua rugosa  |                                                         |-----+
 |                                                         |     +----------------------------Plestiodon inexpectatus  |                                                         |
 |                                                         |                          +-------Panaspis breviceps  |                                                         |                          |
 |                                                         |                      +---+
 |                                                         