# Notebook for EAGLE.lib modules development
This notebook should be run from the test environment (not the development one).

In [28]:
# imports
import os
import io
from collections import defaultdict

import numpy
import pandas
from scipy.stats import chisquare
from Bio import Phylo

from EAGLE.lib.seqs import load_fasta_to_dict
from EAGLE.lib.alignment import MultAln

## Conservative columns uniformity

In [2]:
# constants
# Value passed as `cons_thr` to `estimate_uniformity` in the cells below.
# NOTE(review): presumably 1.0 means "100% conserved columns" - confirm against EAGLE.lib.alignment.
CONS_THR = 1.0

# input
# Directory holding the alignment FASTA files. The default is the author's local path;
# set the EAGLE_DNAK_WORKDIR environment variable to run the notebook on another machine
# without editing this cell.
workdir = os.environ.get(
    "EAGLE_DNAK_WORKDIR",
    "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/alignments/DnaK/"
)
# Paths of the three alignments compared in this section.
dnaK_AORFs_aln_fasta = os.path.join(workdir, "dnaK_AORFs_aln.fasta")
PF00208_aln_fasta = os.path.join(workdir, "PF00208_seed_aln.fasta")
PF05088_aln_fasta = os.path.join(workdir, "PF05088_seed_aln.fasta")

In [3]:
# Load the three alignments. Only the dnaK AORFs alignment is rarefied (100 sequences kept).
# NOTE(review): if `rarefy` samples sequences randomly the printed value is not
# reproducible between runs - confirm and seed if needed.
dnaK_AORFs_aln = MultAln.load_alignment(dnaK_AORFs_aln_fasta).rarefy(seqs_to_remain=100)
PF00208_aln = MultAln.load_alignment(PF00208_aln_fasta)
PF05088_aln = MultAln.load_alignment(PF05088_aln_fasta)

# Print the uniformity estimate of each alignment as loaded, one value per line,
# in the order dnaK AORFs, PF00208, PF05088.
for aln in (dnaK_AORFs_aln, PF00208_aln, PF05088_aln):
    print(aln.estimate_uniformity(cons_thr=CONS_THR, window_l=10, windows_step=5))

0.6769742940888603
0.0033364225866355087
1.7093471117856797e-102


In [4]:
# Reload the three alignments from disk so this cell does not depend on the state
# left behind by the previous one.
dnaK_AORFs_aln = MultAln.load_alignment(dnaK_AORFs_aln_fasta).rarefy(seqs_to_remain=100)
PF00208_aln = MultAln.load_alignment(PF00208_aln_fasta)
PF05088_aln = MultAln.load_alignment(PF05088_aln_fasta)

# Same estimate as before, but computed on the result of `improve_aln()`.
# Printed in the order dnaK AORFs, PF00208, PF05088.
for aln in (dnaK_AORFs_aln, PF00208_aln, PF05088_aln):
    improved_aln = aln.improve_aln()
    print(improved_aln.estimate_uniformity(cons_thr=CONS_THR,
                                           window_l=10,
                                           windows_step=5))

0.788807249484195
0.0011520096622918072
1.2808890086709018e-95


In [6]:
# Report the size of each alignment (number of sequences and alignment length),
# in the order dnaK AORFs, PF00208, PF05088.
size_report = "Number of sequences: %s; alignment length %s"
for aln in (dnaK_AORFs_aln, PF00208_aln, PF05088_aln):
    print(size_report % (aln.num_seqs, aln.length))

Number of sequences: 100; alignment length 859
Number of sequences: 97; alignment length 353
Number of sequences: 113; alignment length 1934


In [59]:
# Literal summary table: one row per "cons" level, one P column per alignment.
# NOTE(review): the PF05088 column is entered as 0; the trailing comment on each row is
# carried over from the original cell and presumably records the actual value - confirm.
res_columns = ("cons", "dnaK_AORFs_P", "PF00208_P", "PF05088_P")
res_rows = [
    ("80%", 3.3e-12, 1.86e-05, 0),  # 8.94e-71
    ("90%", 0.00019, 4.54e-05, 0),  # 3.4e-94
    ("95%", 0.00707, 3.64e-05, 0),  # 9.01e-104
    ("98%", 0.5268, 8.33e-05, 0),  # 5.32e-102
    ("100%", 0.67748, 0.00115, 0),  # 1.28e-95
]
# Same list of per-row dicts as before, assembled from the column names above.
res = [dict(zip(res_columns, row)) for row in res_rows]
pandas.DataFrame(res)

Unnamed: 0,PF00208_P,PF05088_P,cons,dnaK_AORFs_P
0,1.9e-05,0,80%,3.3e-12
1,4.5e-05,0,90%,0.00019
2,3.6e-05,0,95%,0.00707
3,8.3e-05,0,98%,0.5268
4,0.00115,0,100%,0.67748


In [38]:
# Literal fixture describing one top-level group ("Enterobacteriaceae").
# NOTE(review): presumably a sample of the EAGLE bacteria database record for one
# family, with nesting genus -> species -> strain - confirm against EAGLE.
# Layout visible below:
#   * "chr_id": sequence id -> 3-element list of nested keys locating its record
#   * one nested dict per first-level name, ending in a record with the keys
#     "16S_rRNA_file", "fna_file", "download_prefix", "source_db" and "repr"
#   * group-level file paths ("repr_profile", "16S_rRNA_tsv", "fam_fna",
#     "16S_rRNA_fasta", "blastdb")
#   * "16S_rRNA_tree": leaf-name lookup table plus the tree as a Newick string
entero_data = {
    "Enterobacteriaceae": {
        # Each list is the chain of keys (outer -> middle -> inner) under which the
        # corresponding record is stored further down in this dict.
        "chr_id": {
            "NC_022534.1": ["Plautia", "Plautia_stali", "Plautia_stali_symbiont"], 
            "NC_022546.1": ["Plautia", "Plautia_stali", "Plautia_stali_symbiont"], 
            "NZ_CP011049.1": ["Cronobacter", "Cronobacter_sakazakii", "Cronobacter_sakazakii"], 
            "NZ_CP011048.1": ["Cronobacter", "Cronobacter_sakazakii", "Cronobacter_sakazakii"], 
            "NZ_CP019445.1": ["Kosakonia", "Kosakonia_cowanii", "Kosakonia_cowanii"], 
            "NZ_CP011047.1": ["Cronobacter", "Cronobacter_sakazakii", "Cronobacter_sakazakii"], 
            "NZ_CP011050.1": ["Cronobacter", "Cronobacter_sakazakii", "Cronobacter_sakazakii"], 
            "NZ_CP019446.1": ["Kosakonia", "Kosakonia_cowanii", "Kosakonia_cowanii"], 
            "NZ_CP009451.1": ["Cedecea", "Cedecea_neteri", "Cedecea_neteri"], 
            "LT594522.1": ["secondary", "secondary_endosymbiont", "secondary_endosymbiont_of_Trabutina_mannipara"], 
            "NC_022533.1": ["Plautia", "Plautia_stali", "Plautia_stali_symbiont"], 
            "CP009450.1": ["Pluralibacter", "Pluralibacter_gergoviae", "Pluralibacter_gergoviae"], 
            "NZ_CP019447.1": ["Kosakonia", "Kosakonia_cowanii", "Kosakonia_cowanii"], 
            "NC_017910.1": ["Shimwellia", "Shimwellia_blattae", "Shimwellia_blattae_DSM_4481_=_NBRC_105725"], 
            "CP007215.3": ["Kosakonia", "Kosakonia_sacchari", "Kosakonia_sacchari_SP1"]
        }, 
        # Nested records: three levels of names, then the per-entry file paths,
        # download prefix, source database ("refseq" or "genbank") and "repr" flag.
        "Plautia": {
            "Plautia_stali": {
                "Plautia_stali_symbiont": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Plautia_stali_symbiont_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCF_000180175.2_ASM18017v2_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/180/175/GCF_000180175.2_ASM18017v2/GCF_000180175.2_ASM18017v2", 
                    "source_db": "refseq", 
                    "repr": True
                }
            }
        },
        "Cedecea": {
            "Cedecea_neteri": {
                "Cedecea_neteri": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Cedecea_neteri_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCF_000757825.1_ASM75782v1_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/757/825/GCF_000757825.1_ASM75782v1/GCF_000757825.1_ASM75782v1", 
                    "source_db": "refseq", 
                    "repr": True
                }
            }
        },
        "Kosakonia": {
            "Kosakonia_sacchari": {
                "Kosakonia_sacchari_SP1": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Kosakonia_sacchari_SP1_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCA_000300455.4_ASM30045v4_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/300/455/GCA_000300455.4_ASM30045v4/GCA_000300455.4_ASM30045v4", 
                    "source_db": "genbank", 
                    "repr": True
                }
            }, 
            "Kosakonia_cowanii": {
                "Kosakonia_cowanii": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Kosakonia_cowanii_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCF_001975225.1_ASM197522v1_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/975/225/GCF_001975225.1_ASM197522v1/GCF_001975225.1_ASM197522v1", 
                    "source_db": "refseq", 
                    "repr": True
                }
            }
        },
        "Pluralibacter": {
            "Pluralibacter_gergoviae": {
                "Pluralibacter_gergoviae": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Pluralibacter_gergoviae_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCA_000757785.1_ASM75778v1_genomic.fna", 
                    "download_prefix": 
                    "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/757/785/GCA_000757785.1_ASM75778v1/GCA_000757785.1_ASM75778v1", 
                    "source_db": "genbank", 
                    "repr": True
                }
            }
        }, 
        "Shimwellia": {
            "Shimwellia_blattae": {
                "Shimwellia_blattae_DSM_4481_=_NBRC_105725": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Shimwellia_blattae_DSM_4481_=_NBRC_105725_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCF_000262305.1_ASM26230v1_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/262/305/GCF_000262305.1_ASM26230v1/GCF_000262305.1_ASM26230v1", 
                    "source_db": "refseq", 
                    "repr": True
                }
            }
        }, 
        "secondary": {
            "secondary_endosymbiont": {
                "secondary_endosymbiont_of_Trabutina_mannipara": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/secondary_endosymbiont_of_Trabutina_mannipara_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCA_900090215.1_TRABTM_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/090/215/GCA_900090215.1_TRABTM/GCA_900090215.1_TRABTM", 
                    "source_db": "genbank", 
                    "repr": True
                }
            }
        },
        "Cronobacter": {
            "Cronobacter_sakazakii": {
                "Cronobacter_sakazakii": {
                    "16S_rRNA_file": "EAGLEdb/bacteria/Cronobacter_sakazakii_16S_rRNA.fasta", 
                    "fna_file": "EAGLEdb/bacteria/GCF_000982825.1_ASM98282v1_genomic.fna", 
                    "download_prefix": "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/982/825/GCF_000982825.1_ASM98282v1/GCF_000982825.1_ASM98282v1", 
                    "source_db": "refseq", 
                    "repr": True
                }
            }
        },
        # Group-level file paths (relative, all under EAGLEdb/bacteria/).
        "repr_profile": "EAGLEdb/bacteria/Enterobacteriaceae.hmm", 
        "16S_rRNA_tsv": "EAGLEdb/bacteria/Enterobacteriaceae_16S_rRNA.tsv", 
        "fam_fna": "EAGLEdb/bacteria/Enterobacteriaceae.fasta", 
        "16S_rRNA_fasta": "EAGLEdb/bacteria/Enterobacteriaceae_16S_rRNA.fasta", 
        "blastdb": "EAGLEdb/bacteria/Enterobacteriaceae_blastdb/Enterobacteriaceae", 
        # "full_seq_names" maps a short leaf label to its organism name; the Newick
        # string holds more leaf labels (e.g. Cenet__x_2) than this table lists.
        "16S_rRNA_tree": {
            "full_seq_names": {
                "Plsta__x_0": {"organism_name": "Plautia_stali_symbiont"}, 
                "Crsak__x_3": {"organism_name": "Cronobacter_sakazakii"}, 
                "Shbla__x_1": {"organism_name": "Shimwellia_blattae_DSM_4481_=_NBRC_105725"}, 
                "Plger__x_5": {"organism_name": "Pluralibacter_gergoviae"}, 
                "seend__x__": {"organism_name": "secondary_endosymbiont_of_Trabutina_mannipara"}, 
                "Kocow__x_1": {"organism_name": "Kosakonia_cowanii"}, 
                "Kosac__x_4": {"organism_name": "Kosakonia_sacchari_SP1"}, 
                "Cenet__x_1": {"organism_name": "Cedecea_neteri"}
            }, 
            "newick": "((((((seend__x__:0.089128,(((((((Cenet__x_2:-0.000001,(Cenet__x_3:0.000000,Cenet__x_4:-0.000000):0.000001):0.000003,Cenet__x_0:-0.000003):0.000019,Cenet__x_5:-0.000019):0.000059,Cenet__x__:-0.000059):0.000214,Cenet__x_1:-0.000214):0.016438,(((((((Kosac__x_2:-0.000000,(Kosac__x_0:0.000000,Kosac__x_1:-0.000000):0.000000):0.000002,Kosac__x_5:-0.000002):0.000019,Kosac__x__:-0.000019):0.000025,Kosac__x_3:-0.000025):0.000079,Kosac__x_4:-0.000079):0.008846,((Plger__x__:-0.000011,((Plger__x_4:-0.000001,(Plger__x_5:0.000000,(Plger__x_2:-0.000000,Plger__x_1:0.000000):-0.000000):0.000001):0.000001,Plger__x_0:-0.000001):0.000011):0.000063,Plger__x_3:-0.000063):0.010302):0.002742,((((Kocow__x_4:-0.000005,(Kocow__x_2:-0.000004,(Kocow__x_5:-0.000004,Kocow__x_3:0.000004):0.000004):0.000005):0.000009,Kocow__x__:-0.000009):0.000019,Kocow__x_0:-0.000019):0.000070,Kocow__x_1:-0.000070):0.013207):0.001487):0.002120,((Shbla__x_1:-0.000518,(((Shbla__x_2:-0.000015,Shbla__x_4:0.000015):0.000054,Shbla__x__:-0.000054):0.000023,(((Shbla__x_0:0.000034,Shbla__x_3:-0.000034):0.000037,Shbla__x_6:-0.000037):0.000047,Shbla__x_5:-0.000047):0.000046):0.000483):0.016279,((((((Crsak__x__:0.000001,Crsak__x_2:-0.000001):0.000002,Crsak__x_5:-0.000002):0.000002,Crsak__x_0:-0.000002):0.000012,Crsak__x_4:-0.000012):0.000042,Crsak__x_1:-0.000042):0.000435,Crsak__x_3:-0.000435):0.012426):0.005990):0.001677):0.018294,Plsta__x_0:-0.000937):0.000937,Plsta__x_4:-0.000016):0.000016,Plsta__x_2:-0.000011):0.000011,Plsta__x_3:-0.000009):0.000009,Plsta__x_1:-0.000007,Plsta__x__:0.000007);"
        } 
    }
}

# Shortcut to the single group record, so later cells need not repeat the top-level key.
btax_data = entero_data["Enterobacteriaceae"]

In [49]:
# Read a single tree in Newick format from a file located three directories above
# the notebook's working directory.
entero_tree_path = "../../../entero_tree.nwk"
tree = Phylo.read(entero_tree_path, "newick")

<bound method Tree.get_nonterminals of Tree(rooted=False, weight=1.0)>

In [20]:
class C(object):
    """Minimal container prototype that delegates item access to a wrapped mapping.

    The wrapped object is stored as-is (not copied) in the public attribute `d`,
    so changes made through the container are visible on the original object.
    """

    def __init__(self, d):
        """Wrap `d`, the mapping that backs all item operations."""
        self.d = d

    def __getitem__(self, d_id):
        """Return the value stored under `d_id`; errors from the mapping propagate."""
        return self.d[d_id]

    def __setitem__(self, d_id, d_data):
        """Store `d_data` under `d_id` in the wrapped mapping."""
        self.d[d_id] = d_data

    def __delitem__(self, d_id):
        """Remove `d_id` from the wrapped mapping; errors from the mapping propagate."""
        del self.d[d_id]

    # Without the two methods below, `x in c` and `for k in c` fall back to the legacy
    # sequence protocol (calling __getitem__ with 0, 1, 2, ...), which raises
    # KeyError on a dict-backed container instead of giving a useful answer.
    def __contains__(self, d_id):
        """Return True if `d_id` is a key of the wrapped mapping."""
        return d_id in self.d

    def __iter__(self):
        """Iterate over the keys of the wrapped mapping."""
        return iter(self.d)

# Smoke-test the container: item assignment, lookup and deletion all act on
# the wrapped dict, which is then printed to show the final state.
initial_items = {"A": 1, "B": 2}
c = C(initial_items)
c["C"] = 3
value_of_a = c["A"]
print(value_of_a)
del c["A"]
print(c.d)

1
{'C': 3, 'B': 2}


In [50]:
# Keep only the values greater than 3.
# A list comprehension is used instead of `filter(...)`: on Python 3 `filter` returns
# a lazy iterator, so the cell would display `<filter object ...>` rather than `[4]`.
l = [1,2,3,4]
filtered = [x for x in l if x > 3]
filtered

[4]