# Notebook for EAGLE.lib modules development
This notebook should be run from the test environment (not the development one).

In [1]:
# imports
import os
from collections import defaultdict

import numpy
import pandas
from scipy.stats import chisquare

from EAGLE.lib.seqs import load_fasta_to_dict
from EAGLE.lib.alignment import MultAln

## Conservative columns uniformity

In [56]:
# constants
CONS_THR = 1.0  # passed as cons_thr to estimate_uniformity in the cells below

# input
# All three alignment FASTA files are located directly inside workdir.
workdir = "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/alignments/DnaK/"
dnaK_AORFs_aln_fasta, PF00208_aln_fasta, PF05088_aln_fasta = (
    os.path.join(workdir, fasta_name)
    for fasta_name in (
        "dnaK_AORFs_aln.fasta",
        "PF00208_seed_aln.fasta",
        "PF05088_seed_aln.fasta",
    )
)

In [57]:
# Load the three alignments; only the dnaK AORFs alignment is rarefied after loading.
load_aln = MultAln.load_alignment
dnaK_AORFs_aln = load_aln(dnaK_AORFs_aln_fasta).rarefy()
PF00208_aln = load_aln(PF00208_aln_fasta)
PF05088_aln = load_aln(PF05088_aln_fasta)

# Print the uniformity estimate of every alignment, in loading order,
# using the same window settings for all of them.
uniformity_kwargs = dict(cons_thr=CONS_THR, window_l=10, windows_step=5)
for aln in (dnaK_AORFs_aln, PF00208_aln, PF05088_aln):
    print(aln.estimate_uniformity(**uniformity_kwargs))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.09302325581395349)
Power_divergenceResult(statistic=156.0, pvalue=0.7880827853544203)
(array([2, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0]), 0.16901408450704225)
Power_divergenceResult(statistic=106.33333333333329, pvalue=0.0033364225

In [58]:
# Load the alignments again from the FASTA files so this cell works on freshly
# loaded objects; only the dnaK AORFs alignment is rarefied.
load_aln = MultAln.load_alignment
dnaK_AORFs_aln = load_aln(dnaK_AORFs_aln_fasta).rarefy()
PF00208_aln = load_aln(PF00208_aln_fasta)
PF05088_aln = load_aln(PF05088_aln_fasta)

# Same uniformity estimate as in the previous cell, but computed on the result
# of improve_aln() for each alignment.
uniformity_kwargs = dict(cons_thr=CONS_THR, window_l=10, windows_step=5)
for aln in (dnaK_AORFs_aln, PF00208_aln, PF05088_aln):
    print(aln.improve_aln().estimate_uniformity(**uniformity_kwargs))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.05847953216374269)
Power_divergenceResult(statistic=161.00000000000003, pvalue=0.6774772533033233)
(array([1, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0]), 0.15492957746478872)
Power_divergenceResult(statistic=111.6363636363636, pvalue=0.0

In [112]:
# Report the number of sequences and the alignment length (length of the first
# sequence) for each alignment dictionary.
# NOTE(review): the *_aln_dict variables are not defined in any cell shown in this
# notebook; presumably they were built with load_fasta_to_dict from the *_aln_fasta
# paths - confirm and restore the defining cell so this one survives
# "Restart Kernel -> Run All".
for aln_dict in (dnaK_AORFs_aln_dict, PF00208_aln_dict, PF05088_aln_dict):
    # next(iter(...)) fetches the first value on both Python 2 and Python 3;
    # indexing dict.keys() only works on Python 2. An empty dict reports length 0
    # instead of raising.
    first_seq = next(iter(aln_dict.values()), "")
    print("Number of sequences: %s; alignment length %s" % (len(aln_dict), len(first_seq)))

Number of sequences: 100; alignment length 859
Number of sequences: 97; alignment length 353
Number of sequences: 113; alignment length 1934


In [59]:
# Hand-collected P-values per conservation threshold, one row per threshold.
# PF05088_P is stored as 0 in every row; the trailing comment on each row keeps
# the number noted next to it originally (presumably the exact PF05088 P-value
# - TODO confirm).
_res_columns = ("cons", "dnaK_AORFs_P", "PF00208_P", "PF05088_P")
_res_rows = (
    ("80%", 3.3e-12, 1.86e-05, 0),  # 8.94e-71
    ("90%", 0.00019, 4.54e-05, 0),  # 3.4e-94
    ("95%", 0.00707, 3.64e-05, 0),  # 9.01e-104
    ("98%", 0.5268, 8.33e-05, 0),  # 5.32e-102
    ("100%", 0.67748, 0.00115, 0),  # 1.28e-95
)
res = list(dict(zip(_res_columns, values)) for values in _res_rows)
pandas.DataFrame(res)

Unnamed: 0,PF00208_P,PF05088_P,cons,dnaK_AORFs_P
0,1.9e-05,0,80%,3.3e-12
1,4.5e-05,0,90%,0.00019
2,3.6e-05,0,95%,0.00707
3,8.3e-05,0,98%,0.5268
4,0.00115,0,100%,0.67748


In [5]:
# Open the test fixture and display its first line. The handle stays open in `f`
# because a later cell still refers to it.
test_file_path = "../tests/test_data/lib/f1.txt"
f = open(test_file_path)
f.readline()

"Hello! I'm a test file\n"

In [19]:
# NOTE(review): the cell source was cut off at `f.`, which is a syntax error.
# readline() is assumed here because the recorded output '' matches a read at
# end of file - confirm the intended call or delete this scratch cell.
f.readline()

''