# Notebook for EAGLE.lib modules development
This notebook should be run from the test environment (not the development one).

In [1]:
# imports
import os
from collections import defaultdict

import numpy
import pandas
from scipy.stats import chisquare

from EAGLE.lib.seqs import load_fasta_to_dict
from EAGLE.lib.alignment import MultAln

## Uniformity of conserved columns

In [2]:
# constants
# Default conservation threshold: the share of sequences that must carry the
# same symbol in a column (1.0 means every sequence).
CONS_THR = 1.0

# input
# The alignments directory can be overridden with the EAGLE_ALN_WORKDIR
# environment variable so the notebook is not tied to one machine; the
# original local path is kept as the default.
workdir = os.environ.get(
    "EAGLE_ALN_WORKDIR",
    "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/alignments/DnaK/",
)
dnaK_AORFs_aln_fasta = os.path.join(workdir, "dnaK_AORFs_aln.fasta")
PF00208_aln_fasta = os.path.join(workdir, "PF00208_seed_aln.fasta")
PF05088_aln_fasta = os.path.join(workdir, "PF05088_seed_aln.fasta")

In [16]:
def estimate_irregularity(mult_aln_dict, cons_thr=CONS_THR, window_l=150, windows_step=75):
    """Test whether conserved columns are spread evenly along an alignment.

    The alignment is cut into windows of `window_l` columns that start every
    `windows_step` columns; conserved columns are counted in each window with
    `cons_cols_num` and the counts are passed to `scipy.stats.chisquare`.

    This function is meant to become a MultAln method; `mult_aln_dict` will
    then be replaced with `self.mult_aln_dict`.

    :param mult_aln_dict: dict-like {seq_id: aligned sequence}
    :param cons_thr: conservation threshold forwarded to `cons_cols_num`
    :param window_l: window length in alignment columns
    :param windows_step: distance between the starts of consecutive windows
    :return: the result of `chisquare` for the per-window counts
    :raises ValueError: if `windows_step` is not positive or the alignment
        contains no sequences
    """
    if windows_step <= 0:
        # A non-positive step would never advance the window start (endless loop).
        raise ValueError("windows_step must be positive, got %s" % windows_step)
    # next(iter(...)) instead of keys()[0]: works with Python 2 lists and Python 3 views.
    first_seq_id = next(iter(mult_aln_dict.keys()), None)
    if first_seq_id is None:
        raise ValueError("mult_aln_dict contains no sequences")
    aln_l = len(mult_aln_dict[first_seq_id])

    # NOTE(review): windows that start near the end of the alignment are
    # shorter than window_l (slicing truncates them), so they can hold fewer
    # conserved columns than full windows - this may bias the test; confirm
    # whether partial windows should be dropped.
    windows_list = [
        dict((seq_id, mult_aln_dict[seq_id][start: start+window_l]) for seq_id in mult_aln_dict)
        for start in range(0, aln_l, windows_step)
    ]
    cons_cols_by_windows = numpy.array([cons_cols_num(w, cons_thr=cons_thr) for w in windows_list])
    print(cons_cols_by_windows, cons_cols_by_windows.mean())
    return chisquare(cons_cols_by_windows)


def cons_cols_num(mult_aln_dict, cons_thr=CONS_THR):
    """Count the conserved columns of an alignment.

    A column is counted when the share of "-" symbols in it is not greater
    than `1.0 - cons_thr` and the share of its most frequent symbol is at
    least `cons_thr`. Symbols are compared case-insensitively.

    :param mult_aln_dict: dict-like {seq_id: aligned sequence}; the number of
        columns is taken from the first sequence
    :param cons_thr: conservation threshold as a fraction of sequences
    :return: number of conserved columns (0 for an alignment without sequences)
    """
    # next(iter(...)) instead of keys()[0]: works with Python 2 lists and Python 3 views.
    first_seq_id = next(iter(mult_aln_dict.keys()), None)
    if first_seq_id is None:
        return 0

    # NOTE(review): 1.0 - cons_thr is inexact in floating point for some
    # thresholds (e.g. 0.9), so a column with exactly that share of gaps may
    # fail the `<=` check - confirm whether a tolerance is wanted.
    max_gap_share = 1.0 - cons_thr
    cln = 0
    for i in range(len(mult_aln_dict[first_seq_id])):
        s_num_dict = defaultdict(int)
        for seq_id in mult_aln_dict:
            s_num_dict[mult_aln_dict[seq_id][i].lower()] += 1
        all_s_num = float(sum(s_num_dict.values()))
        gap_share = s_num_dict.get("-", 0) / all_s_num
        top_share = max(s_num_dict.values()) / all_s_num
        if gap_share <= max_gap_share and top_share >= cons_thr:
            cln += 1
    return cln


def rarefy(mult_aln_dict, seqs_to_remain=100):
    """Randomly subsample an alignment down to `seqs_to_remain` sequences.

    :param mult_aln_dict: dict-like {seq_id: aligned sequence}
    :param seqs_to_remain: number of sequences to keep
    :return: `mult_aln_dict` itself when it has no more than `seqs_to_remain`
        sequences, otherwise a new dict with `seqs_to_remain` entries drawn
        without replacement using `numpy.random.randint`
    """
    # list(...) gives a private, mutable copy of the ids on both Python 2 and
    # Python 3 (a Python 3 keys view has no pop()); the input is not modified.
    seqs_ids = list(mult_aln_dict.keys())
    if len(seqs_ids) <= seqs_to_remain:
        return mult_aln_dict
    rarefyed_aln_dict = dict()
    for _ in range(seqs_to_remain):
        # Popping the drawn id guarantees that no sequence is selected twice.
        seq_id = seqs_ids.pop(numpy.random.randint(len(seqs_ids)))
        rarefyed_aln_dict[seq_id] = mult_aln_dict[seq_id]
    return rarefyed_aln_dict

In [4]:
# Load the three alignments and test how evenly conserved columns are spread.
# dnaK_AORFs_aln_dict was previously defined only in a commented-out line, so
# this cell failed on a fresh kernel. It is now built with the notebook-level
# rarefy() helper (default: at most 100 sequences).
# NOTE(review): assumes MultAln.load_alignment returns a dict-like
# {seq_id: aligned sequence}, as the two Pfam lines already rely on - confirm.
# NOTE(review): rarefy() draws from numpy.random without a seed, so the dnaK
# subsample (and its result) differs between runs.
dnaK_AORFs_aln_dict = rarefy(MultAln.load_alignment(dnaK_AORFs_aln_fasta))
PF00208_aln_dict = MultAln.load_alignment(PF00208_aln_fasta)
PF05088_aln_dict = MultAln.load_alignment(PF05088_aln_fasta)

print(estimate_irregularity(mult_aln_dict=dnaK_AORFs_aln_dict, window_l=50, windows_step=25))  # 150; 75
print(estimate_irregularity(mult_aln_dict=PF00208_aln_dict, window_l=50, windows_step=25))  # 60; 30
print(estimate_irregularity(mult_aln_dict=PF05088_aln_dict, window_l=50, windows_step=25))  # 300; 150

TypeError: coercing to Unicode: need string or buffer, dict found

In [112]:
# Report the number of sequences and the number of columns (length of the
# first sequence) for each loaded alignment.
for aln_dict in (dnaK_AORFs_aln_dict, PF00208_aln_dict, PF05088_aln_dict):
    # next(iter(...)) instead of keys()[0]: works with Python 2 lists and Python 3 views.
    first_seq_id = next(iter(aln_dict.keys()))
    print("Number of sequences: %s; alignment length %s" % (len(aln_dict), len(aln_dict[first_seq_id])))

Number of sequences: 100; alignment length 859
Number of sequences: 97; alignment length 353
Number of sequences: 113; alignment length 1934


In [18]:
# Summary table, one record per conservation threshold.
# NOTE(review): the values are entered by hand - presumably the chi-square
# p-values printed by estimate_irregularity for each alignment; the trailing
# comments presumably hold the unrounded PF05088 values - TODO confirm.
res = [
    dict(cons="80%", dnaK_AORFs_P=0, PF00208_P=0.0001, PF05088_P=0),  # 2.8e-102
    dict(cons="90%", dnaK_AORFs_P=2.07e-10, PF00208_P=0.0061, PF05088_P=0),  # 1.22e-116
    dict(cons="95%", dnaK_AORFs_P=0.00036, PF00208_P=0.0163, PF05088_P=0),  # 7.04e-124
    dict(cons="98%", dnaK_AORFs_P=0.00025, PF00208_P=0.039, PF05088_P=0),  # 1.56e-125
    dict(cons="100%", dnaK_AORFs_P=0.436, PF00208_P=0.02994, PF05088_P=0),  # 2.54e-116
]
pandas.DataFrame(res)

Unnamed: 0,PF00208_P,PF05088_P,cons,dnaK_AORFs_P
0,0.0001,0,80%,0.0
1,0.0061,0,90%,2.07e-10
2,0.0163,0,95%,0.00036
3,0.039,0,98%,0.00025
4,0.02994,0,100%,0.436


In [5]:
# Open a small test data file (path relative to the notebook's working
# directory) and display its first line.
# NOTE(review): the handle is never closed in the visible cells; it is left
# open here because a later cell still references `f`. Close it once that
# cell is no longer needed.
f = open("../tests/test_data/lib/f1.txt")
f.readline()

"Hello! I'm a test file\n"

In [19]:
# The cell previously held only `f.`, an incomplete expression (SyntaxError).
# NOTE(review): readline() is a guess consistent with the recorded '' output
# (an empty string is what readline() returns at end of file) - confirm.
f.readline()

''