# Notebook for EAGLE.lib modules development
This notebook should be run from the test environment (not the development one).

In [32]:
# imports
import os
from collections import defaultdict

import numpy
from scipy.stats import chisquare

from EAGLE.lib.seqs import load_fasta_to_dict

## Conservative columns irregularity

In [120]:
# constants
# Default threshold used by the column-conservation functions below
# (compared against the share of the most frequent symbol in a column).
CONS_THR = 0.98

# input
# The directory with the alignments can be overridden through the
# EAGLE_ALN_WORKDIR environment variable so the notebook is not tied to one
# machine; the original location stays the default.
workdir = os.environ.get(
    "EAGLE_ALN_WORKDIR",
    "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/alignments/DnaK/",
)
dnaK_AORFs_aln_fasta = os.path.join(workdir, "dnaK_AORFs_aln.fasta")
PF00208_aln_fasta = os.path.join(workdir, "PF00208_seed_aln.fasta")
PF05088_aln_fasta = os.path.join(workdir, "PF05088_seed_aln.fasta")

In [121]:
def estimate_irregularity(mult_aln_dict, cons_thr=CONS_THR, window_l=150, windows_step=75):  # This function will be the MultAln classmethod
    """Chi-square test of how evenly conserved columns are spread along an alignment.

    The alignment is cut into windows of ``window_l`` columns starting every
    ``windows_step`` columns, conserved columns are counted in each window
    with ``cons_cols_num`` and the per-window counts are passed to
    ``scipy.stats.chisquare``.

    Args:
        mult_aln_dict: mapping of sequence id to aligned sequence (will be
            replaced with self.mult_aln_dict). Presumably the dict returned
            by ``load_fasta_to_dict``; values must support ``len()`` and
            slicing, and the length is taken from the first sequence.
        cons_thr: conservation threshold forwarded to ``cons_cols_num``.
        window_l: window length in alignment columns.
        windows_step: distance between the starts of consecutive windows.

    Returns:
        The result object of ``scipy.stats.chisquare`` for the per-window
        counts of conserved columns.

    Raises:
        ValueError: if ``windows_step`` is less than 1 or ``mult_aln_dict``
            is empty.
    """
    if windows_step < 1:
        # a non-positive step would never advance the window start
        raise ValueError("windows_step must be a positive integer, got %r" % (windows_step,))
    if not mult_aln_dict:
        raise ValueError("mult_aln_dict is empty: nothing to estimate")

    # next(iter(...)) instead of keys()[0]: works on both Python 2 and 3
    aln_l = len(next(iter(mult_aln_dict.values())))
    # NOTE(review): windows starting closer than window_l to the alignment end
    # are shorter than the others but are still compared with them by
    # chisquare, which expects equal frequencies by default - confirm that
    # this is intended.
    windows_list = [
        dict((seq_id, seq[i: i+window_l]) for seq_id, seq in mult_aln_dict.items())
        for i in range(0, aln_l, windows_step)
    ]
    cons_cols_by_windows = numpy.array([cons_cols_num(w, cons_thr=cons_thr) for w in windows_list])
    # development output: per-window counts and their mean
    print(cons_cols_by_windows, cons_cols_by_windows.mean())
    return chisquare(cons_cols_by_windows)


def cons_cols_num(mult_aln_dict, cons_thr=CONS_THR):
    """Count the conserved columns of a multiple alignment.

    A column is counted when the share of gap symbols ("-") in it does not
    exceed ``1.0 - cons_thr`` and the share of its most frequent symbol
    (compared case-insensitively) is at least ``cons_thr``.

    Args:
        mult_aln_dict: mapping of sequence id to aligned sequence; the number
            of columns is taken from the first sequence, so every sequence
            has to be at least that long.
        cons_thr: conservation threshold as a fraction of the sequences.

    Returns:
        Number of columns that satisfy both conditions; 0 for an empty dict.
    """
    if not mult_aln_dict:
        # no sequences - no columns to look at
        return 0
    # every sequence adds exactly one symbol to a column, so the per-column
    # total is simply the number of sequences
    seqs_num = float(len(mult_aln_dict))
    # next(iter(...)) instead of keys()[0]: works on both Python 2 and 3
    aln_l = len(next(iter(mult_aln_dict.values())))
    cln = 0
    for i in range(aln_l):
        s_num_dict = defaultdict(int)
        for seq in mult_aln_dict.values():
            s_num_dict[seq[i].lower()] += 1
        if s_num_dict.get("-", 0)/seqs_num > 1.0-cons_thr:
            continue  # too many gaps in the column
        # NOTE(review): the gap symbol takes part in this maximum as well, so
        # for cons_thr <= 0.5 a gap-dominated column can be counted.
        if max(s_num_dict.values())/seqs_num >= cons_thr:
            cln += 1
    return cln


def rarefy(mult_aln_dict, seqs_to_remain=100):
    """Randomly subsample an alignment down to ``seqs_to_remain`` sequences.

    Args:
        mult_aln_dict: mapping of sequence id to sequence.
        seqs_to_remain: number of sequences to keep.

    Returns:
        ``mult_aln_dict`` itself (not a copy) when it holds no more than
        ``seqs_to_remain`` sequences; otherwise a new dict with
        ``seqs_to_remain`` entries drawn without replacement using
        ``numpy.random.randint`` (seed numpy's global generator beforehand
        for a reproducible choice).
    """
    # a real list is required for pop(): dict.keys() is a view on Python 3
    seqs_ids = list(mult_aln_dict)
    if len(seqs_ids) <= seqs_to_remain:
        return mult_aln_dict
    rarefyed_aln_dict = dict()
    for _ in range(seqs_to_remain):
        # popping the drawn id guarantees that it cannot be selected twice
        seq_id = seqs_ids.pop(numpy.random.randint(len(seqs_ids)))
        rarefyed_aln_dict[seq_id] = mult_aln_dict[seq_id]
    return rarefyed_aln_dict

In [122]:
# Load the alignments; only the dnaK one goes through rarefy(), which keeps
# at most 100 sequences by default.
dnaK_AORFs_aln_dict = rarefy(load_fasta_to_dict(dnaK_AORFs_aln_fasta))
PF00208_aln_dict = load_fasta_to_dict(PF00208_aln_fasta)
PF05088_aln_dict = load_fasta_to_dict(PF05088_aln_fasta)

# All three alignments are run with the same window settings.
# Notes carried over from the original calls (presumably alternative
# "window_l; windows_step" values for each alignment):
#   dnaK_AORFs: 150; 75
#   PF00208: 60; 30
#   PF05088: 300; 150
for aln_dict in (dnaK_AORFs_aln_dict, PF00208_aln_dict, PF05088_aln_dict):
    print(estimate_irregularity(mult_aln_dict=aln_dict, window_l=50, windows_step=25))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]), 0.45714285714285713)
Power_divergenceResult(statistic=45.25, pvalue=0.09401599631159645)
(array([4, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0]), 0.9333333333333333)
Power_divergenceResult(statistic=24.57142857142857, pvalue=0.03904172878196133)
(array([ 0,  0,  0,  0,  0,  1,  1,  0,  0,  1,  1,  0,  1,  2,  2,  1,  0,
        0,  0,  0,  0,  1,  1,  1,  1,  0,  1,  1,  0,  3,  5,  3,  1,  0,
        0,  3,  3,  6, 19, 27, 18,  8, 10, 18, 29, 19, 15, 20, 13,  6,  5,
        7, 10, 15, 12,  8,  3,  2,  3,  2,  4,  5,  2,  0,  0,  1,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), 4.128205128205129)
Power_divergenceResult(statistic=824.260869565217, pvalue=1.555762467305417e-125)


In [112]:
# Report the size of every loaded alignment; the alignment length is taken
# from the first sequence of each dict.
for aln_dict in (dnaK_AORFs_aln_dict, PF00208_aln_dict, PF05088_aln_dict):
    # next(iter(...)) instead of keys()[0]: works on both Python 2 and 3
    first_seq = next(iter(aln_dict.values()))
    print("Number of sequences: %s; alignment length %s" % (len(aln_dict), len(first_seq)))

Number of sequences: 100; alignment length 859
Number of sequences: 97; alignment length 353
Number of sequences: 113; alignment length 1934
