In [1]:
import os
from time import time
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

from kmerlib.running_window import *
from kmerlib.spectrum import *
from kmerlib.tools import *
from utils.term_colors import *

import json

# Plot config: switch matplotlib over to seaborn's default theme.
sns.set()

In [2]:
# Bacterial genomes to analyse. Each entry is used as a sub-directory name
# under ./data/bacteria when the sequences are loaded.
bacteria_list = [
    "Alcaligenes_aquatilis",
    "Aeromonas_australiensis",
    "Campylobacter_jejuni",
    "Caulobacter_flavus",
    "Cyanobacterium_aponinum",
    "Bacillus_cereus",
    "Bacillus_pseudofirmus",
    "Acidobacterium_capsulatum",
    "Leptospira_interrogans",
    "BRC1_bacterium_SCGC_AAA252-M09",
    "Actinomyces_bovis",
    "Gemmatimonas_aurantiaca",
    "Bacteroides_fragilis",
    "Chloroflexus_aurantiacus",
    "Thermotoga_maritima",
    "Aquifex_aeolicus",
    "Gemmata_obscuriglobus",
    "Planctomycetes_bacterium_SCGC_AAA282-C19"
]
# Archaeal genomes to analyse. Each entry is used as a sub-directory name
# under ./data/archaea when the sequences are loaded.
archaea_list = [
    "Halovivax_asiaticus",
    "Methanobrevibacter_ruminantium",
    "Methanococcus_aeolicus",
    "Methanococcus_maripaludis",
    "Methanococcus_vannielii",
    "Methanopyrus_kandleri",
    "Thermococcus_gammatolerans",
    "Acidianus_brierleyi",
    "Sulfolobus_acidocaldarius",
    "Staphylothermus_marinus",
    "Thermofilum_pendens",
    "Pyrobaculum_aerophilum",
    "Pyrobaculum_arsenaticum"
]

In [3]:
# COMPUTE RAW DATA, USE IT WITH CPYTHON
def _load_group(names, type_, root):
    """Load one sequence per organism directory found under `root`.

    Parameters
    ----------
    names : iterable of str
        Organism names; each must be a sub-directory of `root`.
    type_ : str
        Value stored under the 'type' key of every record.
    root : str
        Directory that contains one sub-directory per organism.

    Returns
    -------
    list of dict
        One record per organism with the keys 'type', 'name' and 'seq'
        (the value returned by `load_seq_file`).

    Raises
    ------
    Exception
        If an organism directory does not contain exactly one file whose
        name contains ".fna".
    """
    records = []
    for name in names:
        print("> " +FG_RED+ name +" "+FG_BLUE+ type_ +END_COLOR)

        folder = Path(root) / name
        # os.listdir returns entries in arbitrary order; sort so that any
        # diagnostic output is reproducible between runs.
        fna_files = sorted(f for f in os.listdir(str(folder)) if ".fna" in f)
        # The error message promises to reject both "none" and "several",
        # so the guard has to be `!= 1`, not `< 1`.
        if len(fna_files) != 1:
            print("PATH:", folder)
            raise Exception("Several .fna files or no .fna file")

        records.append({
            'type': type_,
            'name': name,
            'seq': load_seq_file(str(folder / fna_files[0])),
        })
    return records


# Bacteria first, then archaea: later cells iterate over `data` in this order.
data = (
    _load_group(bacteria_list, 'bacteria', "./data/bacteria")
    + _load_group(archaea_list, 'archaea', "./data/archaea")
)

# Cache the loaded records so the loading step can be skipped next time.
with open("./data/tree_raw_data.json", "w") as f:
    json.dump(data, f)

> [31mAlcaligenes_aquatilis [34mbacteria[0m


KeyboardInterrupt: 

In [2]:
# LOAD DATA
# Read back the cached records (one dict per organism) from the JSON dump.
with open("./data/tree_raw_data.json") as raw_file:
    data = json.load(raw_file)

In [8]:
# For every organism: compute the 3-mer spectrum of the whole sequence, then
# the spectrum of a filtered sequence, and store both on the record
# ("spec_k3" / "filt_spec_k3").
k = 3
win_len = 200
step = 20
n_process = 3
th = 1

sep = " "+FG_GRAY+"|"+END_COLOR+" "

for s in data:
    name, type_, seq = s["name"], s["type"], s["seq"]
    print("> " +FG_RED+ name +" "+FG_BLUE+ type_ +END_COLOR)

    t_start = time()
    s["spec_k3"] = kmer_spectrum(k, seq)

    # Per-window distance profile, standardised to zero mean / unit std.
    running_dist = np.array(
        mproc_running_dist(
            k, seq, s["spec_k3"], win_len, step=step, n_process=n_process
        )
    )
    running_dist = (running_dist - np.mean(running_dist)) / np.std(running_dist)
    # One position per window: 0, step, 2*step, ... shifted by half a window
    # (presumably the window centres -- confirm against mproc_running_dist).
    bp = np.array(range(0, len(running_dist)*step, step)) + int(win_len) // 2

    filt_win = 100
    filt_d = np.array(running_average(running_dist, filt_win))
    filt_bp = bp[filt_win//2:-filt_win//2]

    # The head (before the first position) and the tail (after the last one)
    # are always kept; an inner segment is kept only when the smoothed
    # profile exceeds `th` at both of its end points.
    above = filt_d > th
    filt_seq = seq[:filt_bp[0]]
    for i in range(len(filt_bp) - 1):
        if above[i] and above[i+1]:
            filt_seq += seq[filt_bp[i]:filt_bp[i+1]]
    filt_seq += seq[filt_bp[-1]:]

    s["filt_spec_k3"] = kmer_spectrum(k, filt_seq)
    t_stop = time()

    fields = [
        FG_GRAY + "--| " + FG_BLUE + "DONE",
        "K: " + FG_GREEN + str(k),
        "Win: " + FG_GREEN + str(win_len),
        "Step: " + FG_GREEN + str(step),
        "Time: " + FG_GREEN + "{:.2f}".format(t_stop - t_start) + END_COLOR + "sec",
    ]
    print(sep.join(fields))


> [31mAlcaligenes_aquatilis [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m4.00[0msec
> [31mAeromonas_australiensis [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m6.38[0msec
> [31mCampylobacter_jejuni [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m1.63[0msec
> [31mCaulobacter_flavus [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m5.73[0msec
> [31mCyanobacterium_aponinum [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m7.50[0msec
> [31mBacillus_cereus [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m3 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Tim

In [3]:
# For every organism: compute the 9-mer spectrum of the whole sequence, then
# the spectrum of a filtered sequence, and store both on the record
# ("spec_k9" / "filt_spec_k9").
k = 9
win_len = 200
step = 20
n_process = 3
th = 1

sep = " "+FG_GRAY+"|"+END_COLOR+" "

for s in data:
    name = s["name"]
    type_ = s["type"]
    seq = s["seq"]
    print("> " +FG_RED+ name +" "+FG_BLUE+ type_ +END_COLOR)

    t_start = time()
    s["spec_k9"] = kmer_spectrum(k, seq)
    running_dist = mproc_running_dist(
        k, seq, s["spec_k9"], win_len, step=step, n_process=n_process
    )
    # Standardise the per-window distance profile (zero mean, unit std).
    running_dist = np.array(running_dist)
    running_dist = (running_dist -np.mean(running_dist))/np.std(running_dist)
    # One position per window: 0, step, 2*step, ... shifted by half a window
    # (presumably the window centres -- confirm against mproc_running_dist).
    bp = np.array(range(0, len(running_dist)*step, step)) + int(win_len) // 2

    filt_win = 100
    filt_d = np.array(running_average(running_dist, filt_win))
    filt_bp = bp[filt_win//2:-filt_win//2]

    # The head (before the first position) and the tail (after the last one)
    # are always kept; an inner segment is kept only when the smoothed
    # profile exceeds `th` at both of its end points.
    filt_seq = seq[:filt_bp[0]]
    for i in range(len(filt_bp) - 1):
        if filt_d[i] > th and filt_d[i+1] > th:
            filt_seq += seq[filt_bp[i]:filt_bp[i+1]]
    filt_seq += seq[filt_bp[-1]:]

    s["filt_spec_k9"] = kmer_spectrum(k, filt_seq)
    # NOTE: s["spec_k9"] is intentionally kept. The distance-matrix cell
    # further down reads the "spec_k9" key, so deleting it here makes a
    # top-to-bottom run of the notebook fail with KeyError.
    t_stop = time()

    print(
    FG_GRAY+ "--| " +FG_BLUE+ "DONE"
    +sep+ "K: " +FG_GREEN+ str(k)
    +sep+ "Win: " +FG_GREEN+ str(win_len)
    +sep+ "Step: " +FG_GREEN+ str(step)
    +sep+ "Time: " +FG_GREEN+ "{:.2f}".format(t_stop - t_start)
    +END_COLOR+ "sec"
    )


> [31mAlcaligenes_aquatilis [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m29.68[0msec
> [31mAeromonas_australiensis [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m28.84[0msec
> [31mCampylobacter_jejuni [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m11.15[0msec
> [31mCaulobacter_flavus [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m37.72[0msec
> [31mCyanobacterium_aponinum [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0m Time: [32m29.18[0msec
> [31mBacillus_cereus [34mbacteria[0m
[90m--| [34mDONE [90m|[0m K: [32m9 [90m|[0m Win: [32m200 [90m|[0m Step: [32m20 [90m|[0

In [5]:
def _unique_labels(names, width=10):
    """Return one `width`-character, mutually distinct label per name.

    A name is truncated to `width` characters and right-padded with spaces.
    When that label is already taken, the name is shortened further and a
    numeric suffix ("_1", "_2", ...) is appended until the label is unique.
    """
    taken = set()
    labels = []
    for name in names:
        label = name[:width].ljust(width)
        n = 1
        while label in taken:
            suffix = "_{}".format(n)
            label = (name[:width - len(suffix)] + suffix).ljust(width)
            n += 1
        taken.add(label)
        labels.append(label)
    return labels


def write_distance_matrix(data, key, output, dist_fn=None):
    """Write the full pairwise distance matrix of `data` to a text file.

    The file starts with a line holding the number of records. It is followed
    by one line per record: a 10-character label, a space, then the distance
    to every record (including itself) formatted with 6 decimals, each
    followed by a space. The layout looks like a PHYLIP square distance
    matrix -- confirm against the tool that consumes the file.

    Plain truncation of the names to 10 characters is ambiguous (e.g.
    "Methanococcus_aeolicus" and "Methanococcus_maripaludis" both become
    "Methanococ"), so colliding labels are disambiguated with a numeric
    suffix. Labels of names that were already unique and at least 10
    characters long are unchanged.

    Parameters
    ----------
    data : sequence of dict
        Records providing a "name" entry and the entry selected by `key`.
    key : str
        Record key whose values are compared pairwise.
    output : str
        Path of the text file to (over)write.
    dist_fn : callable, optional
        Function `(a, b) -> float` used to compare two values. Defaults to
        the module-level `dist`.

    Raises
    ------
    KeyError
        If a record lacks "name" or `key`.
    """
    if dist_fn is None:
        dist_fn = dist

    print("Start")
    L = len(data)
    labels = _unique_labels([s["name"] for s in data])

    lines = [str(L)+"\n"]
    for i, s_1 in enumerate(data):
        print("Round", i)
        cells = ["{:.6f}".format(dist_fn(s_1[key], s_2[key])) for s_2 in data]
        # Every value, including the last one, is followed by a space.
        lines.append(labels[i] + " " + "".join(c + " " for c in cells) + "\n")

    with open(output, "w") as f:
        f.writelines(lines)

In [10]:
# Pairwise distances between the 9-mer spectra of the unfiltered sequences.
# Every record must still carry its "spec_k9" entry when this runs.
write_distance_matrix(
    data=data,
    key="spec_k9",
    output="data/k9_distance_mat.txt",
)

Start
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [6]:
# Pairwise distances between the 9-mer spectra of the filtered sequences.
write_distance_matrix(
    data=data,
    key="filt_spec_k9",
    output="data/filt_k9_distance_mat.txt",
)

Start
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24
Round 25
Round 26
Round 27
Round 28
Round 29
Round 30
