In [27]:
from signalalign.hiddenMarkovModel import HmmModel
from signalalign.visualization.plot_em_model_distributions import get_covered_bases
from scipy.stats import norm, invgauss, entropy
from scipy.spatial.distance import euclidean
import numpy as np
import pandas as pd


In [22]:
# Load the HMM model from a trained template model file, with rna=True.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
model_path = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/sa_training/supervised/more_data_ivt_native/template_hmm9.model"
model = HmmModel(model_path, rna=True)

In [29]:
# Read the per-position accuracy table and record the reference FASTA and
# modification-positions file paths that get_covered_bases is called with.
# NOTE(review): all three are absolute, machine-specific paths.
per_pos_acc = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/sa_training/supervised/test_plotting/per_position/per_position_data.csv"
acc_df = pd.read_csv(per_pos_acc)
reference = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/reference/yeast_25S_18S.fa"
positions = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/mod_files/yeast_18S_25S_mods.positions"


In [65]:
def get_kmer_kl_divergence(model, kmer1, kmer2):
    """Kullback–Leibler divergence, in bits, between the event-mean Gaussians of two kmers.

    Each kmer's Gaussian is evaluated on ``model.linspace`` and the two sampled
    curves are compared with ``scipy.stats.entropy``, which normalises each
    array to sum to 1 before computing the relative entropy.

    The divergence is asymmetric: the value returned is KL(kmer2 || kmer1),
    i.e. kmer2's distribution measured against kmer1's.

    :param model: object exposing ``get_event_mean_gaussian_parameters(kmer)``
                  (unpacked here as a ``(mean, sd)`` pair) and a ``linspace``
                  attribute holding the points to evaluate the densities on
    :param kmer1: kmer whose distribution is the reference (``qk``)
    :param kmer2: kmer whose distribution is being measured (``pk``)
    :return: relative entropy in base 2
    """
    reference_mean, reference_sd = model.get_event_mean_gaussian_parameters(kmer1)
    measured_mean, measured_sd = model.get_event_mean_gaussian_parameters(kmer2)

    reference_density = norm.pdf(model.linspace, reference_mean, reference_sd)
    measured_density = norm.pdf(model.linspace, measured_mean, measured_sd)

    # The first argument is pk and the second is qk: kmer2 against kmer1.
    return entropy(measured_density, reference_density, base=2)

def get_kmer_mean_delta(model, kmer1, kmer2):
    """Get the absolute difference between the event-mean Gaussian means of two kmers.

    :param model: object exposing ``get_event_mean_gaussian_parameters(kmer)``,
                  unpacked here as a ``(mean, sd)`` pair
    :param kmer1: first kmer
    :param kmer2: second kmer
    :return: ``|mean(kmer1) - mean(kmer2)|``; symmetric in its two kmer arguments
    """
    # Only the means are compared; the standard deviations are not needed.
    normal_mean1, _ = model.get_event_mean_gaussian_parameters(kmer1)
    normal_mean2, _ = model.get_event_mean_gaussian_parameters(kmer2)

    return np.abs(normal_mean1 - normal_mean2)


In [66]:
# Spot-check both metrics on a single pair of kmers.
# NOTE(review): only the last expression in a cell is displayed, so the KL
# divergence computed on the first line is discarded and the value shown
# is the mean delta.
get_kmer_kl_divergence(model, "AAAAA", "CAGGA")
get_kmer_mean_delta(model, "AAAAA", "CAGGA")

7.016145999999992

In [32]:
# Collect the covered-base entries for 5-mers from the reference and positions
# files, with rna=True.
# NOTE(review): later code indexes each entry as [0] = contig,
# [1] = sequence of reference indices, [3] = kmer sets; confirm this layout
# against get_covered_bases.
covered_bases = get_covered_bases(reference, positions, kmer_length=5, rna=True)

In [46]:
# Look up the accuracy row for contig RDN18-1, reference index 105, "+" strand.
# NOTE(review): acc_df.head() is not the last expression in the cell, so its
# result is not displayed; only the filtered frame below is shown.
acc_df.head()
acc_df[(acc_df['contig'] == "RDN18-1") 
       & (acc_df['reference_index'] == 105) 
       & (acc_df["strand"] == "+")]

Unnamed: 0,contig,reference_index,strand,variants,aucroc,avg_precision,brier_score
0,RDN18-1,105,+,Tl,0.9371,0.9382,0.1012


In [69]:
# For every covered position that has exactly one reference index and a
# matching row in acc_df, pair that row's AUROC with the largest event-mean
# difference found among the position's kmer sets.
strand = "+"  # every lookup uses the "+" strand
delta_values = []
acc_values = []
for covered_base in covered_bases:
    contig = covered_base[0]
    ref_index = covered_base[1]
    kmer_sets = covered_base[3]
    # Only entries with exactly one reference index are looked up.
    if len(ref_index) != 1:
        continue
    ref_index = ref_index[0]
    acc_data = acc_df[(acc_df['contig'] == contig)
                      & (acc_df['reference_index'] == ref_index)
                      & (acc_df["strand"] == strand)]
    # Skip positions with no accuracy record.
    if len(acc_data) == 0:
        continue
    kl_divs = []
    mean_deltas = []
    # The inner loop has its own variable name so it no longer rebinds the
    # outer loop variable.
    for kmer_set in kmer_sets:
        # Compare the first two kmers of the set in sorted order.
        sorted_kmers = sorted(kmer_set)
        # kl_divs is collected alongside mean_deltas but is not part of the
        # summary printed below.
        kl_divs.append(get_kmer_kl_divergence(model, sorted_kmers[0], sorted_kmers[1]))
        mean_deltas.append(get_kmer_mean_delta(model, sorted_kmers[0], sorted_kmers[1]))
    acc_values.append(acc_data["aucroc"].iloc[0])
    delta_values.append(max(mean_deltas))

print(acc_values, delta_values)


[0.9371, 0.767, 0.9703, 0.9484, 0.9198, 0.8482, 0.7875, 0.9647, 0.8538, 0.8679, 0.7917, 0.8404, 0.8761, 0.9572] [13.42962459139487, 6.222535060215634, 6.300787006998036, 8.921253332159154, 5.3267059730218165, 39.61947151934055, 6.39162060378429, 13.230885654937367, 3.13167961608265, 11.985034670920896, 14.679042506039423, 5.675483475063928, 6.392046544227966, 27.724395762317712]
