In [25]:
import argparse
import os
from itertools import groupby
from hmmlearn import hmm
import gzip as gz
import pandas as pd
import numpy as np
import pickle
import plotly.express as px

In [7]:
# Number of hidden states used by the model.
NUM_OF_STATS = 3

# Label character -> integer state index (position in the string is the index).
STATES = {state: index for index, state in enumerate("HSL")}

# One-letter residue code -> integer symbol index (position in the string is the index).
# The last six codes are the ones listed in BAD_AMINO_ACIDS.
AMINO_ACIDS = {
    residue: index
    for index, residue in enumerate("ARNDCQEGHILKMFPSTWYV" "BZXJOU")
}

# Residue codes that are filtered out of every sequence before encoding.
BAD_AMINO_ACIDS = "OUBZXJ"
# Emission table: the CSV is read with its first column as the index, transposed,
# and scaled by 1/100 (presumably the file stores percentages — TODO confirm).
EMISSIONS = pd.read_csv("emissions-Table 1.csv", index_col=0).transpose().to_numpy() / 100

# 3x3 transition matrix; every row sums to 1.
# NOTE(review): rows/columns presumably follow the STATES order (H, S, L) — confirm.
TRANSITIONS = np.array(
    [
        [12 / 13, 1 / 39, 2 / 39],
        [1 / 15, 4 / 5, 2 / 15],
        [0.4, 0.2, 0.4],
    ]
)

# Directory holding the parsed sequence files (local, relative to the notebook).
PATH_TO_DATA = "./parsed_sequences"

In [30]:

# Load the previously trained model from disk. The file is opened in a `with`
# block so the handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load model files that come from a trusted source.
with open('models/model_69', 'rb') as model_file:
    model = pickle.load(model_file)


In [31]:
# Display the loaded model's `transmat_` attribute (presumably the learned
# state-transition matrix of an hmmlearn model — confirm the pickled type).
model.transmat_

array([[0.92274772, 0.02672897, 0.05052331],
       [0.01873199, 0.86988976, 0.11137825],
       [0.27234751, 0.29519038, 0.43246211]])

In [6]:
def split_train_test(dir_path: str, train_fraction: float = 0.8, seed: "int | None" = None):
    """Randomly split the sequence files under ``dir_path`` into train and test sets.

    Each regular file is read as two lines: the amino-acid sequence on the
    first line and the per-residue labels on the second. Residues listed in
    ``BAD_AMINO_ACIDS`` are dropped together with their labels; the remaining
    characters are encoded through ``AMINO_ACIDS`` and ``STATES``.

    Args:
        dir_path: Directory containing one file per sequence.
        train_fraction: Probability that a file is assigned to the training set.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the global ``np.random`` state is used.

    Returns:
        ``(train_samples, train_lengths, test_samples, test_lengths, test_labels)``.
        Samples are ``(n, 1)`` integer arrays, lengths are their row counts, and
        ``test_labels`` holds the encoded label list of every test sequence.

    Raises:
        IndexError: If a file contains fewer than two lines.
        KeyError: If a residue or label character is missing from the lookup tables.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    train_samples, train_lengths = [], []
    test_samples, test_lengths, test_labels = [], [], []

    # os.listdir returns entries in arbitrary order; sorting makes a seeded
    # split assign the same files to the same side on every run.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints) instead of
        # crashing when trying to open them.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r') as f:
            content = f.readlines()
        # Strip line terminators so they are never looked up as a residue or label.
        seq = content[0].rstrip("\r\n")
        labels = content[1].rstrip("\r\n")

        numeric_seq, numeric_labels = [], []
        for s, l in zip(seq, labels):
            if s not in BAD_AMINO_ACIDS:
                numeric_seq.append(AMINO_ACIDS[s])
                numeric_labels.append(STATES[l])

        # Nothing usable is left once the bad residues are removed: skip the file
        # rather than emit a zero-row sample.
        if not numeric_seq:
            continue

        sample = np.array(numeric_seq).reshape(-1, 1)
        if rng.binomial(1, train_fraction, 1)[0]:
            train_samples.append(sample)
            train_lengths.append(len(numeric_seq))
        else:
            test_samples.append(sample)
            test_lengths.append(len(numeric_seq))
            test_labels.append(numeric_labels)

    return train_samples, train_lengths, test_samples, test_lengths, test_labels

In [9]:
# Build the train/test split from the sequence files under PATH_TO_DATA.
# NOTE(review): this call does not fix a random seed, so the test set (and every
# metric computed from it below) can change between runs — confirm this is intended.
train_samples, train_lengths, test_samples, test_lengths, test_labels = split_train_test(PATH_TO_DATA)


In [32]:
errors_rate = []

# Predict
misclassification_error = 0
# Per true label type: number of misclassified residues (turned into a rate below)
# and total number of residues seen in the test set.
errors_rate_per_type = {"H": 0, "S": 0, "L": 0}
num_of_acids_per_type = {"H": 0, "S": 0, "L": 0}
for sample, length, label in zip(test_samples, test_lengths, test_labels):
    # Decode the hidden-state sequence the model assigns to this test sequence.
    # NOTE(review): the comparison below assumes the model's state indices line up
    # with the STATES encoding (H=0, S=1, L=2) — confirm for the loaded model.
    ll, hidden_states = model.decode(sample, length)
    true_states = np.asarray(label)  # built once per sequence, not once per type
    for secondary_type, state_index in STATES.items():
        # Positions whose true label is this type; an error is any such position
        # where the decoded state differs from the true one.
        type_mask = true_states == state_index
        errors_rate_per_type[secondary_type] += np.count_nonzero(hidden_states[type_mask] != state_index)
        num_of_acids_per_type[secondary_type] += np.count_nonzero(type_mask)
# Save misclassification error
for secondary_type in STATES:
    total = num_of_acids_per_type[secondary_type]
    # The split is random, so a type may be absent from the test set; report NaN
    # for it instead of raising ZeroDivisionError.
    errors_rate_per_type[secondary_type] = (
        errors_rate_per_type[secondary_type] / total if total else float("nan")
    )
print(errors_rate_per_type)

{'H': 0.1185662137497465, 'S': 0.7029346147245581, 'L': 0.9974812456385206}


In [33]:
# Bar chart of the per-type error rates held in errors_rate_per_type.
# The dict views are materialised as lists so plotly receives plain sequences.
bar_fig = px.bar(x=list(errors_rate_per_type.keys()), y=list(errors_rate_per_type.values()))
# Give the figure a title and axis labels so it can be read on its own.
bar_fig.update_layout(
    title="Misclassification rate per secondary-structure type",
    xaxis_title="True secondary-structure type",
    yaxis_title="Fraction of residues misclassified",
)
bar_fig.show()