In [30]:
import argparse
import os
from itertools import groupby
from hmmlearn import hmm
import gzip as gz
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.graph_objects as go

In [31]:
# --- Notebook-wide configuration -------------------------------------------
# NOTE(review): "STATS" presumably means "states" (value equals len(STATES)) — confirm.
NUM_OF_STATS = 3

# Label character -> integer index used for the state labels.
STATES = {symbol: index for index, symbol in enumerate("HSL")}

# One-letter residue code -> integer index; the position in the string is the index.
# The last six codes (B, Z, X, J, O, U) are the ones listed in BAD_AMINO_ACIDS.
AMINO_ACIDS = {
    residue: index
    for index, residue in enumerate("ARNDCQEGHILKMFPSTWYVBZXJOU")
}

# Residue codes that are dropped when sequences are converted to integers.
BAD_AMINO_ACIDS = "OUBZXJ"

# Emission table read from CSV, transposed and divided by 100.
# NOTE(review): the division suggests the CSV stores percentages — confirm.
EMISSIONS = (
    pd.read_csv("emissions-Table 1.csv", index_col=0)
    .T
    .to_numpy()
    / 100
)

# 3x3 transition matrix; every row sums to 1.
TRANSITIONS = np.array(
    [
        [12 / 13, 1 / 39, 2 / 39],
        [1 / 15, 4 / 5, 2 / 15],
        [0.4, 0.2, 0.4],
    ]
)

# Directory holding the parsed sequence files (local, relative path).
PATH_TO_DATA = "./parsed_sequences" # Local

In [32]:

# Load a previously trained model from disk.
# NOTE: unpickling executes arbitrary code, so only load model files that come
# from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open('models/model_100_expresive_2', 'rb') as model_file:
    model = pickle.load(model_file)


In [31]:
# Display the transition matrix of the model unpickled above.
# NOTE(review): assumes the pickle holds a fitted hmmlearn HMM whose
# `transmat_` rows are "from" states and columns are "to" states — confirm.
model.transmat_

array([[0.92274772, 0.02672897, 0.05052331],
       [0.01873199, 0.86988976, 0.11137825],
       [0.27234751, 0.29519038, 0.43246211]])

In [4]:
def split_train_test(dir_path: str, train_fraction: float = 0.8, seed=None):
    """Randomly split the sequence files in ``dir_path`` into train and test sets.

    Each regular file is expected to hold the amino-acid sequence on its first
    line and the per-residue state labels on its second line. Residues listed
    in ``BAD_AMINO_ACIDS`` are dropped together with their labels; the rest are
    mapped to integers through ``AMINO_ACIDS`` / ``STATES``.

    Parameters
    ----------
    dir_path : str
        Directory containing one file per sequence. Sub-directories are skipped.
    train_fraction : float, default 0.8
        Probability that a file is assigned to the training set. Each file is
        assigned independently, so the realised ratio is only approximate.
    seed : int or None, default None
        Seed for a private random generator. With ``None`` the global NumPy
        random state is used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_samples, train_lengths, test_samples, test_lengths, test_labels)``
        where the ``*_samples`` entries are integer column vectors of shape
        ``(n, 1)``, the ``*_lengths`` entries are the matching ``n`` values, and
        ``test_labels`` holds one list of integer labels per test sequence.

    Raises
    ------
    IndexError
        If a file has fewer than two lines.
    KeyError
        If a kept residue or its label is not a key of ``AMINO_ACIDS`` / ``STATES``.

    Notes
    -----
    If the two lines differ in length, the longer one is silently truncated
    (``zip`` semantics).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train_samples, train_lengths = [], []
    test_samples, test_lengths, test_labels = [], [], []

    # Sorted so that, for a given random state, the assignment does not depend
    # on the arbitrary order in which os.listdir returns directory entries.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue

        goes_to_train = rng.binomial(1, train_fraction, 1)[0]

        with open(file_path, 'r') as f:
            content = f.readlines()

        # readlines() keeps line terminators; strip them so a trailing "\n"
        # is never looked up as if it were a residue or a label.
        seq, labels = content[0].rstrip(), content[1].rstrip()

        numeric_seq, numeric_labels = [], []
        for residue, state in zip(seq, labels):
            if residue not in BAD_AMINO_ACIDS:
                numeric_seq.append(AMINO_ACIDS[residue])
                numeric_labels.append(STATES[state])

        # dtype=int keeps an empty sequence integer-typed instead of float64.
        sample = np.array(numeric_seq, dtype=int).reshape(-1, 1)

        if goes_to_train:
            train_samples.append(sample)
            train_lengths.append(len(numeric_seq))
        else:
            test_samples.append(sample)
            test_lengths.append(len(numeric_seq))
            test_labels.append(numeric_labels)

    return train_samples, train_lengths, test_samples, test_lengths, test_labels

In [7]:
# Randomly assign each file under PATH_TO_DATA to train (p = 0.8) or test.
# The assignment uses NumPy's global random state, so without seeding it the
# split differs on every run.
# NOTE(review): the models evaluated later are loaded from disk, so this fresh
# split may overlap with whatever data they were trained on — confirm.
train_samples, train_lengths, test_samples, test_lengths, test_labels = split_train_test(PATH_TO_DATA)


In [25]:
# Evaluate every saved checkpoint (models after 1, 3, ..., 99 iterations) on the
# test split and record the per-position misclassification rate of each.
errors_rate = []

# Total number of test positions; it is the same for every checkpoint, so it
# is computed once instead of on every pass.
total_test_positions = np.sum(test_lengths)

for n_iter in range(1, 100, 2):
    # NOTE: unpickling executes arbitrary code — only load trusted checkpoints.
    # The `with` block closes each checkpoint file instead of leaking 50 handles.
    with open(f"./models/model_{n_iter}", 'rb') as model_file:
        model = pickle.load(model_file)

    # Predict
    misclassification_error = 0
    for sample, length, label in zip(test_samples, test_lengths, test_labels):
        # `lengths` is documented as a sequence of per-sequence lengths, so the
        # single length is wrapped in a list.
        ll, hidden_states = model.decode(sample, [length])
        # A non-zero difference marks a position whose decoded state differs
        # from the label.
        # NOTE(review): this assumes the model's state indices line up with the
        # STATES encoding of the labels — confirm for these checkpoints.
        misclassification_error += np.count_nonzero(hidden_states - label)

    # Save misclassification error
    misclassification_error_rate = misclassification_error / total_test_positions
    errors_rate.append(misclassification_error_rate)

In [18]:
# Show the misclassification rates collected by the evaluation loop, one entry
# per checkpoint evaluated.
# NOTE(review): the saved output lists 45 values, whereas range(1, 100, 2)
# yields 50 checkpoints — the output looks stale; re-run to refresh.
errors_rate

[0.6119727276621096,
 0.6116378588644833,
 0.6127320231916111,
 0.6138222936955108,
 0.6127320231916111,
 0.6122764458739102,
 0.6077440356362702,
 0.6075999641768263,
 0.6066304021930012,
 0.6093443969830658,
 0.6059606645977486,
 0.6046990658718076,
 0.6065447380819805,
 0.6019422390262327,
 0.5980211590354221,
 0.6014087852439675,
 0.5990374468979858,
 0.5985351437015463,
 0.5941545925698065,
 0.5910590031033771,
 0.59765513965197,
 0.59436485902413,
 0.5911446672143978,
 0.5897935105542078,
 0.5988544372062597,
 0.5921570612537332,
 0.5958211489114817,
 0.5861333167196876,
 0.5891432420750963,
 0.5898674931955439,
 0.5843966715599045,
 0.5852533126701114,
 0.5900466090640417,
 0.5840228645299961,
 0.5870639404712304,
 0.5868069481381685,
 0.5937652102469853,
 0.5865032299263678,
 0.5930409591265375,
 0.587036683708633,
 0.5871340292893383,
 0.5890342150247063,
 0.5977369099397626,
 0.5893924467617019,
 0.5861099537803183]

In [27]:
# Plot the error rate of each checkpoint against its iteration count.
# The x values repeat the range used by the evaluation loop, so the two must
# be kept in sync.
line_fig = px.line(
    x=range(1, 100, 2),
    y=errors_rate,
    title="Error Rate as factor of Number of EM iterations",
).update_layout(
    xaxis_title="Number of Iterations",
    yaxis_title="Error rate",
)
line_fig.show()

In [28]:
# Display the transition matrix of the current `model`.
# The evaluation loop rebinds `model`, so when cells run top to bottom this is
# the last checkpoint loaded there (./models/model_99), not
# models/model_100_expresive_2.
model.transmat_

array([[0.9602483 , 0.01081146, 0.02894024],
       [0.00562936, 0.93769984, 0.0566708 ],
       [0.20600458, 0.35547152, 0.4385239 ]])

In [29]:
# Display the emission probabilities of the same `model` object.
# The saved output has 3 rows of 20 values each.
# NOTE(review): presumably one row per hidden state and one column per kept
# amino-acid index (0-19) — confirm.
model.emissionprob_

array([[0.11604726, 0.06908873, 0.02703431, 0.05909781, 0.01066445,
        0.04184487, 0.07342298, 0.07258803, 0.02082532, 0.04905935,
        0.10656241, 0.04209517, 0.01547627, 0.03307583, 0.04827276,
        0.04856511, 0.04913954, 0.01487528, 0.02608413, 0.07618041],
       [0.0585766 , 0.03580741, 0.06186665, 0.06358485, 0.01465742,
        0.03776759, 0.06143821, 0.07095622, 0.01297228, 0.0680161 ,
        0.0789375 , 0.07455218, 0.01166937, 0.04786835, 0.04098186,
        0.06488121, 0.06171235, 0.01696574, 0.04808389, 0.06870423],
       [0.05558465, 0.02137319, 0.03531132, 0.04092923, 0.0127295 ,
        0.01973113, 0.03774932, 0.10188428, 0.11198178, 0.02816059,
        0.05861047, 0.02482359, 0.0863974 , 0.04513045, 0.07732675,
        0.10551957, 0.064131  , 0.00824489, 0.02167206, 0.04270885]])