# betteridiot Office Hours
## Today's Topic: Hidden Markov Models (HMM)

![](https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/HMMGraph.svg/2000px-HMMGraph.svg.png)
Taken from https://en.wikipedia.org/wiki/Hidden_Markov_model

## A Quick Illustration

___
# Examples from the [Class GitHub](https://github.com/dcmb-courses/bioinf529-winter2019/blob/master/classes/class_7/class7.ipynb)

## The Setup

In [129]:
# Symbols that may appear in an observed sequence
alphabet = "ACGT"

In [130]:
# One character per hidden state; these are never observed directly,
# only inferred from the emitted symbols
hidden_states = "IG"

In [131]:
# Probability of each hidden state being the state at the first position
initial_probabilities = dict(I=0.1, G=0.9)

In [132]:
# Probability of moving between hidden states: outer key is the state we
# are leaving, inner key is the state we are entering
transition_probabilities = dict(
    I=dict(I=0.6, G=0.4),
    G=dict(I=0.1, G=0.9),
)

In [133]:
# Probability of emitting each symbol while in a given hidden state:
# outer key is the hidden state, inner key is the emitted symbol
emission_probabilities = dict(
    I=dict(A=0.1, C=0.4, G=0.4, T=0.1),
    G=dict(A=0.4, C=0.1, G=0.1, T=0.4),
)

## Now the data structure to handle all of this

In [134]:
import json
import numpy as np

In [167]:
class HMM:
    """Hidden Markov model holding log10 probabilities, with Viterbi decoding

    Probabilities are supplied as plain floats and stored log10-transformed so
    that path scores can be accumulated by addition. The untransformed inputs
    are kept privately and are what ``str()`` displays.

    Attributes:
        alphabet (set): The alphabet of emissions
        hidden_states (set): Hidden states in the model
        initial_probs (dict): log10 initial state probabilities, or None when
            none were supplied
        trans_probs (dict of dict): log10 transition probabilities, indexed
            ``[from_state][to_state]``, or None when none were supplied
        emit_probs (dict of dict): log10 emission probabilities, indexed
            ``[state][symbol]``, or None when none were supplied
    """

    # Names reported by dir() on the class and its instances (see __dir__)
    __all__ = ['alphabet', 'hidden_states', 'trans_probs', 'initial_probs', 'emit_probs', 'viterbi']

    def __init__(self, alphabet, hidden_states, β = None, trans_probs = None, emit_probs = None):
        """Instantiates the object

        Args:
            alphabet (str): The alphabet of emissions
            hidden_states (iterable of str): Hidden states in the model. A plain
                string is treated as one single-character state per character.
            β (dict of float): A dictionary of initial state probabilities (default: None)
            trans_probs (dict of dict): A dictionary of transition probabilities (default: None)
            emit_probs (dict of dict): A dictionary of emission probabilities (default: None)
        """
        self.alphabet = set(alphabet)
        self.hidden_states = set(hidden_states)
        # Keep the raw inputs for display; decoding uses the log10 versions.
        self._β = β
        # The parameters default to None, so None must not be dereferenced.
        self.initial_probs = None if β is None else {key: np.log10(val) for key, val in β.items()}
        self._t = trans_probs
        self.trans_probs = self._transform_dict(trans_probs)
        self._e = emit_probs
        self.emit_probs = self._transform_dict(emit_probs)

    @staticmethod
    def _transform_dict(nested_dict):
        """Transforms a dict of dict of floating point probabilities to log10 equivalent

        Args:
            nested_dict (dict of dict of floats): dictionary of probabilities wrt hidden state,
                or None

        Returns:
            out_dict (dict of dict of floats): log10 transformed probabilities,
                or None when ``nested_dict`` is None
        """
        if nested_dict is None:
            return None
        return {
            key_outer: {key_inner: np.log10(val) for key_inner, val in sub_dict.items()}
            for key_outer, sub_dict in nested_dict.items()
        }

    def __str__(self):
        """Returns a multi-line summary showing the untransformed probabilities"""
        out_text = [f'Alphabet: {self.alphabet}',
        f'Hidden States: {self.hidden_states}',
        f'Initial Probabilities: {json.dumps(self._β, sort_keys = True, indent = 4)}',
        f'Transition Probabilities: {json.dumps(self._t, sort_keys = True, indent = 4)}',
        f'Emission Probabilities: {json.dumps(self._e, sort_keys = True, indent = 4)}']
        return '\n'.join(out_text)

    @classmethod
    def __dir__(cls):
        """Restricts dir() output to the names listed in ``__all__``"""
        return cls.__all__

    def viterbi(self, sequence):
        """ The Viterbi algorithm for decoding a string using a HMM

        Args:
            sequence (str): Sequence of valid emissions from the HMM

        Returns:
            result (str): optimal path through HMM given the model parameters
                           using the Viterbi algorithm. State names are
                           concatenated in sequence order, one per symbol; an
                           empty sequence yields an empty string.

        Raises:
            ValueError: if the model was built without initial, transition, or
                emission probabilities
            KeyError: if a symbol or state is missing from the probability tables
        """
        if self.initial_probs is None or self.trans_probs is None or self.emit_probs is None:
            raise ValueError('viterbi requires initial, transition, and emission probabilities')
        if not sequence:
            return ''

        # Sets of str iterate in a hash-dependent order; sorting makes the
        # winner among exactly tied scores reproducible between runs.
        states = sorted(self.hidden_states)

        first_base = sequence[0]
        previous = {state: self.initial_probs[state] + self.emit_probs[state][first_base] for state in states}

        # One back-pointer dict per position after the first.
        traceback = []
        for base in sequence[1:]:
            previous, update_tb = self.update_probs(base, previous)
            traceback.append(update_tb)

        last_state = max(previous, key = previous.get)

        # Reverse the list of state names rather than the joined string, so
        # multi-character state names are not reversed character by character.
        path = [last_state]
        path.extend(self._trace_states(traceback, last_state))
        path.reverse()
        return ''.join(path)

    @staticmethod
    def _trace_states(traceback, last_origin):
        """Follows back-pointers from the final state to the first position

        Args:
            traceback (list of dict): per-position mapping of state -> best predecessor
            last_origin (str): state chosen at the final position

        Returns:
            list of str: predecessor states, ordered from the last position backwards
        """
        states = []
        for pos in reversed(traceback):
            last_origin = pos[last_origin]
            states.append(last_origin)
        return states

    @staticmethod
    def get_traceback(traceback, last_origin):
        """Returns the predecessor states, last position first, concatenated as one string

        Args:
            traceback (list of dict): per-position mapping of state -> best predecessor
            last_origin (str): state chosen at the final position

        Returns:
            str: concatenated predecessor state names in reverse sequence order
        """
        return ''.join(HMM._trace_states(traceback, last_origin))

    def update_probs(self, base, previous):
        """Advances the Viterbi recursion by one emitted symbol

        Args:
            base (str): the symbol emitted at the current position
            previous (dict): best log10 path score ending in each state at the
                previous position

        Returns:
            curr_prob (dict): best log10 path score ending in each state at this position
            tb_pos (dict): for each state, the predecessor state giving that score
        """
        curr_prob = {}
        tb_pos = {}

        # Sorted for the same reproducible tie-breaking as in viterbi().
        states = sorted(self.hidden_states)
        for future in states:
            check = {current: previous[current] + self.trans_probs[current][future] for current in states}
            origin = max(check, key = check.get)
            curr_prob[future] = self.emit_probs[future][base] + check[origin]
            tb_pos[future] = origin

        return curr_prob, tb_pos

# Let's see it at work

In [168]:
# Build the two-state model from the tables defined in the setup cells
model = HMM(
    alphabet,
    hidden_states,
    β=initial_probabilities,
    trans_probs=transition_probabilities,
    emit_probs=emission_probabilities,
)

In [169]:
# Decode a longer sequence and show the most likely hidden-state path
seq = 'ACGCGATCATACTATATTAGCTAAATAGATACGCGCGCGCGCGCGATATATATATATAGCTAATGATCGATTACCCCCCCCCCCAATTA'
print(model.viterbi(sequence=seq))

GIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIGGGGG


In [170]:
# Path printed by the previous cell, echoed as a literal for comparison
print("GIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIGGGGG")

GIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIGGGGG


In [171]:
# The exact sequence used in the slides
sequence = 'ACGCGATC'
print(model.viterbi(sequence=sequence))

GIIIIGGG


In [172]:
# A longer sequence: print it, then the decoded path, so the two lines
# can be compared position by position
sequence = 'ACGCGATCATACTATATTAGCTAAATAGATACGCGCGCGCGCGCGATATATATATATAGCTAATGATCGATTACCCCCCCCCCCAATTA'
print(sequence)
print(model.viterbi(sequence=sequence))

ACGCGATCATACTATATTAGCTAAATAGATACGCGCGCGCGCGCGATATATATATATAGCTAATGATCGATTACCCCCCCCCCCAATTA
GIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIIIIGGGGGGGGGGGGGGGGGGGGGGGGGGGGIIIIIIIIIIIGGGGG


___
## Extended example

In [173]:
# Eight hidden states: each name is an emitted base followed by a
# one-letter group suffix ('i' or 'g')
hidden_states = ('Ai', 'Ci', 'Gi', 'Ti', 'Ag', 'Cg', 'Gg', 'Tg')
alphabet = 'ACGT'

# Every state is equally likely at the first position
initial_probabilities = {state: 0.125 for state in hidden_states}

# Outer key: state being left; inner key: state being entered
transition_probabilities = dict(
    Ai=dict(Ai=0.2, Ci=0.36, Gi=0.2, Ti=0.2, Ag=0.01, Cg=0.01, Gg=0.01, Tg=0.01),
    Ci=dict(Ai=0.1, Ci=0.1, Gi=0.66, Ti=0.1, Ag=0.01, Cg=0.01, Gg=0.01, Tg=0.01),
    Gi=dict(Ai=0.1, Ci=0.39, Gi=0.1, Ti=0.1, Ag=0.1, Cg=0.01, Gg=0.1, Tg=0.1),
    Ti=dict(Ai=0.2, Ci=0.36, Gi=0.2, Ti=0.2, Ag=0.01, Cg=0.01, Gg=0.01, Tg=0.01),
    Ag=dict(Ai=0.01, Ci=0.1, Gi=0.01, Ti=0.01, Ag=0.2175, Cg=0.2175, Gg=0.2175, Tg=0.2175),
    Cg=dict(Ai=0.01, Ci=0.1, Gi=0.01, Ti=0.01, Ag=0.2175, Cg=0.2175, Gg=0.2175, Tg=0.2175),
    Gg=dict(Ai=0.01, Ci=0.1, Gi=0.01, Ti=0.01, Ag=0.2175, Cg=0.2175, Gg=0.2175, Tg=0.2175),
    Tg=dict(Ai=0.01, Ci=0.1, Gi=0.01, Ti=0.01, Ag=0.2175, Cg=0.2175, Gg=0.2175, Tg=0.2175),
)

# Each state emits the base in its own name with weight 1 and every other
# base with weight 0.001 (so each row sums to 1.003, not exactly 1)
emission_probabilities = {
    state: {base: (1 if base == state[0] else 0.001) for base in alphabet}
    for state in hidden_states
}

model = HMM(
    alphabet,
    hidden_states,
    β=initial_probabilities,
    trans_probs=transition_probabilities,
    emit_probs=emission_probabilities,
)

In [174]:
sequence = 'ACGCGATCATACTATATTAGCTAAATAGATACGCGCGCGCGCGCGATATATATATATAGCTAATGATCGATTACCCCCCCCCCCAATTA'

print(sequence)

# Reduce each decoded state name to its group suffix: drop the base letters
# A/C/G/T and upper-case 'i' (a 'g' suffix is left as-is), in a single pass.
result = model.viterbi(sequence)
result = result.translate(str.maketrans('i', 'I', 'ACGT'))

print(result)

ACGCGATCATACTATATTAGCTAAATAGATACGCGCGCGCGCGCGATATATATATATAGCTAATGATCGATTACCCCCCCCCCCAATTA
IIIIIggggggggggggggggggggggggggIIIIIIIIIIIIIIgggggggggggggggggggggggggggggggggggggggggggg
