In [1]:
import math
import numpy as np

## Define states and mapping

In [3]:
states = ['E', '5', 'I']
state_to_idx = {s: i for i, s in enumerate(states)}

## Transition Probabilities in Hidden Markov Models (HMM)

### 🔹 Definition
Transition probabilities describe the likelihood of moving from one hidden state to another in an HMM.

They are written as:

$$
P(s_t = j \mid s_{t-1} = i)
$$

Where:

- \( s_t \): hidden state at time \( t \)
- \( s_{t-1} \): hidden state at time \( t-1 \)

---

### 🔹 Key Points

- They model how the system changes over time by capturing dependencies between consecutive hidden states.
- For any given current state \( i \), the total probability of transitioning to all possible next states \( j \) must sum to 1:

$$
\sum_{j} P(s_t = j \mid s_{t-1} = i) = 1
$$

---



## Transition matrix T[k][j] = P(from k to j)

In [6]:
T = {
    'E': {'E': 0.9, '5': 0.1},
    '5': {'I': 1.0},
    'I': {'I': 0.9, 'end': 0.1},
}
print(T)

{'E': {'E': 0.9, '5': 0.1}, '5': {'I': 1.0}, 'I': {'I': 0.9, 'end': 0.1}}


## Emission Probabilities in Hidden Markov Models (HMM)

### 🔹 Definition
Emission probabilities represent the likelihood of observing a specific symbol (e.g., a nucleotide like A, C, G, or T) given the current hidden state.

They are written as:

$$
P(o_t = k \mid s_t = i)
$$

Where:

- \( o_t \): observed symbol at time \( t \)
- \( s_t \): hidden state at time \( t \)

---

### 🔹 Key Points

- Emission probabilities link the **hidden states** to the **observable data**.
- For any given state \( i \), the sum of emission probabilities over all possible symbols \( k \) must be 1:

$$
\sum_k P(o_t = k \mid s_t = i) = 1
$$

---



## Emission prob matrix

In [9]:
E = {
    'E': {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25},
    '5': {'A': 0.05, 'C': 0.0, 'G': 0.95, 'T': 0.0},
    'I': {'A': 0.4, 'C': 0.1, 'G': 0.1, 'T': 0.4}
}
print(E)

{'E': {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}, '5': {'A': 0.05, 'C': 0.0, 'G': 0.95, 'T': 0.0}, 'I': {'A': 0.4, 'C': 0.1, 'G': 0.1, 'T': 0.4}}


## Calculating log probability of a given state path for the sequence.

### start state prob

In [12]:
start = {'E': 1.0, '5': 0.0, 'I': 0.0}

In [13]:
def log(x):
    return -math.inf if x == 0 else math.log(x)

def path_log_prob(path: str, seq: str) -> float:
    if len(path) != len(seq):
        raise ValueError("Path and sequence must be of the same length")

    prob = 0.0
    for i in range(len(seq)):
        curr_state = path[i]
        curr_symbol = seq[i]
        if i == 0:
            prob += log(start[curr_state])
        else:
            prev_state = path[i-1]
            prob += log(T[prev_state][curr_state])
        prob += log(E[curr_state][curr_symbol])
    
    if path[-1] == 'I':
        prob += log(T['I']['end'])

    return prob
# Test path and DNA sequence
path = "EEEEEEEEEEEEEEEEEE5IIIIIII"
sequence = "CTTCATGTGAAAGCAGACGTAAGTCA"

print("Log probability of given path:", round(path_log_prob(path, sequence), 2))


Log probability of given path: -41.22


# Viterbi Algorithm-Implementation

In [15]:
viterbi = [{}]
path = {}

for s in states:
    viterbi[0][s] = log(start[s]) + log(E[s].get(sequence[0], 0))
    path[s] = [s]

# Viterbi recursion
for i in range(1, len(sequence)):
    viterbi.append({})
    newpath = {}

    for curr_state in states:
        max_prob = -math.inf
        best_prev_state = None

        for prev_state in states:
            trans_p = T.get(prev_state, {}).get(curr_state, 0)

            if trans_p > 0:
                emit_p = E[curr_state].get(sequence[i], 0)
                if emit_p > 0:
                    prob = viterbi[i-1][prev_state] + log(trans_p) + log(emit_p)

                    if prob > max_prob:
                        max_prob = prob
                        best_prev_state = prev_state

        viterbi[i][curr_state] = max_prob

        if best_prev_state is not None:
            newpath[curr_state] = path[best_prev_state] + [curr_state]
        else:
            newpath[curr_state] = [curr_state]

    path = newpath

n = len(sequence) - 1
(prob, state) = max((viterbi[n][s], s) for s in states)

logp = round(prob, 2)
best_path = ''.join(path[state])

print("Viterbi best log probability:", logp)
print("Most likely path:", best_path)

Viterbi best log probability: -38.68
Most likely path: EEEEEEEEEEEEEEEEEEEEEEEEEE
