In [31]:
import torch
import numpy as np

# Data Engineering

Suppose we are given an amino acid sequence $s$ and a class label sequence $c$. Let $n$ be the length of both sequences.
For simplicity, we assume there are 20 amino acids. We define a bijection from the amino acids to the integers $\{1, \dots, 20\}$ and map each amino acid sequence $s$ to a vector $s' \in \mathbb{R}^n$ whose $i^{th}$ entry is the integer assigned to the $i^{th}$ amino acid of $s$.

In a similar fashion, we encode secondary structures. We assume there are two types of secondary structure, alpha helix and beta sheet, plus optionally a third class for no secondary structure. We map each class label sequence $c$ to a vector $c' \in \mathbb{R}^n$ whose $i^{th}$ entry is the integer assigned to the $i^{th}$ class label of $c$.

In [32]:
def aa_to_vec(s):
    """Encode an amino acid string as a list of integer codes.

    The 20 one-letter residue codes are ranked alphabetically and numbered
    from 1 to 20; every character of ``s`` is replaced by its rank.

    Args:
        s: Iterable of one-letter amino acid codes (typically a ``str``).

    Returns:
        A list of ints in ``1..20``, one per character of ``s``.

    Raises:
        KeyError: If ``s`` contains a character outside the 20-letter alphabet.
    """
    alphabet = sorted("AGILPVFWYDERHKSTCMNQ")
    code_of = {residue: rank for rank, residue in enumerate(alphabet, start=1)}
    return [code_of[residue] for residue in s]
        

def ss_to_vec(c):
    """Split a class label sequence into a list of its individual labels.

    Placeholder implementation: the labels are returned unchanged, one list
    element per position; no numerical encoding is applied yet.

    Args:
        c: Iterable of class labels (typically a ``str``).

    Returns:
        A list with one element per item of ``c``.
    """
    #TODO
    return [label for label in c]

# Short names for the two sequence encoders.
def aa(s):
    """Shorthand for ``aa_to_vec(s)``."""
    return aa_to_vec(s)


def ss(c):
    """Shorthand for ``ss_to_vec(c)``."""
    return ss_to_vec(c)

# Model Parameterization

For notation, let $\theta$ := the hidden Markov model parameters (state transition probabilities and symbol emission probabilities), and let $\phi$ := the class emission parameters.

Our objective is to maximize the conditional probability of the class label sequence $c$ given the amino acid sequence $s$, the hidden Markov model parameters $\theta$, and the class emission parameters $\phi$.

Choosing the number of hidden states usually requires some expert insight. Here, we adopt the hidden Markov model setup introduced in assignment two, which has two hidden states A and B. We therefore have 4 state transition probabilities $(t_{aa}, t_{ab}, t_{ba}, t_{bb})$, 20 symbol emission probabilities for state A $(e_{a1}, \dots, e_{a20})$, and 20 symbol emission probabilities for state B $(e_{b1}, \dots, e_{b20})$, for a total of 44 parameters in $\theta$.


In this setup, we also have 3 class emission probabilities for state A $(\phi_{a1}, \phi_{a2} , \phi_{a3})$, 3 class emission probabilities for state B $(\phi_{b1}, \phi_{b2} , \phi_{b3})$. 

For simplicity, we initialize these variables to a uniform distribution.

In [38]:
class HiddenMarkovModelParameters:
    """Container for the parameters ``theta`` of a two-state, 20-symbol HMM.

    ``theta`` is a flat vector of 44 numbers laid out as
    ``[t_aa, t_ab, t_ba, t_bb, e_a1 .. e_a20, e_b1 .. e_b20]``.

    Attributes:
        trans_probs: ``(2, 2)`` float array filled row-major from the first
            four entries of ``theta``.
        emiss_probs: ``(2, 20)`` float array; row 0 holds the state-A
            emissions and row 1 the state-B emissions.
    """

    # Model dimensions; they determine the expected length of ``theta``.
    NUM_STATES = 2
    NUM_SYMBOLS = 20

    def __init__(self, theta):
        """Initialise the parameters from a flat ``theta`` vector.

        Raises:
            ValueError: If ``theta`` is not a 1-D sequence of 44 numbers.
        """
        self.update_theta(theta)

    def update_theta(self, new_theta):
        """Replace every parameter with the values in ``new_theta``.

        Args:
            new_theta: 1-D sequence of 44 numbers in the layout described in
                the class docstring.

        Raises:
            ValueError: If ``new_theta`` is not 1-D or does not contain
                exactly 44 entries. Without this check a short vector could
                be silently broadcast across a whole emission row.
        """
        flat = np.asarray(new_theta, dtype=float)
        num_trans = self.NUM_STATES * self.NUM_STATES
        expected = num_trans + self.NUM_STATES * self.NUM_SYMBOLS
        if flat.ndim != 1 or flat.size != expected:
            raise ValueError(
                "theta must be a 1-D sequence of {} values, got shape {}.".format(
                    expected, flat.shape
                )
            )

        # Copy so later edits to the caller's array cannot alter the model.
        self.trans_probs = flat[:num_trans].reshape(
            self.NUM_STATES, self.NUM_STATES
        ).copy()
        self.emiss_probs = flat[num_trans:].reshape(
            self.NUM_STATES, self.NUM_SYMBOLS
        ).copy()

    def theta(self):
        """Return all parameters as one flat array in the ``theta`` layout."""
        return np.concatenate((self.trans_probs.ravel(), self.emiss_probs.ravel()))
        

# Gradient Calculations

We can convert our objective function into a minimization problem by changing our objective into minimizing the negative log likelihood.

$$\hat{\theta} = \arg \max_{\theta} P(c | s, \theta, \phi) = \arg \max_{\theta} \frac{P(c, s | \theta, \phi)}{P(s | \theta)}$$
$$\hat{\theta} = \arg \min_{\theta} L(\theta), \qquad L(\theta) := -\log\left(\frac{P(c, s | \theta, \phi)}{P(s | \theta)}\right)$$

We can express the gradient of this objective in terms of quantities computed by the forward-backward algorithm.

$$\frac{dL}{d\theta_k} = -\frac {m_k(c, s) - n_k(s)}{\theta_k}$$

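$$n_k(s) := \mathbb{E}_{\pi \sim P(\pi \mid s, \theta)}\left[\#_k(\pi, s)\right]$$

$$m_k(c, s) := \mathbb{E}_{\pi \sim P(\pi \mid c, s, \theta, \phi)}\left[\#_k(\pi, s)\right]$$

Here $\pi$ ranges over hidden state paths and $\#_k(\pi, s)$ is the number of times parameter $\theta_k$ is used when $\pi$ generates $s$. Thus $n_k(s)$ is the expected usage count of $\theta_k$ over all paths given $s$, and $m_k(c, s)$ is the same expectation restricted to paths that are also consistent with the class labels $c$. Both are obtained from the forward and backward variables.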

In [39]:
class HiddenMarkovModel(HiddenMarkovModelParameters):
    """HMM that converts sequences into expected parameter-usage counts.

    Conventions used throughout this class:

    * The initial state distribution is uniform (``1 / num_states``).
    * Observations are 0-based column indices into ``emiss_probs``. Note that
      ``aa_to_vec`` produces 1-based codes, so subtract 1 before passing them.
    * A class label is compatible with a hidden state exactly when
      ``label == state_index``; no separate class emission table is used.
    * Count vectors use the same layout as ``theta``: row-major transitions
      followed by row-major emissions.
    """

    def __init__(self, theta):
        super().__init__(theta)

    def _scaled_pass(self, symbols, allowed):
        """Run the scaled forward and backward recursions.

        Each forward column is renormalised to sum to one and the normaliser
        is stored, so long sequences do not underflow.

        Args:
            symbols: 1-D integer array of observation indices.
            allowed: ``(num_states, T)`` array of 0/1 weights; a zero removes
                that state at that position from every path.

        Returns:
            ``(alpha_hat, beta_hat, scales, emit)`` where ``alpha_hat`` and
            ``beta_hat`` are the scaled forward/backward variables,
            ``scales[t]`` is the normaliser of column ``t`` and ``emit`` is
            the masked emission probability of each observed symbol.

        Raises:
            ValueError: If a normaliser is zero or non-finite, i.e. no
                permitted path has positive probability.
        """
        num_states, num_observations = allowed.shape
        emit = self.emiss_probs[:, symbols] * allowed

        alpha_hat = np.zeros((num_states, num_observations))
        beta_hat = np.ones((num_states, num_observations))
        scales = np.zeros(num_observations)

        for t in range(num_observations):
            if t == 0:
                unnormalised = emit[:, 0] / num_states
            else:
                unnormalised = np.dot(alpha_hat[:, t - 1], self.trans_probs) * emit[:, t]
            scales[t] = unnormalised.sum()
            if scales[t] == 0 or not np.isfinite(scales[t]):
                raise ValueError(
                    "The sequence has zero or non-finite probability under the "
                    "current parameters; when class labels are given, every "
                    "label must equal a hidden-state index."
                )
            alpha_hat[:, t] = unnormalised / scales[t]

        for t in range(num_observations - 2, -1, -1):
            beta_hat[:, t] = (
                np.dot(self.trans_probs, emit[:, t + 1] * beta_hat[:, t + 1])
                / scales[t + 1]
            )

        return alpha_hat, beta_hat, scales, emit

    def _expected_counts(self, symbols, allowed):
        """Return posterior expected usage counts of every parameter.

        Returns:
            ``(counts, alpha_hat, beta_hat, scales)``; ``counts`` is a flat
            vector in the ``theta`` layout.
        """
        alpha_hat, beta_hat, scales, emit = self._scaled_pass(symbols, allowed)
        num_observations = allowed.shape[1]

        # Posterior probability of each transition i -> j between t-1 and t.
        trans_counts = np.zeros_like(self.trans_probs, dtype=float)
        for t in range(1, num_observations):
            trans_counts += (
                np.outer(alpha_hat[:, t - 1], emit[:, t] * beta_hat[:, t])
                * self.trans_probs
                / scales[t]
            )

        # Posterior state occupancy, credited to the symbol actually observed.
        gamma = alpha_hat * beta_hat
        emiss_counts = np.zeros_like(self.emiss_probs, dtype=float)
        for t in range(num_observations):
            emiss_counts[:, symbols[t]] += gamma[:, t]

        counts = np.concatenate((trans_counts.ravel(), emiss_counts.ravel()))
        return counts, alpha_hat, beta_hat, scales

    def forward_backward(self, obs, class_labels=None):
        """Compute forward/backward tables and expected parameter counts.

        Args:
            obs: Non-empty sequence of integer symbol indices in
                ``[0, num_symbols)``.
            class_labels: Optional sequence with one label per observation.

        Returns:
            ``(forward, backward, n_k, m_k)``:

            * ``forward[s, t]`` and ``backward[s, t]`` are the unscaled
              forward and backward variables of the unconstrained model
              (they may underflow to 0 for long sequences).
            * ``n_k`` holds the expected usage count of each parameter given
              ``obs``, divided by ``len(obs)``.
            * ``m_k`` holds the same quantity restricted to state paths that
              match ``class_labels``, or ``None`` when no labels are given.

        Raises:
            ValueError: If ``obs`` is empty or not 1-D, if ``class_labels``
                has a different length, or if the sequence (or the labelled
                sequence) has zero probability under the model.
            IndexError: If ``obs`` contains non-integer or out-of-range
                symbol indices.
        """
        num_states = self.trans_probs.shape[0]
        num_symbols = self.emiss_probs.shape[1]

        symbols = np.asarray(obs)
        if symbols.ndim != 1 or symbols.size == 0:
            raise ValueError("obs must be a non-empty 1-D sequence of symbol indices.")
        if (
            not np.issubdtype(symbols.dtype, np.integer)
            or symbols.min() < 0
            or symbols.max() >= num_symbols
        ):
            # Negative indices would otherwise wrap around silently.
            raise IndexError(
                "obs entries must be integers in [0, {}).".format(num_symbols)
            )
        num_observations = symbols.size

        unconstrained = np.ones((num_states, num_observations))
        n_k, alpha_hat, beta_hat, scales = self._expected_counts(symbols, unconstrained)

        # Undo the scaling: forward carries the product of scales up to t,
        # backward the product of scales after t.
        forward = alpha_hat * np.cumprod(scales)
        suffix = np.ones(num_observations)
        suffix[:-1] = np.cumprod(scales[::-1])[::-1][1:]
        backward = beta_hat * suffix

        m_k = None
        if class_labels is not None:
            if len(class_labels) != num_observations:
                raise ValueError("class_labels must have one entry per observation.")
            clamped = np.array(
                [
                    [1.0 if label == state else 0.0 for label in class_labels]
                    for state in range(num_states)
                ]
            )
            m_k = self._expected_counts(symbols, clamped)[0]

        # Per-position averages keep gradient magnitudes independent of length.
        n_k /= num_observations
        if m_k is not None:
            m_k /= num_observations

        return forward, backward, n_k, m_k

    def calculate_gradient(self, mk, nk):
        """Return ``-(mk - nk) / theta`` for the current parameters.

        Args:
            mk: Label-constrained counts in the ``theta`` layout.
            nk: Unconstrained counts in the ``theta`` layout.

        Raises:
            ValueError: If either argument is ``None`` or its shape differs
                from that of ``theta``.
        """
        if mk is None or nk is None:
            raise ValueError("mk and nk must be provided and have the correct shape.")

        theta_flat = self.theta()
        mk = np.asarray(mk, dtype=float)
        nk = np.asarray(nk, dtype=float)
        if mk.shape != theta_flat.shape or nk.shape != theta_flat.shape:
            raise ValueError("mk and nk must be provided and have the correct shape.")

        return -(mk - nk) / theta_flat
    


# Gradient Descent

We use gradient descent to minimize our objective function.

We repeat the following update until convergence: $\theta' = \theta - \alpha \nabla L$. For simplicity, we fix our step size $\alpha$.

In [46]:
class HiddenMarkovModelWithGradientDescent(HiddenMarkovModel):
    """HMM whose parameters are fitted by fixed-step gradient descent."""

    def __init__(self, theta):
        super().__init__(theta)

    def gradient_descent(self, observations_set, class_labels_set, learning_rate=0.01, iterations=100):
        """Update the model parameters in place by batch gradient descent.

        Each iteration averages the per-sequence gradients and applies
        ``theta <- theta - learning_rate * avg_gradient``.

        NOTE(review): the update is unconstrained, so parameters are not kept
        non-negative or normalised; confirm whether a projection step is wanted.

        Args:
            observations_set: Sequence of observation sequences.
            class_labels_set: Sequence of class label sequences, aligned with
                ``observations_set``.
            learning_rate: Fixed step size.
            iterations: Number of update steps; values ``<= 0`` leave the
                parameters unchanged.

        Raises:
            ValueError: If the two sets differ in length or are empty.
        """
        if len(observations_set) != len(class_labels_set):
            raise ValueError("Each observation sequence must have corresponding class labels.")
        num_sequences = len(observations_set)
        if num_sequences == 0:
            # Averaging over zero sequences would turn every parameter into NaN.
            raise ValueError("At least one observation sequence is required.")

        for _ in range(iterations):
            total_gradient = np.zeros_like(self.theta())

            # Accumulate gradients over all observation sequences.
            for obs, class_labels in zip(observations_set, class_labels_set):
                _, _, nk, mk = self.forward_backward(obs, class_labels)
                total_gradient += self.calculate_gradient(mk, nk)

            avg_gradient = total_gradient / num_sequences

            # Write the step back through update_theta so the parameter
            # layout is defined in exactly one place.
            self.update_theta(self.theta() - learning_rate * avg_gradient)
        
# Initial parameters in theta layout: 4 transition probabilities
# (t_aa, t_ab, t_ba, t_bb) followed by 20 emission probabilities per state.
# NOTE(review): the state-B emission row sums to 0.6 (20 * 0.03), not 1 — confirm this is intended.
theta = [0.7, 0.3, 0.4, 0.6] + [0.05] * 20 + [0.03] * 20
hmm = HiddenMarkovModelWithGradientDescent(theta)

# Example multiple observation sequences and corresponding class labels
observations_set = [[0, 1, 2, 3, 4], [1, 0, 3, 2, 4]]  # Add more sequences as needed
class_labels_set = [[1, 0, 1, 0, 1], [0, 1, 0, 1, 0]]  # Corresponding class labels

learning_rate = 0.01
iterations = 100

hmm.gradient_descent(observations_set, class_labels_set, learning_rate, iterations)

# Show the fitted parameters so the cell has a visible result.
print(hmm.theta())




# Validation Results

Since our dataset is limited, we estimate the validation error using k-fold cross-validation; in particular, we use leave-one-out cross-validation (LOOCV).

In [10]:
def predict(theta, s):
    """Unimplemented stub; accepts ``theta`` and ``s`` and returns ``None``.

    Presumably intended to predict a class label sequence for ``s`` from the
    parameters ``theta`` — TODO confirm once implemented.
    """
    #TODO
    return None

def error_est(s_data, c_data):
    """Unimplemented stub; accepts ``s_data`` and ``c_data`` and returns ``None``.

    Presumably intended to estimate the validation error over the given
    sequences and labels — TODO confirm once implemented.
    """
    #TODO
    return None

# Workflow on Human Data

In [None]:
# Read everything first so the file handle is closed before parsing starts.
with open("HUMAN_training_data.txt") as f:
    content = f.read().splitlines()

# Records alternate: an amino acid sequence, then its class label sequence.
# Blank lines (such as the one produced by a trailing newline) hold no data.
records = [line.strip() for line in content if line.strip()]
if len(records) % 2 != 0:
    raise ValueError(
        "HUMAN_training_data.txt must contain sequence/label line pairs; "
        "found {} non-blank lines.".format(len(records))
    )

s_seqs = [aa(line) for line in records[0::2]]
c_seqs = [ss(line) for line in records[1::2]]

# Each label sequence must annotate every residue of its amino acid sequence.
for pair_index, (s_vec, c_vec) in enumerate(zip(s_seqs, c_seqs)):
    if len(s_vec) != len(c_vec):
        raise ValueError(
            "Record {}: sequence length {} does not match label length {}.".format(
                pair_index, len(s_vec), len(c_vec)
            )
        )

# A list rather than a bare zip iterator, so the pairs survive repeated use
# across cells instead of being exhausted after the first pass.
human_train = list(zip(s_seqs, c_seqs))
    
    
        
        
        