In [3]:
def validate_hmm(model):
    "Validates the hmm model"
    
    validate = True
    if not math.isclose(sum(model.init_probs),1):
        validate = False

    for i in range(len(model.trans_probs)):
        count = 0
        for j in range(len(model.trans_probs)):
            count += model.trans_probs[i][j]

        if count == 1:
            validate = True

    for i in range(len(model.emission_probs)):
        count = 0
        for j in range(len(model.emission_probs[0])):
            count += model.emission_probs[i][j]
            if (model.emission_probs[i][j] < 0.0 or model.emission_probs[i][j] > 1.0):
                validate = False

        if count == 1:
            validate & True

    return validate
    pass

def translate_indices_to_observations(indices):
    mapping = ['a', 'c', 'g', 't']
    return ''.join(mapping[idx] for idx in indices)

def translate_path_to_indices(path):
    return list(map(lambda x: int(x), path))

def translate_indices_to_path(indices):
    return ''.join([str(i) for i in indices])

def translate_observations_to_indices(obs):
    mapping = {'a': 0, 'c': 1, 'g': 2, 't': 3}
    return [mapping[symbol.lower()] for symbol in obs]

def read_fasta_file(filename):
        """
        Reads the given FASTA file f and returns a dictionary of sequences.

        Lines starting with ';' in the FASTA file are ignored.
        """
        sequences_lines = {}
        current_sequence_lines = None
        with open(filename) as fp:
            for line in fp:
                line = line.strip()
                if line.startswith(';') or not line:
                    continue
                if line.startswith('>'):
                    sequence_name = line.lstrip('>')
                    current_sequence_lines = []
                    sequences_lines[sequence_name] = current_sequence_lines
                else:
                    if current_sequence_lines is not None:
                        current_sequence_lines.append(line)
        sequences = {}
        for name, lines in sequences_lines.items():
            sequences[name] = ''.join(lines)
        return sequences

class hmm:
    def __init__(self, init_probs, trans_probs, emission_probs):
        self.init_probs = init_probs
        self.trans_probs = trans_probs
        self.emission_probs = emission_probs
    

class GenomePrediction():
    
    def init_model():
        "Creation of the HMM model chosen"
        init_probs_7_state = [0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 0.00]

        trans_probs_7_state = [
            [0.00, 0.00, 0.90, 0.10, 0.00, 0.00, 0.00],
            [1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
            [0.00, 1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
            [0.00, 0.00, 0.05, 0.90, 0.05, 0.00, 0.00],
            [0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00],
            [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00],
            [0.00, 0.00, 0.00, 0.10, 0.90, 0.00, 0.00],
        ]

        emission_probs_7_state = [
            #   A     C     G     T
            [0.30, 0.25, 0.25, 0.20],
            [0.20, 0.35, 0.15, 0.30],
            [0.40, 0.15, 0.20, 0.25],
            [0.25, 0.25, 0.25, 0.25],
            [0.20, 0.40, 0.30, 0.10],
            [0.30, 0.20, 0.30, 0.20],
            [0.15, 0.30, 0.20, 0.35],
        ]

        hmm_7_state = hmm(init_probs_7_state, trans_probs_7_state, emission_probs_7_state)
        return hmm_7_state
        pass
    
    def log(x):
        if x == 0:
            return float('-inf')
        return math.log(x)

    def compute_w_log(model, x):
        K = len(model.init_probs)
        N = len(x)

        w = make_table(K, N)

        # Base case: fill out w[i][0] for i = 0..k-1
        # ...
        for k in range(K):
            w[k][0] = log(model.init_probs[k]) + log(model.emission_probs[k][x[0]])

        # Inductive case: fill out w[i][j] for i = 0..k, j = 0..n-1
        # ...
        for n in range(1,N):
            for j in range(0,K):
                for i in range(0,k): #i
                    ## take max
                    w[j][n] = np.argmax(w[i][n-1]+log(model.trans_probs[i][j]))

            w[j][n] += log(model.emission_probs[j][x[n]])

        return w

    def opt_path_prob_log(w):
        return max(w[i][-1] for i in range(len(w)))

    def backtrack_log(w):
        N = len(w[0])
        k = len(w[1])

        z = [None] * N
        z[N-1] = (argmax(w[i][-1]) for i in range(k))

        for j in range(N-2,-1,-1):
            z[j] = np.argmax(w[i][j] * log(model.trans_probs[i[z[j+1]]]) for i in range(k))

        return z

    def count_transitions_and_emissions(K, D, x, z):
        """
        Returns a KxK matrix and a KxD matrix containing counts cf. above
        """
        z_indices = translate_path_to_indices(z)
        x_indices = translate_observations_to_indices(x)
        trans_matrix = [ [ 0 for i in range(K) ] for j in range(K) ]
        emi_matrix = [ [ 0 for i in range(K) ] for j in range(D) ]

        for i in range(len(z_indices)-1):
            trans_matrix[z_indices[i]][z_indices[i+1]] += 1 

        for z_i,x_i in zip(z_indices,x_indices):
            emi_matrix[z_indices[z_i]][x_indices[x_i]] += 1

        return trans_matrix,emi_matrix
        pass

    def viterbi_decoding(model,x):
        "Wrapper function for computing w and backtracking"
        w = compute_w_log(model,x)
        z = backtrack_log(w)
        return z
        pass
    
    def training_by_counting(K, D, x, z):
        """
        Returns a HMM trained on x and z cf. training-by-counting.
        """
        init_probs = [0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 0.00]

        trans, emi = count_transitions_and_emissions(K,D,x,z)
        my_hmm = hmm(init_probs,trans,emi)
        return hmm
        pass
        
    

In [4]:
def getGenomeSequences():
    sequences = read_fasta_file("/Users/alexiaborchgrevink/Desktop/AU_MachineLearning/Theoretical Excercises/au_ml18/handin3/Handin3_Class1_14/data-handin3/genome1.fa")
    return sequences

genome_pred = GenomePrediction()
hmm_7_state = genome_pred.init_model()
g1 = read_fasta_file('/Users/alexiaborchgrevink/Desktop/AU_MachineLearning/Theoretical Excercises/au_ml18/handin3/Handin3_Class1_14/data-handin3/genome1.fa')
g1['genome1'][:]
X = translate_observations_to_indices(g1['genome1'])

w1 = genome_pred.compute_w_log(hmm_7_state,X)
print(w1)


NameError: name 'hmm_7_state' is not defined