# HMM for decryption


In [168]:
from src.CipherUtils import CipherGenerator
from src.CipherUtils import TextEncoder
from src.ProbabilityMatrix import ProbabilityMatrix
from src.CipherUtils import TextPreProcessor

from src.HMM_utils import map_alphabet_to_numbers, string_to_numbers
from src.HMM_utils import find_mapping, numbers_to_string, invert_mapping
from src.HMM_utils import convert_numbers_to_letters

# from src.HMM_functions import Baum_Welch
# from src.HMM_functions import compute_f_log, Viterbi_log, reconstruct

import numpy as np

In [169]:
import random 
def Homophonic_Cipher_Generator(extended_alphabet = list("abcdefghijklmnopqrstuvwxyz1234567890")):
    """
    Generates a permutation assigning to each letter one or potentially two of the symbols in the extended alphabet provided.
    In this simple example it assigns letters as before + it assigns to some letters the numbers as well (can be generalized) 
    The output is kept as a dictionary.
    """
    letters = extended_alphabet.copy()
    letters = letters[0:26]
    random.shuffle(letters)

    letters_list = [letters[i:i+1] for i in range(0, len(letters), 1)]
    for i in range(len(extended_alphabet) - 26):
        index = random.randint(0, 25) # Select one of the existing letters
        while(len(letters_list[index]) > 1):
            index = random.randint(0, 25) # Select one of the existing letters
        letters_list[index].append(i)

    # print(letters_list)
    d = {k:v for k,v in zip(extended_alphabet[0:26], letters_list)}
    return(d)
    

In [170]:
def Encode_using_homophonic(text, cipher_dict):
    """
    Encodes using the homophonic dictionary provided
    We assume text has already been preprocessed to remove all puntctuation ...
    """
    encoded_text = []
    for char in text:
        if char == ' ':
            encoded_text.append(char)
            continue
            
        if len(cipher_dict[char]) == 1:
            encoded_char = cipher_dict[char]
        else:
            encoded_char = str(cipher_dict[char][random.randint(0,len(cipher_dict[char])-1)])  
        
        if isinstance(encoded_char, list):
            encoded_char = str(encoded_char[0])
        encoded_text.append(encoded_char)
    
    
    return "".join(encoded_text)

In [171]:
d = Homophonic_Cipher_Generator()
print(d)
t = Encode_using_homophonic("hello good afternoon and welcome to", d)
t

{'a': ['j'], 'b': ['a', 3], 'c': ['c'], 'd': ['t', 6], 'e': ['n'], 'f': ['h'], 'g': ['m', 9], 'h': ['s', 5], 'i': ['i'], 'j': ['f', 0], 'k': ['p'], 'l': ['l', 2], 'm': ['z'], 'n': ['v'], 'o': ['k', 1], 'p': ['u'], 'q': ['g'], 'r': ['r'], 's': ['e'], 't': ['b', 8], 'u': ['d', 4], 'v': ['o'], 'w': ['w'], 'x': ['x'], 'y': ['y', 7], 'z': ['q']}


'5n22k mkkt jhbnrv1kv jvt wn2ckzn b1'

In [172]:
#hidden_sequence = "Hello how are you doing"
hidden_sequence = "good afternoon today we will show "
preprocessor = TextPreProcessor()
hidden_sequence = preprocessor.lower(text=hidden_sequence)
hidden_sequence = preprocessor.remove_unknown_chars(
    text=hidden_sequence, unknown_chars=preprocessor.unknown_chars(hidden_sequence)
)
hidden_sequence = preprocessor.remove_additional_spaces(text=hidden_sequence)

d = Homophonic_Cipher_Generator()
observed_sequence = Encode_using_homophonic(hidden_sequence, d)
print(hidden_sequence)
print(observed_sequence)

good afternoon today we will show
e22b rndwq0f2k d2brj gw gloo mxfg


In [173]:
#hidden_sequence = "Hello how are you doing"
hidden_sequence = "hello banana xilophone key queue zebra cock pussy tits dandy fart though jolly world mum today I was going through the park and noticed that some people where looking at me in a weird way and thought that I was being silly"

preprocessor = TextPreProcessor()
hidden_sequence = preprocessor.lower(text=hidden_sequence)
hidden_sequence = preprocessor.remove_unknown_chars(
    text=hidden_sequence, unknown_chars=preprocessor.unknown_chars(hidden_sequence)
)
hidden_sequence = preprocessor.remove_additional_spaces(text=hidden_sequence)

d = Homophonic_Cipher_Generator()
observed_sequence = Encode_using_homophonic(hidden_sequence, d)
print(hidden_sequence)
print(observed_sequence)

hello banana xilophone key queue zebra cock pussy tits dandy fart though jolly world mum today i was going through the park and noticed that some people where looking at me in a weird way and thought that i was being silly
sxdd5 obebeb u3dwzs5ex cx9 gr0rx p0okb i5ic zrmmh ql8m abe49 vbkq 8sw6ys jwddh 7wkd4 n6n 85ab9 l fbm y53ey 8sk56ys qsx zbkc bea ewq3i0a qsbq m5nx z05zd0 7s0k0 dw5cl1y bq nx l1 b 7x3ka fb9 b1a qs5rysq 8sb8 l fbm oxl1y m3dd9


In [174]:
# List of text file paths to build our corpus (where we learn the transitions probs)
file_paths = [
    "texts/moby_dick.txt",
    "texts/shakespeare.txt",
    "texts/james-joyce-a-portrait-of-the-artist-as-a-young-man.txt",
    "texts/james-joyce-dubliners.txt",
    "texts/james-joyce-ulysses.txt",
]

texts = []
for file_path in file_paths:
    with open(file_path, "r") as file:
        texts.append(file.read())

corpus = "".join(texts)
alphabet = list("abcdefghijklmnopqrstuvwxyz ")

preprocessor = TextPreProcessor()
corpus = preprocessor.lower(text=corpus)
corpus = preprocessor.remove_unknown_chars(
    text=corpus, unknown_chars=preprocessor.unknown_chars(corpus)
)
corpus = preprocessor.remove_additional_spaces(text=corpus)

# compute probabilities
p = ProbabilityMatrix(corpus)
p.compute_probability_matrix()
p.compute_normalized_matrix()
# p.compute_probability_table()

#### Go on

Here we have to be more careful as we have to encode more.
In our case (used here) 27 hidden states (letters + spaces) and these are emitted to 37 observed states (27 hidden + 10 digits), so we need to be careful in the conversion.


In [175]:
def string_to_numbers(text, mapping):
    """
    Converts a string of characters to a list of numbers based on the provided mapping.

    Args:
        text (str): The input string to be converted.
        mapping (dict): A dictionary mapping characters to numbers.

    Returns:
        list: A list of numbers representing the characters in the input string.
    """
    numbers = [mapping[char] for char in text]
    return numbers

In [176]:
mapping = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9,
           'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18,
           't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25,
           '0': 26, '1': 27, '2': 28, '3': 29, '4': 30, '5': 31, '6': 32, '7': 33, '8': 34,
           '9': 35, ' ': 36}

def string_to_numbers_updated(text, mapping):
    """
    We convert a->0, b->1, ..., z->25, 0->26, 1->27, 2->28, ..., 9->35, ' '-> 36
    """
    converted_string = [mapping[char] for char in text]
    return converted_string

In [177]:
mapping = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9,
           'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18,
           't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25,
           '0': 26, '1': 27, '2': 28, '3': 29, '4': 30, '5': 31, '6': 32, '7': 33, '8': 34,
           '9': 35, ' ': 36}
hidden_ = string_to_numbers_updated(hidden_sequence, mapping = mapping)
print(hidden_)

observed_ = string_to_numbers(observed_sequence, mapping)
print(observed_)

[7, 4, 11, 11, 14, 36, 1, 0, 13, 0, 13, 0, 36, 23, 8, 11, 14, 15, 7, 14, 13, 4, 36, 10, 4, 24, 36, 16, 20, 4, 20, 4, 36, 25, 4, 1, 17, 0, 36, 2, 14, 2, 10, 36, 15, 20, 18, 18, 24, 36, 19, 8, 19, 18, 36, 3, 0, 13, 3, 24, 36, 5, 0, 17, 19, 36, 19, 7, 14, 20, 6, 7, 36, 9, 14, 11, 11, 24, 36, 22, 14, 17, 11, 3, 36, 12, 20, 12, 36, 19, 14, 3, 0, 24, 36, 8, 36, 22, 0, 18, 36, 6, 14, 8, 13, 6, 36, 19, 7, 17, 14, 20, 6, 7, 36, 19, 7, 4, 36, 15, 0, 17, 10, 36, 0, 13, 3, 36, 13, 14, 19, 8, 2, 4, 3, 36, 19, 7, 0, 19, 36, 18, 14, 12, 4, 36, 15, 4, 14, 15, 11, 4, 36, 22, 7, 4, 17, 4, 36, 11, 14, 14, 10, 8, 13, 6, 36, 0, 19, 36, 12, 4, 36, 8, 13, 36, 0, 36, 22, 4, 8, 17, 3, 36, 22, 0, 24, 36, 0, 13, 3, 36, 19, 7, 14, 20, 6, 7, 19, 36, 19, 7, 0, 19, 36, 8, 36, 22, 0, 18, 36, 1, 4, 8, 13, 6, 36, 18, 8, 11, 11, 24]
[18, 23, 3, 3, 31, 36, 14, 1, 4, 1, 4, 1, 36, 20, 29, 3, 22, 25, 18, 31, 4, 23, 36, 2, 23, 35, 36, 6, 17, 26, 17, 23, 36, 15, 26, 14, 10, 1, 36, 8, 31, 8, 2, 36, 25, 17, 12, 12, 7, 36, 16, 1

Now the emission probability has to be a 27 x 36 matrix, since there are more possible emissions than before.
Update the code to reflect that.

In [178]:
def forward_HMM(A, B, pi, observed):
    """
    A: transition
    B: emission
    pi: initial
    n_nodes: number of nodes in the chain
    observed: list containing observed ones.
    """
    n_nodes = len(observed)
    n_states = A.shape[0]
    alpha = np.zeros((n_nodes, n_states))
    c = np.zeros(n_nodes)
    alpha_hat = np.zeros((n_nodes, n_states))

    for j in range(n_states):
        alpha[0, j] = pi[j] * B[j, observed[0]]

    c[0] = np.sum(alpha[0])
    alpha_hat[0] = alpha[0] / np.sum(alpha[0])
    # print("alpa[0]", alpha[0])

    for i in range(1, n_nodes):
        for j in range(n_states):
            for k in range(n_states):
                alpha[i, j] += A[k, j] * B[j, observed[i]] * alpha_hat[i - 1, k]
        c[i] = np.sum(alpha[i])
        alpha_hat[i] = alpha[i] / c[i]
    return alpha_hat, c


def backward_HMM(A, B, observed, c):
    """
    A: transition
    B: emission
    n_nodes: number of nodes in the chain
    observed: list containing observed ones.
    """
    n_nodes = len(observed)
    n_states = A.shape[0]
    beta = np.zeros((n_nodes - 1, n_states))
    beta_hat = np.zeros((n_nodes - 1, n_states))

    for j in range(n_states):
        for k in range(n_states):
            beta[-1, j] += A[j, k] * B[k, observed[n_nodes - 1]]

    beta_hat[-1] = beta[-1] / c[-1]

    for i in range(n_nodes - 3, -1, -1):
        for j in range(n_states):
            for k in range(n_states):
                beta[i, j] += A[j, k] * B[k, observed[i + 1]] * beta_hat[i + 1, k]
        beta_hat[i] = beta[i] / c[i + 1]

    return beta_hat


def compute_all_conditional(alpha, beta):
    """
    alpha: list containing forward messages
    beta: list containing backward messages
    """
    n_nodes = alpha.shape[0]
    n_states = alpha.shape[1]

    gamma = np.zeros((n_nodes, n_states))

    gamma[n_nodes - 1] = alpha[n_nodes - 1] / np.sum(alpha[n_nodes - 1])

    for i in range(n_nodes - 1):
        tmp = alpha[i] * beta[i]
        gamma[i] = tmp / np.sum(tmp)

    return gamma


def divide_row_by_sum(matrix):
    row_sums = np.sum(matrix, axis=1)  # Calculate the sum of each row
    divided_matrix = (
        matrix / row_sums[:, np.newaxis]
    )  # Divide each element by the corresponding row sum
    return divided_matrix


def update_B(gamma, observed):
    # n_nodes = gamma.shape[0]
    n_states = gamma.shape[1]

    B = np.zeros((n_states, 37))

    for i in range(n_states):
        for j in range(37):
            for k in range(len(observed)):
                if observed[k] == j:
                    B[i, j] += gamma[k, i]

    return divide_row_by_sum(B)

In [179]:
def Baum_Welch(A, B_start, pi, observed, maxIter=100, tol = 1e-4):
    B = np.copy(B_start)
    changed = 0 # change is set to 1 whenever at least one coordinate increases by more than tol
    for it in range(maxIter):
        #print("computing alpha")
        alpha_hat, c = forward_HMM(A, B, pi, observed)
        #print("alpha_hat", alpha_hat.sum(axis = 1))
        #print("computing beta")
        beta_hat = backward_HMM(A, B, observed, c)
        #print("beta_hat", beta_hat.sum(axis = 1))
        #print("computing gamma")
        gamma = compute_all_conditional(alpha_hat, beta_hat)
        #print("gamma", gamma.sum(axis = 1))
        B_old = B

        #print("updating B")
        #print(B.sum(axis = 1))
        B = update_B(gamma, observed)
        #print("computing B", B.sum(axis = 1))
        # Check if conerged or still changing
        change = np.abs(B - B_old)
        max_change = np.max(change)

        if(max_change < tol):
            print("Not updating anymore after iteration", it)
            break


        # following lines only for encryption
        B[:, -1] = np.zeros(27)
        B[-1, :] = np.zeros(37)
        B[-1, -1] = 1
    return B

In [186]:
B_start = np.zeros((27, 37)) + 1 / 36
B_start[:, -1] = np.zeros(27) # last column
B_start[-1, :] = np.zeros(37) # last row
B_start[-1, -1] = 1           # last entr

emission = Baum_Welch(
    A=p.normalized_matrix,
    B_start=B_start,
    pi=p.normalized_matrix[-1, :],
    observed=observed_,
    maxIter=100,
)

In [187]:
emission.argmax(axis = 0) # I want the maximum by column to see to what letter each symbol is associated

array([ 3,  8, 12, 10, 25, 22, 19, 24, 15,  5, 17,  0,  9,  1, 15,  1,  5,
       17,  7,  0, 16, 22, 14,  4,  6,  2,  4, 13,  0, 20,  4, 23, 20,  2,
       19, 18, 26])

Now let us proceed to apply the Viterbi algorithm to obtain the most likely reconstruction and compare how it performs with the one obtained just by using the emissions.


In [188]:

def compute_f_log(A, B, observed):
    """
    It constructs the factors of the HMM which are needed to perform the forward pass of the message passing algorithm.
    Input:
        - A : the transition matrix
        - B : the emission matrix
        - observed: an array containing the observed values
    Output:
        - f0: the factor corresponding to the initial factor to first latent variable message
        - f: an array containig the all the other factors (n_states - 1)
    """
    pi = A[-1]
    n_nodes = len(observed)
    n_states = A.shape[0]
    f = np.zeros((n_nodes - 1, n_states, n_states))

    tmp = np.zeros((n_states, 1))
    for k in range(n_states):
        tmp[k] = np.log(pi[k]) + np.log(B[k, observed[0]])

    f0 = tmp

    for i in range(1, n_nodes):
        tmp = np.zeros((n_states, n_states))

        for j in range(n_states):  # over z1
            for k in range(n_states):  # over z2
                tmp[j, k] = np.log(A[j, k]) + np.log(B[k, observed[i]])

        f[i - 1] = tmp

    return f0, f


def Viterbi_log(f0, f):
    """
    Performs the forward pass of the max plus algorithm (known as Viterbi algorithm for Hidden-Markov models).
    Input: 
        - f0: the factor corresponding to the initial factor to first latent variable message
        - f: an array containig the all the other factors (n_states - 1)
    Output:
        - pmax: the array containing the messages of the forward pass
        - phi: the array storing the most probable preceding state stored during the forward pass
    """
    n_nodes = f.shape[0] + 1
    n_states = f.shape[1]

    pmax = np.zeros((n_nodes, n_states))  # Need one for every node
    phi = np.zeros(
        (n_nodes - 1, n_states)
    )  # Need one for every node other than the first one (no need to reconstruct it)

    pmax[0] = f0.flatten()

    for i in range(1, n_nodes):
        tmp = ((f[i - 1]).T + pmax[i - 1]).T

        pmax[i] = np.max(tmp, axis=0)  # by column

        phi[i - 1] = np.argmax(
            tmp, axis=0
        )  # i-1 cause this contains the reconstruction about the (i-1)th element

    return pmax, phi


def reconstruct(pmax, phi):
    """
    Given the output of a max-plus forward pass it returns the most probable hidden states.
    Input:
        - pmax: the array containing the messages of the forward pass
        - phi: the array storing the most probable preceding state stored during the forward pass
    Output:
        - An array of int that coincides with the most probable latent states
    """
    reconstruction = np.empty(len(phi) + 1)

    curr = np.argmax(pmax[-1])
    reconstruction[-1] = curr

    for i in range(len(phi) - 1, -1, -1):
        curr = int(phi[i, curr])
        reconstruction[i] = curr

    return reconstruction



In [189]:
f0, f = compute_f_log(A=p.normalized_matrix, B=emission, observed=observed_)
pmax, phi = Viterbi_log(f0, f)
reconstruction = reconstruct(pmax, phi)
reconstruction = reconstruction.astype(int)
viterbi_reconstruction = convert_numbers_to_letters(reconstruction)

  tmp[k] = np.log(pi[k]) + np.log(B[k, observed[0]])
  tmp[j, k] = np.log(A[j, k]) + np.log(B[k, observed[i]])


In [193]:
print(hidden_sequence)
print(viterbi_reconstruction)

hello banana xilophone key queue zebra cock pussy tits dandy fart though jolly world mum today i was going through the park and noticed that some people where looking at me in a weird way and thought that i was being silly
hello pivind qulowhave mes there bepri pond whemy tate wives wint though folly corke bur towis s wis goung thyough the wind ind fofured thit more wexcke chere loomang it be an i cound wis ind thanghe thit s wis peang jully


In [194]:
np.mean(np.array(list(hidden_sequence.replace(" ", ""))) == np.array(list(viterbi_reconstruction.replace(" ", ""))))

0.5138121546961326

In [195]:
print("True sequence:\n", hidden_sequence)
print("Reconstruction using Viterbi:\n", viterbi_reconstruction)

True sequence:
 hello banana xilophone key queue zebra cock pussy tits dandy fart though jolly world mum today i was going through the park and noticed that some people where looking at me in a weird way and thought that i was being silly
Reconstruction using Viterbi:
 hello pivind qulowhave mes there bepri pond whemy tate wives wint though folly corke bur towis s wis goung thyough the wind ind fofured thit more wexcke chere loomang it be an i cound wis ind thanghe thit s wis peang jully
