# HMM for decryption


In [1]:
from src.CipherUtils import CipherGenerator
from src.CipherUtils import TextEncoder
from src.ProbabilityMatrix import ProbabilityMatrix
from src.CipherUtils import TextPreProcessor

from src.HMM_utils import map_alphabet_to_numbers, string_to_numbers
from src.HMM_utils import find_mapping, numbers_to_string, invert_mapping
from src.HMM_utils import convert_numbers_to_letters

from src.HMM_functions import Baum_Welch
from src.HMM_functions import compute_f_log, Viterbi_log, reconstruct 

import numpy as np

In [2]:
# RICORDATI DI LEVARE LO SPAZIO
hidden_sequence = "people of western europe a landing was made this morning on the coast of france by troops kangaroo jokes quasi vile xilophone zenit "
hidden_sequence = "in germany it seems to be pretty much automatic pretty much all the time in france and spain it all depends presumably on social subtleties that you have to be french or spanish to understand in italy why would you even bother when and how much to tip is a question that has been vexing visitors to europe for as long as people have been travelling around the continent outside their own country it seems even europeans don t know the answer according to new polling by yougov in six eu countries britain and the us where as most visitors know but may be reluctant to acknowledge gratuities may make up more than half your waitperson s income europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typically tipped almost the same as the us in the uk where an optional service charge of about is usually included said they left a gratuity the figure in spain where service is often included in restaurant bills but diners can leave optional tips was while in france where every price on a restaurant menu already includes for service of people said they generally tipped on top even in sweden where tips are generally not expected the figure was but only of italians said they would typically leave a gratuity after a meal out with a rather greater proportion admitting they never left a cent a startling of respondents in the us however and of germans by far the most in europe confessed they would tip sometimes or often even if the service was terrible indicating that for some tipping is not about quality of service at all the findings of the survey will come as a surprise in germany a country that does not generally think of itself as a nation of happy distributors"
# hidden_sequence = " hello banana xilophone key queue zebra cock pussy tits dandy fart though jolly world mum "

preprocessor = TextPreProcessor()
hidden_sequence = preprocessor.lower(text=hidden_sequence)
hidden_sequence = preprocessor.remove_unknown_chars(
    text=hidden_sequence, unknown_chars=preprocessor.unknown_chars(hidden_sequence)
)
hidden_sequence = preprocessor.remove_additional_spaces(text=hidden_sequence)
### WARNING: remember di mettere sempre preprocessor.remove_additional_spaces(text) come ultimo step del preprocessing


cipher_generator = CipherGenerator()
cipher = cipher_generator.generate_cipher()
encoder = TextEncoder()
observed_sequence = encoder.encode_text(hidden_sequence, cipher=cipher)

print(hidden_sequence)
print(observed_sequence)

in germany it seems to be pretty much automatic pretty much all the time in france and spain it all depends presumably on social subtleties that you have to be french or spanish to understand in italy why would you even bother when and how much to tip is a question that has been vexing visitors to europe for as long as people have been travelling around the continent outside their own country it seems even europeans don t know the answer according to new polling by yougov in six eu countries britain and the us where as most visitors know but may be reluctant to acknowledge gratuities may make up more than half your waitperson s income europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typically tipped almost the same as the us in the uk where an optional service charge of about is usually included said they left a gratuity the figure in spain where service is often included in restaurant bills but diners can leave optiona

In [3]:
# List of text file paths to build our corpus (where we learn the transitions probs)
file_paths = [
    "texts/moby_dick.txt",
    "texts/shakespeare.txt",
    "texts/james-joyce-a-portrait-of-the-artist-as-a-young-man.txt",
    "texts/james-joyce-dubliners.txt",
    "texts/james-joyce-ulysses.txt",
]

texts = []
for file_path in file_paths:
    with open(file_path, "r") as file:
        texts.append(file.read())

corpus = "".join(texts)
alphabet = list("abcdefghijklmnopqrstuvwxyz ")

preprocessor = TextPreProcessor()
corpus = preprocessor.lower(text=corpus)
corpus = preprocessor.remove_unknown_chars(
    text=corpus, unknown_chars=preprocessor.unknown_chars(corpus)
)
corpus = preprocessor.remove_additional_spaces(text=corpus)

# compute probabilities
p = ProbabilityMatrix(corpus)
p.compute_probability_matrix()
p.compute_normalized_matrix()
# p.compute_probability_table()

#### Following line to check consistency of probability_table (the dictionary) and probability_matrix (the matrix)


In [4]:
p1 = ProbabilityMatrix(corpus)
p1.compute_probability_table()

In [5]:
dict(sorted(p1.probability_table.items(), key=lambda x: x[0]))

{'  ': 0,
 ' a': 0.018858592409560535,
 ' b': 0.00960800108775822,
 ' c': 0.007487019288144414,
 ' d': 0.007106866375945675,
 ' e': 0.003914266653363149,
 ' f': 0.007627390964955013,
 ' g': 0.00420442518042896,
 ' h': 0.01346651035033249,
 ' i': 0.012399905701535579,
 ' j': 0.0008383172615099884,
 ' k': 0.0016677084494945641,
 ' l': 0.006325651530002367,
 ' m': 0.010127058377009956,
 ' n': 0.004986006851311668,
 ' o': 0.010513080488239103,
 ' p': 0.005886562077539945,
 ' q': 0.0005213805138679391,
 ' r': 0.0034172188604754444,
 ' s': 0.01616964332877492,
 ' t': 0.0275824231183837,
 ' u': 0.0019881911715508185,
 ' v': 0.0012366891456989531,
 ' w': 0.01252120248149735,
 ' x': 6.358298949609013e-06,
 ' y': 0.00420405835548956,
 ' z': 3.5704294101650615e-05,
 'a ': 0.00467004830350802,
 'aa': 2.4699545919635014e-05,
 'ab': 0.001020751531371847,
 'ac': 0.001997728619975232,
 'ad': 0.0025095716854187577,
 'ae': 0.0001499091252350125,
 'af': 0.000505117941554516,
 'ag': 0.0011926701529708908,

In [6]:
def compare(prob1, prob2, acc=10):
    # prob1 deve essere matrix
    # prob2 deve essere dict
    i = 0  # number of errors (should be 0 :))
    errors = []
    error_letter = []
    for x in "abcdefghijklmnopqrstuvwxyz ":
        for y in "abcdefghijklmnopqrstuvwxyz ":
            if round(prob1.get_probability_mat(x, y), acc) != round(
                prob2.get_probability(x + y), acc
            ):
                i += 1
                errors.append(
                    prob1.get_probability_mat(x, y) - prob2.get_probability(x + y)
                )
                error_letter.append([x, y])

            elif round(prob1.get_probability_mat(y, x), acc) != round(
                prob2.get_probability(y + x), acc
            ):
                i += 1
                errors.append(
                    prob1.get_probability_mat(x, y) - prob2.get_probability(x + y)
                )
                error_letter.append([y, x])

    return i, errors, error_letter


i, errors, error_letter = compare(p, p1)
error_letter

[]

#### Go on


In [7]:
# translate the hidden and observed sequences into numeric list
hidden_ = string_to_numbers(hidden_sequence, mapping=map_alphabet_to_numbers())
observed_ = string_to_numbers(observed_sequence, mapping=map_alphabet_to_numbers())

In [8]:
B_start = np.zeros((27, 27)) + 1 / 26
B_start[:, -1] = np.zeros(27)
B_start[-1, :] = np.zeros(27)
B_start[-1, -1] = 1

emission = Baum_Welch(
    A=p.normalized_matrix,
    B_start=B_start,
    pi=p.probability_matrix[-1, :],
    observed=observed_,
    maxIter=100,
)

In [9]:
emission.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [11]:
# go back to letters to see the results
mapping = find_mapping(emission.argmax(axis=1))
emission_reconstruction = numbers_to_string(observed_sequence, invert_mapping(mapping))
print(emission_reconstruction)


in jqrmany it sqqms to bq prqtty muzh automatiz prqtty muzh all thq timq in franzq and spain it all dqpqnds prqsumably on sozial subtlqtiqs that you havq to bq frqnzh or spanish to undqrstand in italy why would you qvqn bothqr whqn and how muzh to tip is a suqstion that has bqqn vqxinj visitors to quropq for as lonj as pqoplq havq bqqn travqllinj around thq zontinqnt outsidq thqir own zountry it sqqms qvqn quropqans don t know thq answqr azzordinj to nqw pollinj by youjov in six qu zountriqs britain and thq us whqrq as most visitors know but may bq rqluztant to azknowlqdjq jratuitiqs may makq up morq than half your waitpqrson s inzomq quropqans arq dqqply dividqd on tippinj in rqstaurants for qxamplq of rqspondqnts in jqrmany told thq pollstqr thqy typizally tippqd almost thq samq as thq us in thq uk whqrq an optional sqrvizq zharjq of about is usually inzludqd said thqy lqft a jratuity thq fijurq in spain whqrq sqrvizq is oftqn inzludqd in rqstaurant bills but dinqrs zan lqavq optiona

Now let us proceed to apply the Viterbi algorithm to obtain the most likely reconstruction and compare how it performs with the one obtained just by using the emissions.

In [14]:
f0, f = compute_f_log(A=p.normalized_matrix, B=emission, observed=observed_)
pmax, phi = Viterbi_log(f0, f)
reconstruction = reconstruct(pmax, phi)
reconstruction = reconstruction.astype(int)
viterbi_reconstruction = convert_numbers_to_letters(reconstruction)

  tmp[k] = np.log(pi[k]) + np.log(B[k, observed[0]])
  tmp[j, k] = np.log(A[j, k]) + np.log(B[k, observed[i]])


In [16]:
print("emission accuracy:",
    np.mean(np.array(list(emission_reconstruction))== np.array(list(hidden_sequence)))
)

print("Viterbi accuracy:",
    np.mean(np.array(list(viterbi_reconstruction))== np.array(list(hidden_sequence)))
)

emission accuracy: 0.8581314878892734
Viterbi accuracy: 0.9619377162629758


In [23]:
print("True sequence:\n",hidden_sequence)
print("Reconstruction from emission:\n",emission_reconstruction)
print("Reconstruction using Viterbi:\n",viterbi_reconstruction)

True sequence:
 in germany it seems to be pretty much automatic pretty much all the time in france and spain it all depends presumably on social subtleties that you have to be french or spanish to understand in italy why would you even bother when and how much to tip is a question that has been vexing visitors to europe for as long as people have been travelling around the continent outside their own country it seems even europeans don t know the answer according to new polling by yougov in six eu countries britain and the us where as most visitors know but may be reluctant to acknowledge gratuities may make up more than half your waitperson s income europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typically tipped almost the same as the us in the uk where an optional service charge of about is usually included said they left a gratuity the figure in spain where service is often included in restaurant bills but diners c