# HMM for decryption


In [1]:
import numpy as np

from prove.src.Probability import ProbabilityMatrix
from src.CipherUtils import CipherGenerator
from src.CipherUtils import TextEncoder
from src.CipherUtils import TextPreProcessor
from src.HMM_functions import Baum_Welch, Viterbi
from src.HMM_utils import map_alphabet_to_numbers, string_to_numbers
from src.HMM_utils import find_mapping, numbers_to_string, invert_mapping

In [38]:
# Plaintext to be encrypted; it plays the role of the hidden state sequence.
# The text is already normalised: lowercase letters and single spaces only,
# i.e. it stays inside the 27-symbol alphabet (a-z plus space).
# The commented-out assignments are shorter alternative test sentences that
# include rare letters (j, k, q, x, z); swap one in for a quick experiment.
# hidden_sequence = "people of western europe a landing was made this morning on the coast of france by troops kangaroo jokes quasi vile xilophone zenit "
hidden_sequence = " in germany it seems to be pretty much automatic pretty much all the time in france and spain it all depends presumably just on social subtleties that you have to be french or spanish to understand in italy why would you even bother when and how much to tip is a question that has been vexing visitors to europe for as long as people have been travelling around the continent outside their own country it seems even europeans don t know the answer according to new polling by yougov in six eu countries britain and the us where as most visitors know but may be reluctant to acknowledge gratuities may make up more than half your waitperson s income europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typically tipped almost the same as the us in the uk where an optional service charge of about is usually included said they left a gratuity the figure in spain where service is often included in restaurant bills but diners can leave optional tips was while in france where every price on a restaurant menu already includes for service of people said they generally tipped on top even in sweden where tips are generally not expected the figure was but only of italians said they would typically leave a gratuity after a meal out with a rather greater proportion admitting they never left a cent a startling of respondents in the us however and of germans by far the most in europe confessed they would tip sometimes or often even if the service was terrible indicating that for some tipping is not about quality of service at all the findings of the survey will come as a surprise in germany a country that does not generally think of itself as a nation of happy distributors"
# hidden_sequence = " hello banana xilophone key queue zebra clock puppy tips dandy raft though jolly world mum "

# Step 1: obtain a cipher from the project's generator
# (presumably a random substitution table -- see src.CipherUtils to confirm).
cipher_generator = CipherGenerator()
cipher = cipher_generator.generate_cipher()

# Step 2: encrypt the plaintext with that cipher; the result is the sequence
# of observations the HMM gets to see.
encoder = TextEncoder()
observed_sequence = encoder.encode_text(hidden_sequence, cipher=cipher)

# Show plaintext and ciphertext one after the other for a visual check.
print(hidden_sequence, observed_sequence, sep="\n")

 in germany it seems to be pretty much automatic pretty much all the time in france and spain it all depends presumably just on social subtleties that you have to be french or spanish to understand in italy why would you even bother when and how much to tip is a question that has been vexing visitors to europe for as long as people have been travelling around the continent outside their own country it seems even europeans don t know the answer according to new polling by yougov in six eu countries britain and the us where as most visitors know but may be reluctant to acknowledge gratuities may make up more than half your waitperson s income europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typically tipped almost the same as the us in the uk where an optional service charge of about is usually included said they left a gratuity the figure in spain where service is often included in restaurant bills but diners can leave o

In [39]:
# Text files that make up the training corpus from which the letter-to-letter
# transition probabilities are learned.
file_paths = [
    "texts/moby_dick.txt",
    "texts/shakespeare.txt",
    "texts/james-joyce-a-portrait-of-the-artist-as-a-young-man.txt",
    "texts/james-joyce-dubliners.txt",
    "texts/james-joyce-ulysses.txt",
]

# Read every book with an explicit encoding: without it `open` falls back to
# the platform locale, so the same notebook could decode the files differently
# (or fail) on another machine.
texts = []
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        texts.append(file.read())

# Join with a space so the last word of one book is never glued to the first
# word of the next (which would add a bogus bigram to the statistics).
# Surplus whitespace is dealt with by the preprocessing below.
corpus = " ".join(texts)

# Symbol set of the model: 26 lowercase letters plus the space (last index).
alphabet = list("abcdefghijklmnopqrstuvwxyz ")

# Normalise the corpus: lowercase, tidy up spaces, then drop every character
# the preprocessor reports as unknown.
preprocessor = TextPreProcessor()
corpus = preprocessor.lower(text=corpus)
corpus = preprocessor.remove_additional_spaces(text=corpus)
corpus = preprocessor.remove_unknown_chars(
    text=corpus, unknown_chars=preprocessor.unknown_chars(corpus)
)

# Estimate the transition probabilities over `alphabet`; the result is read
# later through `pm.matrix`.
pm = ProbabilityMatrix(text=corpus, alphabet=alphabet)
pm.compute_matrix_spaces()

In [40]:
# Convert both the plaintext and the ciphertext into their numeric form,
# each with a freshly built alphabet-to-number mapping.
hidden_, observed_ = (
    string_to_numbers(sequence, mapping=map_alphabet_to_numbers())
    for sequence in (hidden_sequence, observed_sequence)
)

In [41]:
# The emission matrix must have the same size as the transition matrix, so the
# dimension is taken from it instead of hard-coding 27 (26 letters + space).
n_symbols = pm.matrix.shape[0]

# Initial emission guess: every letter-to-letter conversion is equally likely
# (1 / number_of_letters). The space, stored at the last index, is pinned:
# its row and column are zero except for " " emitting " " with probability 1.
B_start = np.full((n_symbols, n_symbols), 1 / (n_symbols - 1))
B_start[:, -1] = 0
B_start[-1, :] = 0
B_start[-1, -1] = 1

# Learn the emission probabilities from the ciphertext alone.
# `pi` is the last row of the transition matrix, i.e. the row of the space
# symbol -- presumably "distribution of the letter following a space";
# TODO confirm the row/column convention of ProbabilityMatrix.
emission = Baum_Welch(
    A=pm.matrix,
    B_start=B_start,
    pi=pm.matrix[-1, :],
    observed=observed_,
    maxIter=20,
)

In [42]:
# Turn the learned emission matrix back into letters: take the
# highest-probability column of each row, build the mapping from it, invert it
# and apply it to the ciphertext.
# NOTE(review): a per-row argmax is not guaranteed to be one-to-one, so two
# rows can select the same column -- check how find_mapping treats that.
mapping = find_mapping(emission.argmax(axis=1))
inverse_mapping = invert_mapping(mapping)
numbers_to_string(observed_sequence, inverse_mapping)

' in germany it seems to be pretty muzh automatiz pretty muzh all the time in franze and spain it all depends presumably qust on sozial subtleties that you have to be frenzh or spanish to understand in italy why would you even bother when and how muzh to tip is a auestion that has been vexing visitors to europe for as long as people have been travelling around the zontinent outside their own zountry it seems even europeans don t wnow the answer azzording to new polling by yougov in six eu zountries britain and the us where as most visitors wnow but may be reluztant to azwnowledge gratuities may mawe up more than half your waitperson s inzome europeans are deeply divided on tipping in restaurants for example of respondents in germany told the pollster they typizally tipped almost the same as the us in the uw where an optional servize zharge of about is usually inzluded said they left a gratuity the figure in spain where servize is often inzluded in restaurant bills but diners zan leave 

In [50]:
# Index of the largest entry in each row of the learned emission matrix.
np.argmax(emission, axis=1)

array([12, 13, 20, 24,  9, 17,  3, 18,  1, 11, 19, 21,  4,  2, 10, 15, 11,
        8, 16, 23, 25, 19,  5,  6,  7, 20, 26])

In [32]:
# Largest probability in each row: values close to 1 mean the model is
# confident about that row, low values mean the row is still ambiguous.
np.max(emission, axis=1)

array([0.4204242 , 0.59426932, 0.33614214, 0.40815961, 0.68610573,
       0.30533622, 0.24128589, 0.41049294, 0.41553163, 0.4884047 ,
       0.36529307, 0.34065969, 0.39944858, 0.73295376, 0.3321016 ,
       0.18357834, 0.81663523, 0.48004038, 0.55108382, 0.56548568,
       0.49292914, 0.66443631, 0.23235109, 0.91711861, 0.58556676,
       0.43420175, 1.        ])

In [8]:
# Run Viterbi with the corpus transition matrix and the emission matrix learned
# by Baum-Welch (`Viterbi` is imported with the other project modules at the
# top of the notebook).
# NOTE(review): `pmax` and `phi` are not used by any later cell shown here;
# presumably they are the best-path probability and the decoded path --
# confirm against src.HMM_functions.
pmax, phi = Viterbi(A=pm.matrix, B=emission, observed=observed_)

**Encrypted text:** \
it eavfqto in baafb nl sa zvanno fdrk qdnlfqnir zvanno fdrk qxx nka nifa it gvqtra qth bzqit in qxx hazathb zvabdfqsxo lt blriqx bdsnxaniab nkqn old kqca nl sa gvatrk lv bzqtibk nl dthavbnqth it inqxo uko uldxh old acat slnkav ukat qth klu fdrk nl niz ib q pdabnilt nkqn kqb saat cajite cibinlvb nl advlza glv qb xlte qb zalzxa kqca saat nvqcaxxite qvldth nka rltnitatn ldnbiha nkaiv lut rldtnvo in baafb acat advlzaqtb hlt n ytlu nka qtbuav qrrlvhite nl tau zlxxite so oldelc it bij ad rldtnviab svinqit qth nka db ukava qb flbn cibinlvb ytlu sdn fqo sa vaxdrnqtn nl qrytluxahea evqndiniab fqo fqya dz flva nkqt kqxg oldv uqinzavblt b itrlfa advlzaqtb qva haazxo hicihah lt nizzite it vabnqdvqtnb glv ajqfzxa lg vabzlthatnb it eavfqto nlxh nka zlxxbnav nkao nozirqxxo nizzah qxflbn nka bqfa qb nka db it nka dy ukava qt lzniltqx bavcira rkqvea lg qsldn ib dbdqxxo itrxdhah bqih nkao xagn q evqndino nka giedva it bzqit ukava bavcira ib lgnat itrxdhah it vabnqdvqtn sixxb sdn hitavb rqt xaqca lzniltqx nizb uqb ukixa it gvqtra ukava acavo zvira lt q vabnqdvqtn fatd qxvaqho itrxdhab glv bavcira lg zalzxa bqih nkao eatavqxxo nizzah lt nlz acat it buahat ukava nizb qva eatavqxxo tln ajzarnah nka giedva uqb sdn ltxo lg inqxiqtb bqih nkao uldxh nozirqxxo xaqca q evqndino qgnav q faqx ldn uink q vqnkav evaqnav zvlzlvnilt qhfinnite nkao tacav xagn q ratn q bnqvnxite lg vabzlthatnb it nka db kluacav qth lg eavfqtb so gqv nka flbn it advlza rltgabbah nkao uldxh niz blfanifab lv lgnat acat ig nka bavcira uqb navvisxa ithirqnite nkqn glv blfa nizzite ib tln qsldn pdqxino lg bavcira qn qxx nka githiteb lg nka bdvcao uixx rlfa qb q bdvzviba it eavfqto q rldtnvo nkqn hlab tln eatavqxxo nkity lg inbaxg qb q tqnilt lg kqzzo hibnvisdnlvb


**Run with 50 iterations of Baum-Welch (BW):** \
 in pqrmany it sqqms to jq nrqtty muzh automatiz nrqtty muzh all thq timq in franzq and snain it all dqnqnds nrqsumajly on sozial sujtlqtiqs that you havq to jq frqnzh or snanish to undqrstand in italy why would you qvqn jothqr whqn and how muzh to tin is a juqstion that has jqqn vqxinp visitors to quronq for as lonp as nqonlq havq jqqn travqllinp around thq zontinqnt outsidq thqir own zountry it sqqms qvqn quronqans don t wnow thq answqr azzordinp to nqw nollinp jy youpov in six qu zountriqs jritain and thq us whqrq as most visitors wnow jut may jq rqluztant to azwnowlqdpq pratuitiqs may mawq un morq than half your waitnqrson s inzomq quronqans arq dqqnly dividqd on tinninp in rqstaurants for qxamnlq of rqsnondqnts in pqrmany told thq nollstqr thqy tynizally tinnqd almost thq samq as thq us in thq uw whqrq an ontional sqrvizq zharpq of ajout is usually inzludqd said thqy lqft a pratuity thq fipurq in snain whqrq sqrvizq is oftqn inzludqd in rqstaurant jills jut dinqrs zan lqavq ontional tins was whilq in franzq whqrq qvqry nrizq on a rqstaurant mqnu alrqady inzludqs for sqrvizq of nqonlq said thqy pqnqrally tinnqd on ton qvqn in swqdqn whqrq tins arq pqnqrally not qxnqztqd thq fipurq was jut only of italians said thqy would tynizally lqavq a pratuity aftqr a mqal out with a rathqr prqatqr nronortion admittinp thqy nqvqr lqft a zqnt a startlinp of rqsnondqnts in thq us howqvqr and of pqrmans jy far thq most in quronq zonfqssqd thqy would tin somqtimqs or oftqn qvqn if thq sqrvizq was tqrrijlq indizatinp that for somq tinninp is not ajout juality of sqrvizq at all thq findinps of thq survqy will zomq as a surnrisq in pqrmany a zountry that doqs not pqnqrally thinw of itsqlf as a nation of hanny distrijutors


in pqrmany it sqqms to bq zrqtty muzh automatiz zrqtty muzh all thq timq in franzq and szain it all dqzqnds zrqsumably on sozial subtlqtiqs that you havq to bq frqnzh or szanish to undqrstand in italy why would you qvqn bothqr whqn and how muzh to tiz is a puqstion that has bqqn vqxinp visitors to qurozq for as lonp as zqozlq havq bqqn travqllinp around thq zontinqnt outsidq thqir own zountry it sqqms qvqn qurozqans don t ynow thq answqr azzordinp to nqw zollinp by youpov in six qu zountriqs britain and thq us whqrq as most visitors ynow but may bq rqluztant to azynowlqdpq pratuitiqs may mayq uz morq than half your waitzqrson s inzomq qurozqans arq dqqzly dividqd on tizzinp in rqstaurants for qxamzlq of rqszondqnts in pqrmany told thq zollstqr thqy tyzizally tizzqd almost thq samq as thq us in thq uy whqrq an oztional sqrvizq zharpq of about is usually inzludqd said thqy lqft a pratuity thq fipurq in szain whqrq sqrvizq is oftqn inzludqd in rqstaurant bills but dinqrs zan lqavq oztional tizs was whilq in franzq whqrq qvqry zrizq on a rqstaurant mqnu alrqady inzludqs for sqrvizq of zqozlq said thqy pqnqrally tizzqd on toz qvqn in swqdqn whqrq tizs arq pqnqrally not qxzqztqd thq fipurq was but only of italians said thqy would tyzizally lqavq a pratuity aftqr a mqal out with a rathqr prqatqr zrozortion admittinp thqy nqvqr lqft a zqnt a startlinp of rqszondqnts in thq us howqvqr and of pqrmans by far thq most in qurozq zonfqssqd thqy would tiz somqtimqs or oftqn qvqn if thq sqrvizq was tqrriblq indizatinp that for somq tizzinp is not about puality of sqrvizq at all thq findinps of thq survqy will zomq as a surzrisq in pqrmany a zountry that doqs not pqnqrally thiny of itsqlf as a nation of hazzy distributors
