In [19]:
from src.CipherBreaker import CipherBreaker
from src.CipherUtils import (
    TextDecoder,
    TextEncoder,
    CipherGenerator,
    TextPreProcessor,
)
from src.ProbabilityMatrix import ProbabilityMatrix

In [20]:
# Instantiate the project helpers shared by the cells below:
# cipher generation, text preprocessing, encoding and decoding.

cipher_generator = CipherGenerator()
preprocess = TextPreProcessor()
text_encoder = TextEncoder()
text_decoder = TextDecoder()

In [21]:
# Text files that make up the training corpus (the corpus is where the
# transition probabilities are learned from).

file_paths = [
    "texts/moby_dick.txt",
    "texts/shakespeare.txt",
    "texts/james-joyce-a-portrait-of-the-artist-as-a-young-man.txt",
    "texts/james-joyce-dubliners.txt",
    "texts/james-joyce-ulysses.txt",
]

# Read each file with an explicit encoding: without it, open() falls back to
# the platform's locale encoding, so the same notebook could decode the books
# differently (or fail) on another machine.
# NOTE(review): assumes the text files are stored as UTF-8 — confirm.
texts = []
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        texts.append(file.read())

# Join with a newline rather than an empty string so that the last word of one
# file and the first word of the next are never fused into a single token.
corpus = "\n".join(texts)

In [22]:
# Normalise the corpus: lowercase it, strip every character that the
# preprocessor reports as unknown, then clean up additional spaces.

corpus = preprocess.lower(corpus)
unknown_chars = preprocess.unknown_chars(corpus)
corpus = preprocess.remove_additional_spaces(
    preprocess.remove_unknown_chars(corpus, unknown_chars=unknown_chars)
)

# Persist the cleaned corpus (per the original note it is written to
# text_preprocessed.txt; the destination is chosen inside save_text).
preprocess.save_text(corpus)

In [23]:
# Compute the transition probabilities from the preprocessed corpus.
# The resulting `probability_matrix.probability_table` is what the
# cipher-breaking cells further down pass to CipherBreaker.

probability_matrix = ProbabilityMatrix(corpus)
probability_matrix.compute_probability_table()

In [24]:
# Save the 2-character data and the probability table.
# NOTE(review): presumably both are written to disk; the output locations are
# decided inside ProbabilityMatrix and are not visible from this notebook.
probability_matrix.save_all_2_chars()
probability_matrix.save_probability_table()

In [25]:
# Sample plaintext for the experiments below. It is normalised, encoded and
# then attacked in the following cells; the commented-out assignments after it
# are alternative samples that can be swapped in.

text = "There were better sense in the sad mechanic exercise of determining the reason of its absence where it is not. In the novels of the last hundred years there are vast numbers of young ladies with whom it might be a pleasure to fall in love; there are at least five with whom, as it seems to me, no man of taste and spirit can help doing so."
# text = "I do not believe a word of it, my dear. If he had been so very agreeable, he would have talked to Mrs. Long. But I can guess how it was"
# text = "Your plan is a good one,” replied Elizabeth, “where nothing is in question but the desire of being well married; and if I were determined to get a rich husband, or any husband, I dare say I should adopt it. But these are not Jane’s feelings"
# text = "she is not acting by design. As yet she cannot even be certain of the degree of her own regard, nor of its reasonableness. She has known him only a fortnight. She danced four dances with him at Meryton; she saw him one morning at his own house, and has since dined in company with him four times."

In [29]:
# Round-trip experiment: normalise the sample text, encode it with two
# generated ciphers, then attempt a decode using only the first cipher.

text = preprocess.lower(text)
text = preprocess.remove_additional_spaces(
    preprocess.remove_unknown_chars(
        text, unknown_chars=preprocess.unknown_chars(text)
    )
)

# Two generated ciphers; generation order is cipher1 first, then cipher2.
cipher1 = cipher_generator.generate_cipher()
cipher2 = cipher_generator.generate_cipher()

encoded_text = text_encoder.encode_text_double_cipher(text, cipher1, cipher2)

# decode_text is not meant to undo a double cipher; cipher1 is passed on its
# own purely to see what a single-cipher decode of this message looks like.
decoded_text = text_decoder.decode_text(encoded_text, cipher1)

print("Original Text:", text)
print("Encoded Text:", encoded_text)
print("Decoded Text:", decoded_text)

Original Text: there were better sense in the sad mechanic exercise of determining the reason of its absence where it is not in the novels of the last hundred years there are vast numbers of young ladies with whom it might be a pleasure to fall in love there are at least five with whom as it seems to me no man of taste and spirit can help doing so
Encoded Text: mlkep okip ypgmpe tpptp qa mlk won fkslvadx kekesdtk wz kkgpizdpqar gfp ekowxa wz qgw vywkasp olpik dm dw awm qp mfk awgkiw wd glp hvwm fqakepk jkoit gfkip vip botm pqfbpew xd nxjpr ivnqkw odgl olxz qm fqulm yk o rhkvtqip mw doii qp iwbp mlpek vip og ipvwm zqbk oqgf ofwz vt dg wkpfw gx zp ax fva wz gvwmk vpn trdedm xva fpir kwqpr tx
Decoded Text: thern aeln jnvtnr snnsn up the oad mexhkpic ererxise of eevnlfinupg vmn reaocp of uvo kjoepxn ahnle it io pot un tme povelo oi vhn qkot muperne yeals vmeln kln bast numbnro ci dcyng lkdueo aivh ahcf ut muzht je a gqeksuln to iall un lobn thnre kln av lnkot fube auvm amof ks iv oenmo vc 

In [None]:
# Break the cipher with MCMC using a single starting point.
# The breaker is given the cipher generator, the encoded text and the
# transition probability table learned from the corpus; the ciphers used for
# encoding (cipher1 / cipher2) are not passed to it.
cipher_breaker = CipherBreaker(
    cipher_generator=cipher_generator,
    ciphered_text=encoded_text,
    probability_table=probability_matrix.probability_table,
)

# 100,000 iterations, with print_interval=100 controlling how often progress
# is reported.
cipher_breaker.break_cipher(iterations=100000, print_interval=100)

In [None]:
# Inspect the candidates found by the single-start run.
cipher_breaker.extract_best(
    n_extract=10, return_likelihood=True
)  # the 10 best decoded messages, each returned together with its likelihood

In [15]:
# Show only the first field of the top-ranked entry: the decoded message
# itself (a plain string, as the output saved with this cell shows).
cipher_breaker.extract_best(n_extract=10, return_likelihood=True)[0][0]

'shege gege uessig nerni em the nak bixharex eyenxene ly petinfomord the nearcr cy otr aurirve ghine es er rct er swi mcbidn co swe dans wurkgep ciann thene age wars rubuinn co flurt dapoen jesh ghlf et botws ze a ldianzge sc oaid em ilwi swige ane as iears yobi jotw gwlf ar ot neibr tl fe ml far co santi amp rlonot var wedl klort nl'

In [18]:
import numpy as np

# Character-level accuracy of the single-start run: the fraction of positions
# at which the top-ranked decoded message matches the preprocessed plaintext.
best_decoding = cipher_breaker.extract_best(n_extract=10, return_likelihood=True)[0][0]
decoded_chars = np.array(list(best_decoding))
reference_chars = np.array(list(text))

print("MCMC accuracy:", np.mean(decoded_chars == reference_chars))

MCMC accuracy: 0.4161676646706587


In [33]:
# Break the cipher with MCMC using 5 starting points.
# A fresh CipherBreaker is built with the same inputs as the single-start run
# so that the two sets of results are kept apart.
cipher_breaker_nstart = CipherBreaker(
    cipher_generator=cipher_generator,
    ciphered_text=encoded_text,
    probability_table=probability_matrix.probability_table,
)
# NOTE(review): the saved output reports progress at iterations 0, 100, 200, ...
# for each start, so 1,000,000 iterations with print_interval=100 yields a very
# long log (the output stored with this cell is cut off) — consider a larger
# interval.
cipher_breaker_nstart.break_cipher_nstart(
    iterations=1000000, print_interval=100, nstart=5
)

Iter 0 of start 1: vdsfu csyu bukvuf juuju zr vds wcp xstdnrmi sfsftmjs wh sskuyhmuzre kxu fscwir wh zkw nbwsrtu cduys mv mw rwv zu vxs rwksyw wm kdu anwv xzrsfus qscyj kxsyu nyu ocjv uzxoufw im pique ynpzsw cmkd cdih zv xzgdv bs c easnjzyu vw mcyy zu ywou vdufs nyu ck yunwv hzos czkx cxwh nj mk wsuxw ki hu ri xnr wh knwvs nup jemfmv inr xuye swzue ji
Iter 100 of start 1: suowe come vetsew yeeye rd suo ick hopuldan owowpayo ig ootemgaerdf the wocind ig rti lviodpe cuemo as ai dis re sho ditomi ia tue zlis hrdoweo jocmy thome lme bcys erhbewi na knjef mlkroi catu cung rs hrqus vo c fzolyrme si acmm re mibe suewo lme ct melis grbo crth chig ly at ioehi tn ge dn hld ig tliso lek yfawas nld hemf oiref yn
Iter 200 of start 1: suowe come vetsew yeeye rd suo ick hojuldan owowjayo ig ootemgaerdp the wocind ig rti lviodje cuemo as ai dis re sho ditomi ia tue blis hrdoweo zocmy thome lme fcys erhfewi na knzep mlkroi catu cung rs hrqus vo c pbolyrme si acmm re mife suewo lme ct melis grfo crth ch

In [None]:
# Inspect the candidates found by the multi-start run.
cipher_breaker_nstart.extract_best(
    n_extract=10, return_likelihood=True
)  # the 10 best decoded messages, each returned together with its likelihood

In [34]:
import numpy as np

# Character-level accuracy of the multi-start run: the fraction of positions
# at which the top-ranked decoded message matches the preprocessed plaintext.
nstart_best_decoding = cipher_breaker_nstart.extract_best(
    n_extract=1, return_likelihood=True
)[0][0]
nstart_decoded_chars = np.array(list(nstart_best_decoding))
nstart_reference_chars = np.array(list(text))

print("MCMC accuracy:", np.mean(nstart_decoded_chars == nstart_reference_chars))

MCMC accuracy: 0.47305389221556887


As expected, the sampler eventually converges towards one of the two ciphers, so once it settles on one of them it makes sense that the accuracy is about 0.5.
To try: vary the probability that the encoding is skewed towards one of the two ciphers... MCMC should then reach an accuracy equal to that probability (or, in any case, slightly lower).


In [None]:
# Generate an animation for the multi-start run (what is animated and where
# it is rendered or saved is defined in CipherBreaker.generate_animation,
# which is not shown in this notebook).
cipher_breaker_nstart.generate_animation()