In [48]:
import matplotlib.pyplot as plt
import numpy as np

from src.CipherBreaker import CipherBreaker
from src.CipherUtils import (
    TextDecoder,
    TextEncoder,
    CipherGenerator,
    TextPreProcessor,
)
from src.ProbabilityMatrix import ProbabilityMatrix

from difflib import SequenceMatcher

In [50]:
def similar(a, b):
    """
    Given two strings a, b it returns a percentage of matching characters among the two
    """

    return SequenceMatcher(None, a.replace(" ", ""), b.replace(" ", "")).ratio() 

In [51]:
# Initialize classes and obtain the matrix of probabilities by training on the provided text
cipher_generator = CipherGenerator()
preprocess = TextPreProcessor()
text_encoder = TextEncoder()
text_decoder = TextDecoder()


# List of text file paths to build our corpus (where we learn the transitions probs)
file_paths = [
    "texts/moby_dick.txt",
    "texts/shakespeare.txt",
    "texts/james-joyce-a-portrait-of-the-artist-as-a-young-man.txt",
    "texts/james-joyce-dubliners.txt",
    "texts/james-joyce-ulysses.txt",
]

texts = []
for file_path in file_paths:
    with open(file_path, "r") as file:
        texts.append(file.read())

corpus = "".join(texts)


# preprocess the text, removing extra-characters

corpus = preprocess.lower(corpus)
unknown_chars = preprocess.unknown_chars(corpus)
# print(unknown_chars)
corpus = preprocess.remove_unknown_chars(corpus, unknown_chars=unknown_chars)
corpus = preprocess.remove_additional_spaces(corpus)

preprocess.save_text(
    corpus
)  # save text after preprocessing inside text_preprocessed.txt


# compute the transition probs
probability_matrix = ProbabilityMatrix(corpus)
probability_matrix.compute_probability_table()

probability_matrix.save_all_2_chars()
probability_matrix.save_probability_table()

In [52]:
lengths = [2**(i+3) for i in range(8)]
lengths

[8, 16, 32, 64, 128, 256, 512, 1024]

In [53]:
# Obtain text of varying length from one of the texts we trained one I choose Moby Dick

with open("texts/moby_dick.txt", 'r') as input_file:
    full_text = input_file.read()

full_text = preprocess.lower(full_text)
unknown_chars = preprocess.unknown_chars(full_text)
# print(unknown_chars)
full_text = preprocess.remove_unknown_chars(full_text, unknown_chars=unknown_chars)
full_text = preprocess.remove_additional_spaces(full_text)


moby_dick_subtexts = [" ".join(full_text.split()[:lengths[i]]) for i in range(len(lengths))] # Contains the ones for the varying lengths

In [54]:
moby_dick_subtexts

['chapter loomings call me ishmael some years ago',
 'chapter loomings call me ishmael some years ago never mind how long precisely having little or',
 'chapter loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought i',
 'chapter loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought i would sail about a little and see the watery part of the world it is a way i have of driving off the spleen and regulating the circulation whenever i find myself',
 'chapter loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought i would sail about a little and see the watery part of the world it is a way i have of driving off the spleen and regulating the circulation whenever i fi

In [66]:
n_iterations = 10 # Number of times we want to encode and decode each string
max_iterations = 10000 # Number of starting iterations inside the codebreaker
n_start = 3 # Number of starting iterations inside the codebreaker

n_top_likelihood = 5 # Number of top likelihoods to consider to see if the string matched.

mean_accuracy = []

for subtext in moby_dick_subtexts:


    total_iterations = 0
    for i in range(n_iterations):
        cipher = cipher_generator.generate_cipher()
        encoded_text = text_encoder.encode_text(subtext, cipher)

        # Initialize a code breaker for that encoded message
        cipher_breaker_nstart = CipherBreaker(
            cipher_generator = cipher_generator, 
            ciphered_text=encoded_text,
            probability_table=probability_matrix.probability_table,
        )
        
        # Break the code and store the n_top_likelihood in a dictionary (avoid printing)
        cipher_breaker_nstart.break_cipher_nstart(iterations=max_iterations, print_interval=max_iterations, nstart = n_start)
        best_dict = cipher_breaker_nstart.extract_best(
                        n_extract=n_top_likelihood, return_likelihood=True
                    )  

        # Extract only the text and store it in a list
        best_dict_text = [t[0] for t in best_dict]
        print(best_dict_text)

        # Now we check the percentage of correct ones in the string we matched the most with
        total_iterations = total_iterations + max([ similar(subtext, best_dict_text[l]) for l in range(len(best_dict_text)) ])
    

    mean_accuracy.append(total_iterations/n_iterations)

Iter 0 of start 1: zaomgje kddwqhbf zokk wj qfawojk fdwj rjoef obd
Iter 0 of start 2: rgdplyx euuvknjc rdee vy kcgvdye cuvy sydxc dju
Iter 0 of start 3: glhovec qzzuptfm ghqq ue pmluheq mzue kehcm hfz
['urspqth ollandie usoo at nerasto elat ctshe sil', 'antwqsh ollicure atoo is cenitso elis dsthe trl', 'hricond tssamqle hitt an meraint esan wnide ils', 'prachen siitould pass te odrtaes dite meand ali', 'hricond tssamule hitt an meraint esan wnide ils']
Iter 0 of start 1: wudycob appfzemv wdaa fo zvufdoa vpfo jodbv dmp
Iter 0 of start 2: xzyqcvh essonbik xyee ov nkzoyve ksov fvyhk yis
Iter 0 of start 3: jsfliqx poovkrnt jfpp vq ktsvfqp tovq hqfxt fno
['waifret soongqch wiss ne ghanies hone beith ico', 'waipret soongqch wiss ne ghanies hone meith ico', 'waidret soongqch wiss ne ghanies hone beith ico', 'waifvet soongqch wiss ne ghanies hone beith ico', 'wailyet soongqch wiss ne ghanies hone meith ico']
Iter 0 of start 1: exrpzkw gyyhjqfs ergg hk jsxhrkg syhk ukrws rfy
Iter 0 of start 2: 

OverflowError: math range error

In [67]:
mean_accuracy # Ok overflow occured though on longer strings so avoid that

[0.325,
 0.4113924050632911,
 0.8027972027972028,
 0.9065934065934066,
 0.9964102564102564,
 0.9458699472759227]