In [3]:
import os
import io
from random import shuffle
import operator
from functools import reduce
import numpy as np
from math import sqrt

In [4]:
class MonoalphabeticEncoding:
    """Monoalphabetic substitution between two orderings of the same alphabet.

    The symbol at position ``i`` of ``alphabet`` is encoded as the symbol at
    position ``i`` of ``alphabet2``; decoding is the reverse lookup.
    """

    def __init__(self, alphabet, alphabet2):
        """Store both alphabets and build the symbol -> position lookup tables.

        Raises:
            AssertionError: if the two alphabets differ in length or do not
                consist of the same set of symbols.
        """
        self.alphabet = alphabet
        self.alphabet2 = alphabet2
        assert (
            set(self.alphabet2) == set(self.alphabet)
            and len(self.alphabet2) == len(self.alphabet)
        )

        # Position of every symbol inside its own alphabet.
        self.alp = dict(zip(alphabet, range(len(alphabet))))
        self.alp2 = dict(zip(self.alphabet2, range(len(self.alphabet2))))

    def encode(self, symbol):
        """Return the ``alphabet2`` symbol at the position ``symbol`` has in ``alphabet``."""
        position = self.alp[symbol]
        return self.alphabet2[position]

    def decode(self, symbol):
        """Return the ``alphabet`` symbol at the position ``symbol`` has in ``alphabet2``."""
        position = self.alp2[symbol]
        return self.alphabet[position]

    def decodeString(self, string):
        """Decode every symbol of ``string`` and return the concatenated result."""
        return "".join(self.decode(symbol) for symbol in string)

    def encodeString(self, string):
        """Encode every symbol of ``string`` and return the concatenated result."""
        return "".join(self.encode(symbol) for symbol in string)


In [5]:
# Global variables shared by the cells below.
# Languages that FrequencyAnalysis accepts (checked by its assertions).
available_languages = ["Polish", "English"]
# labFolder: directory name used when building "../<labFolder>/..." paths.
# tmpFile: scratch file that suggestLanguageForSample writes the sample to.
labFolder, tmpFile = "Lab4", "../tmp/file.txt"

class FrequencyAnalysis:
    """Utility class for performing frequency analysis over a text sample using the available languages.

    An instance can hold a reference language sample (read from
    ``"../" + labFolder + sample_dir + <language> + ".txt"`` and passed through
    ``formatText``), the sorted list of distinct symbols of that sample, and a
    ciphertext. The static helpers compute relative symbol frequencies, pair
    symbols of two frequency tables greedily, and pick the language whose
    sample matches a given text best.
    """
    # static variables
    sample_postfix = ".txt"  # appended to the language name to form the sample file name
    sequence_len_max = 1  # not referenced by any method of this class

    def __init__(self, sample_dir="/text_samples/", sample_lang=None):
        """Create an analyser, optionally loading the sample of ``sample_lang`` right away.

        Args:
            sample_dir: path fragment (with leading and trailing slash) inserted
                between the lab folder and the sample file name.
            sample_lang: one of ``available_languages``, or None to defer loading
                until ``forLanguage`` is called.

        Raises:
            AssertionError: if ``sample_lang`` is given but not in ``available_languages``.
        """
        self.sample_lang = sample_lang
        self.sample_dir = sample_dir
        self.sample = None
        self.alphabet = None
        if sample_lang is not None:
            assert sample_lang in available_languages
            self._init_lang_sample()
        self.ciphertext = None

    def _init_lang_sample(self):
        """Load the sample of the current language and derive its sorted alphabet."""
        self.sample = self._read_sample()
        self.alphabet = sorted(set(self.sample))

    def _read_sample(self):
        """Return the formatted text of the sample file of the current language."""
        fname = self.sample_dir + self.sample_lang + FrequencyAnalysis.sample_postfix
        file_path = "../" + labFolder + fname
        return FrequencyAnalysis.formatText(file_path=file_path)

    def forCiphertext(self, ciphertext):
        """Attach ``ciphertext`` to this analyser and return ``self`` for chaining."""
        self.ciphertext = ciphertext
        return self

    def forLanguage(self, language):
        """Switch to ``language``, (re)load its sample and return ``self`` for chaining.

        Raises:
            AssertionError: if ``language`` is not in ``available_languages``.
        """
        assert language in available_languages
        self.sample_lang = language
        self._init_lang_sample()
        return self

    @staticmethod
    def formatText(remove_nums="T", file_path="../tmp/file.txt"):
        """Run the external ``../Lab2/readData.py`` script on a file and return its output.

        The script is invoked through the shell with ``file_path`` as input and
        ``"../" + labFolder + "/text_out.txt"`` as output, followed by a fixed
        set of flags; the output file is then read back as UTF-8.

        Args:
            remove_nums: first flag forwarded to the script
                (presumably "T"/"N" for true/false -- confirm against readData.py).
            file_path: path of the text file to format.

        Returns:
            The content of the output file written by the script.

        Raises:
            AssertionError: if the script exits with a non-zero status.
        """
        remove_sym = "T"
        table_format = "N"
        to_upper = "N"
        print_file = "N"
        # The surrounding escaped quotes are part of the values: they quote the
        # arguments for the shell.
        replace_symbols = "\"	–śèéá<>©âã£½’«»àæ—…\""
        replacements = "\"  seea   aa     a   \""
        f_in = file_path
        f_out = "../" + labFolder + "/text_out.txt"
        args = [remove_nums, remove_sym, table_format, to_upper, print_file, replace_symbols, replacements]
        # NOTE(review): the command line is built by string interpolation, so a
        # file_path containing quotes or shell metacharacters would break it.
        cmd = "python ../Lab2/readData.py \"%s\" \"%s\" " % (f_in, f_out) + " ".join(args)
        # The command is executed outside of an `assert` statement so that it
        # still runs when assertions are disabled (python -O). AssertionError is
        # kept as the failure type for existing callers.
        status = os.system(cmd)
        if status != 0:
            raise AssertionError("command failed with status %s: %s" % (status, cmd))
        with io.open(f_out, mode="r", encoding="utf-8") as f:
            return f.read()

    @staticmethod
    def computeSymbolFrequencies(text):
        """Return the relative frequency of every symbol occurring in ``text``.

        Args:
            text: any sized iterable of hashable, mutually sortable symbols
                (a string in this notebook).

        Returns:
            A dict mapping each distinct symbol to ``count / len(text)``, with
            keys inserted in sorted order. Empty input yields an empty dict.
        """
        total = len(text)
        if total == 0:
            # Nothing to count; also avoids dividing by zero below.
            return dict()
        counts = dict()
        for symbol in text:
            counts[symbol] = counts.get(symbol, 0) + 1
        return dict((symbol, counts[symbol] / total) for symbol in sorted(counts))

    @staticmethod
    def matchValues(list1, list2):
        """Greedily pair each key of ``list1`` with the closest-valued unused key of ``list2``.

        Keys of ``list1`` are visited in order; each one takes the remaining key
        of ``list2`` whose value has the smallest absolute difference (the first
        such key on ties), and that key is then no longer available.

        Args:
            list1: anything ``dict()`` accepts, mapping symbols to numbers.
            list2: same shape as ``list1``; its keys must provide ``upper()``.

        Returns:
            A list of ``(key1, key2.upper(), abs_difference)`` tuples. It stops
            early once ``list2`` has no keys left.
        """
        reference = dict(list1)
        remaining = dict(list2)
        matched = []
        for x, x_value in reference.items():
            if not remaining:
                break
            y_pref = min(remaining, key=lambda y: abs(x_value - remaining[y]))
            matched.append((x, y_pref.upper(), abs(x_value - remaining[y_pref])))
            del remaining[y_pref]
        return matched

    @staticmethod
    def suggestLanguageForSample(sample):
        """Print and return the available language whose sample best matches ``sample``.

        ``sample`` is written to ``tmpFile`` and formatted with ``formatText``.
        For every language the symbol frequencies of its reference sample are
        matched against those of the formatted text with ``matchValues``, and
        the language with the smallest sum of differences wins (the first one
        on ties).

        Returns:
            The selected language name, or None if ``available_languages`` is empty.
        """
        # NOTE(review): the file is written with the platform default encoding
        # while formatText reads its output as UTF-8 -- confirm what readData.py expects.
        with open(tmpFile, "w+") as f:
            f.write(sample)
        text = FrequencyAnalysis.formatText(file_path=tmpFile)
        # Independent of the candidate language, so computed once.
        text_freq = FrequencyAnalysis.computeSymbolFrequencies(text)
        min_error = None
        language_pref = None

        for language in available_languages:
            fa = FrequencyAnalysis().forLanguage(language)
            matched = FrequencyAnalysis.matchValues(
                FrequencyAnalysis.computeSymbolFrequencies(fa.sample), text_freq)
            error = sum(x[2] for x in matched)
            if min_error is None or error < min_error:
                min_error = error
                language_pref = language

        print("Suggested language: %s" % language_pref)
        return language_pref


In [6]:
# Format ./text.txt with formatText; the returned string is shown as the cell output.
FrequencyAnalysis.formatText(file_path="./text.txt")


'ludzieztwojejplanetypowiedzialmalyksiazehodujapiectysiecyrozwjednymogrodzieinieznajdujawnichtegoczegoszukajaatymczasemtoczegoszukajamozebycukrytewjednejrozylubwodrobiniewodyleczoczysaslepeszukacnalezysercem'

In [426]:
# Format the ciphertext file of this lab and preview its first 500 characters.
ciphertext_path = "".join(("../", labFolder, "/2.txt"))
text = FrequencyAnalysis.formatText(file_path=ciphertext_path, remove_nums='T')
print("Ciphertext (reformatted to lowercase):", "", text[:500], sep="\n")

Ciphertext (reformatted to lowercase):

stlboempeoejrglarhejlareprstlboempwgoobepeoejredbyrhegasetplamembetplfrhegaretanrglanocutylarhecutymeergailanuiuprjlareprdnyprherwljlareprdnypntenadnuiuprlaenjhdnyrhetewgoobestlboempflthlutpenjhdnyrheprudearpmuprwtgrerhegtplourglaplanltupoerretsnsetplaoyrheprnadntddtnwgairllopseapprtngihredieperjnadstgareddgjrglantgepntenoolwedupgainayeoejrtlagjrlloltstgaredmnretgnogpalrnoolwedrhepjtgsrpphluodalrjlarnganayrexranmeltmntkftlmwhgjhrheprudearjnabegdeargfgedrheonaiunielfrhejlareprgpeaiogphplourglapwt


In [431]:
# Pick the available language whose sample frequencies are closest to the ciphertext's.
suggestedLanguage = FrequencyAnalysis.suggestLanguageForSample(text)
suggestedLanguage

Suggested language: English


'English'

In [432]:
# Load the reference sample of the suggested language.
fa = FrequencyAnalysis().forLanguage(suggestedLanguage)

# eng_freq: symbol frequencies of the language sample; freq: symbol frequencies of the ciphertext.
eng_freq = fa.computeSymbolFrequencies(fa.sample)
freq = FrequencyAnalysis.computeSymbolFrequencies(text)

In [433]:
# Substitution key under construction; addLetter stores ciphertext symbol -> upper-case decoded letter here.
key = dict()

In [434]:
def addLetter(letter,decoded_letter):
    """Record that ciphertext symbol ``letter`` decodes to ``decoded_letter``.

    The upper-cased decoded letter is stored in the global ``key``. Both symbols
    are then removed from the tables of still-unassigned candidates:
    ``letter`` from ``freq`` (ciphertext frequencies) and the lower-cased
    ``decoded_letter`` from ``eng_freq`` (language frequencies, whose keys are
    lower-case). Symbols that are already absent are ignored, so the call can
    be repeated safely.

    Args:
        letter: symbol as it appears in the ciphertext.
        decoded_letter: plaintext letter it stands for, in either case.
    """
    key[letter] = decoded_letter.upper()
    # pop(..., None) keeps the removal best-effort without hiding unrelated errors.
    freq.pop(letter, None)
    eng_freq.pop(decoded_letter.lower(), None)
    
def partlyDecode(text,possibleKey,length=200):
    """Apply a partial key to ``text`` and return the first ``length`` characters.

    Symbols present in ``possibleKey`` are replaced by the upper-cased value
    stored for them; all other symbols are kept unchanged. The result is cut
    with a plain slice ``[:length]``, so ``None`` returns everything and a
    negative value drops characters from the end.
    """
    pieces = []
    for symbol in text:
        if symbol in possibleKey:
            pieces.append(possibleKey[symbol].upper())
        else:
            pieces.append(symbol)
    return "".join(pieces)[:length]

In [441]:
# Show the frequency table of the ciphertext and of the suggested language, separated by blank lines.
print("Ciphertext frequency analysis: \n%s" % (freq,))
print("\n")
print("Suggested language frequency analysis (%s): \n%s" % (suggestedLanguage, eng_freq))

Ciphertext frequency analysis: 
{'a': 0.07082913729439409, 'b': 0.017119838872104734, 'c': 0.001342732460557234, 'd': 0.044310171198388724, 'e': 0.12789526686807653, 'f': 0.018798254447801276, 'g': 0.06579389056730446, 'h': 0.04296743873783149, 'i': 0.0157771064115475, 'j': 0.02920443101711984, 'k': 0.005035246727089627, 'l': 0.07082913729439409, 'm': 0.028197381671701913, 'n': 0.08526351124538435, 'o': 0.04699563611950319, 'p': 0.0711648204095334, 'r': 0.10070493454179255, 's': 0.030211480362537766, 't': 0.05572339711312521, 'u': 0.029875797247398455, 'v': 0.008392077878482713, 'w': 0.016112789526686808, 'x': 0.0036925142665323934, 'y': 0.011748909029875798, 'z': 0.002014098690835851}


Suggested language frequency analysis (English): 
{'a': 0.07977343335325318, 'b': 0.018521749136084748, 'c': 0.02832083084335203, 'd': 0.03576293645815161, 'e': 0.12129333031442432, 'f': 0.02352893141756337, 'g': 0.022994436051712426, 'h': 0.0493443152334891, 'i': 0.07039378203724393, 'j': 0.0052967006

In [374]:
# Initial assumptions based on the frequency analysis:
# ciphertext 'e', 'b', 'm' and 'x' are taken to decode to themselves.
addLetter('e','e')
addLetter('b','b')
addLetter('m','m')
addLetter('x','x')

In [375]:
# Preview the first 100 characters; symbols already in the key appear in upper case.
partlyDecode(text,key,length=100)

'stlBoEMpEoEjrglarhEjlarEprstlBoEMpwgooBEpEoEjrEdByrhEgasEtplaMEMBEtplfrhEgarEtanrglanocutylarhEcutyM'

In [376]:
# Assuming MEMBEt = MEMBER, i.e. ciphertext 't' decodes to 'R'
addLetter('t','r')
partlyDecode(text,key,length=110)

'sRlBoEMpEoEjrglarhEjlarEprsRlBoEMpwgooBEpEoEjrEdByrhEgasERplaMEMBERplfrhEgarERanrglanocuRylarhEcuRyMEErgailanu'

In [377]:
# Assuming MEEr = MEET, i.e. ciphertext 'r' decodes to 'T'
addLetter('r','t')
partlyDecode(text,key,length=100)

'sRlBoEMpEoEjTglaThEjlaTEpTsRlBoEMpwgooBEpEoEjTEdByThEgasERplaMEMBERplfThEgaTERanTglanocuRylaThEcuRyM'

In [378]:
# Assuming ThE = THE, i.e. ciphertext 'h' decodes to 'H'
addLetter('h','h')
partlyDecode(text,key,length=230)

'sRlBoEMpEoEjTglaTHEjlaTEpTsRlBoEMpwgooBEpEoEjTEdByTHEgasERplaMEMBERplfTHEgaTERanTglanocuRylaTHEcuRyMEETgailanuiupTjlaTEpTdnypTHETwljlaTEpTdnypnREnadnuiupTlaEnjHdnyTHEREwgooBEsRlBoEMpflRHluRpEnjHdnyTHEpTudEaTpMupTwRgTETHEgRplouTgla'

In [379]:
# Assuming THEgR = THEIR, i.e. ciphertext 'g' decodes to 'I'
addLetter('g','i')
partlyDecode(text,key,length=610)

'sRlBoEMpEoEjTIlaTHEjlaTEpTsRlBoEMpwIooBEpEoEjTEdByTHEIasERplaMEMBERplfTHEIaTERanTIlanocuRylaTHEcuRyMEETIailanuiupTjlaTEpTdnypTHETwljlaTEpTdnypnREnadnuiupTlaEnjHdnyTHEREwIooBEsRlBoEMpflRHluRpEnjHdnyTHEpTudEaTpMupTwRITETHEIRplouTIlaplanlRupoETTERsnsERplaoyTHEpTnadnRddRnwIaiTllopsEappTRnIiHTEdiEpETjnadsRIaTEddIjTIlanRIEpnREnoolwEdupIainayEoEjTRlaIjTllolRsRIaTEdMnTERInoIpalTnoolwEdTHEpjRIsTppHluodalTjlaTnIanayTEXTanMElRMnRkfRlMwHIjHTHEpTudEaTjnaBEIdEaTIfIEdTHEonaiuniElfTHEjlaTEpTIpEaioIpHplouTIlapwRITTEaIalTHERonaiuniEpwIooalTBEEvnounTEdBlTHjlaTEpTdnypTHEREwIooBETHREEEXnMpIasERplajlMsETITlRpHnvETHEEXnMnTTHE'

In [380]:
# Assuming EXnM = EXAM, i.e. ciphertext 'n' decodes to 'A'
addLetter('n','a')
partlyDecode(text,key,length=600)

'sRlBoEMpEoEjTIlaTHEjlaTEpTsRlBoEMpwIooBEpEoEjTEdByTHEIasERplaMEMBERplfTHEIaTERaATIlaAocuRylaTHEcuRyMEETIailaAuiupTjlaTEpTdAypTHETwljlaTEpTdAypAREAadAuiupTlaEAjHdAyTHEREwIooBEsRlBoEMpflRHluRpEAjHdAyTHEpTudEaTpMupTwRITETHEIRplouTIlaplaAlRupoETTERsAsERplaoyTHEpTAadARddRAwIaiTllopsEappTRAIiHTEdiEpETjAadsRIaTEddIjTIlaARIEpAREAoolwEdupIaiAayEoEjTRlaIjTllolRsRIaTEdMATERIAoIpalTAoolwEdTHEpjRIsTppHluodalTjlaTAIaAayTEXTaAMElRMARkfRlMwHIjHTHEpTudEaTjAaBEIdEaTIfIEdTHEoAaiuAiElfTHEjlaTEpTIpEaioIpHplouTIlapwRITTEaIalTHERoAaiuAiEpwIooalTBEEvAouATEdBlTHjlaTEpTdAypTHEREwIooBETHREEEXAMpIasERplajlMsETITlRpHAvETH'

In [381]:
# Assuming HAvE = HAVE, i.e. ciphertext 'v' decodes to 'V'
addLetter('v','v')
partlyDecode(text,key,length=665)

'sRlBoEMpEoEjTIlaTHEjlaTEpTsRlBoEMpwIooBEpEoEjTEdByTHEIasERplaMEMBERplfTHEIaTERaATIlaAocuRylaTHEcuRyMEETIailaAuiupTjlaTEpTdAypTHETwljlaTEpTdAypAREAadAuiupTlaEAjHdAyTHEREwIooBEsRlBoEMpflRHluRpEAjHdAyTHEpTudEaTpMupTwRITETHEIRplouTIlaplaAlRupoETTERsAsERplaoyTHEpTAadARddRAwIaiTllopsEappTRAIiHTEdiEpETjAadsRIaTEddIjTIlaARIEpAREAoolwEdupIaiAayEoEjTRlaIjTllolRsRIaTEdMATERIAoIpalTAoolwEdTHEpjRIsTppHluodalTjlaTAIaAayTEXTaAMElRMARkfRlMwHIjHTHEpTudEaTjAaBEIdEaTIfIEdTHEoAaiuAiElfTHEjlaTEpTIpEaioIpHplouTIlapwRITTEaIalTHERoAaiuAiEpwIooalTBEEVAouATEdBlTHjlaTEpTdAypTHEREwIooBETHREEEXAMpIasERplajlMsETITlRpHAVETHEEXAMATTHEAuBijAMsupIaBoAilEViRAdfRlMTlplfIATIMETHEEXAMpTARTpATla'

In [384]:
# Assuming pTARTpAT = STARTSAT, i.e. ciphertext 'p' decodes to 'S'
addLetter('p','S')
partlyDecode(text,key,length=700)

'sRlBoEMSEoEjTIlaTHEjlaTESTsRlBoEMSwIooBESEoEjTEdByTHEIasERSlaMEMBERSlfTHEIaTERaATIlaAocuRylaTHEcuRyMEETIailaAuiuSTjlaTESTdAySTHETwljlaTESTdAySAREAadAuiuSTlaEAjHdAyTHEREwIooBEsRlBoEMSflRHluRSEAjHdAyTHESTudEaTSMuSTwRITETHEIRSlouTIlaSlaAlRuSoETTERsAsERSlaoyTHESTAadARddRAwIaiTlloSsEaSSTRAIiHTEdiESETjAadsRIaTEddIjTIlaARIESAREAoolwEduSIaiAayEoEjTRlaIjTllolRsRIaTEdMATERIAoISalTAoolwEdTHESjRIsTSSHluodalTjlaTAIaAayTEXTaAMElRMARkfRlMwHIjHTHESTudEaTjAaBEIdEaTIfIEdTHEoAaiuAiElfTHEjlaTESTISEaioISHSlouTIlaSwRITTEaIalTHERoAaiuAiESwIooalTBEEVAouATEdBlTHjlaTESTdAySTHEREwIooBETHREEEXAMSIasERSlajlMsETITlRSHAVETHEEXAMATTHEAuBijAMsuSIaBoAilEViRAdfRlMTlSlfIATIMETHEEXAMSTARTSATlaoIaESTudEaTSAadTEAMSfRlMEuRlsEASIAA'

In [385]:
# Assuming MuST = MUST and wRITE = WRITE, i.e. ciphertext 'u' decodes to 'U' and 'w' to 'W'

addLetter('u','U')
addLetter('w','W')
partlyDecode(text,key,length=215)

'sRlBoEMSEoEjTIlaTHEjlaTESTsRlBoEMSWIooBESEoEjTEdByTHEIasERSlaMEMBERSlfTHEIaTERaATIlaAocURylaTHEcURyMEETIailaAUiUSTjlaTESTdAySTHETWljlaTESTdAySAREAadAUiUSTlaEAjHdAyTHEREWIooBEsRlBoEMSflRHlURSEAjHdAyTHESTUdEaTSMUSTWRI'

In [386]:
# Assuming STUdEaTS = STUDENTS, i.e. ciphertext 'd' decodes to 'D' and 'a' to 'N'
addLetter('d','D')
addLetter('a','n')
partlyDecode(text,key,length=235)

'sRlBoEMSEoEjTIlNTHEjlNTESTsRlBoEMSWIooBESEoEjTEDByTHEINsERSlNMEMBERSlfTHEINTERNATIlNAocURylNTHEcURyMEETINilNAUiUSTjlNTESTDAySTHETWljlNTESTDAySAREANDAUiUSTlNEAjHDAyTHEREWIooBEsRlBoEMSflRHlURSEAjHDAyTHESTUDENTSMUSTWRITETHEIRSloUTIlNSlNAl'

In [389]:
# Assuming THESTUDENTSMUSTWRITETHEIRSloUTIlNS = THESTUDENTSMUSTWRITETHEIRSOLUTIONS,
# i.e. ciphertext 'l' decodes to 'O' and 'o' to 'L'
addLetter('l','o')
addLetter('o','l')
partlyDecode(text,key,length=50)

'sROBLEMSELEjTIONTHEjONTESTsROBLEMSWILLBESELEjTEDBy'

In [391]:
# Assuming sROBLEMS = PROBLEMS, i.e. ciphertext 's' decodes to 'P'
addLetter('s','p')
partlyDecode(text,key,length=50)

'PROBLEMSELEjTIONTHEjONTESTPROBLEMSWILLBESELEjTEDBy'

In [400]:
# Assuming SELEjTION = SELECTION and jONTEST = CONTEST, i.e. ciphertext 'j' decodes to 'C'
addLetter('j','c')
partlyDecode(text,key,length=110)

'PROBLEMSELECTIONTHECONTESTPROBLEMSWILLBESELECTEDByTHEINPERSONMEMBERSOfTHEINTERNATIONALcURyONTHEcURyMEETINiONAU'

In [401]:
# Assuming INTERNATIONALcURy = INTERNATIONALJURY, i.e. ciphertext 'c' decodes to 'J'
# Assuming ByTHE = BYTHE, i.e. ciphertext 'y' decodes to 'Y'
# Assuming MEETINi = MEETING, i.e. ciphertext 'i' decodes to 'G'
addLetter('c','j')
addLetter('y','Y')
addLetter('i','G')
partlyDecode(text,key,length=70)

'PROBLEMSELECTIONTHECONTESTPROBLEMSWILLBESELECTEDBYTHEINPERSONMEMBERSOf'

In [404]:
# Assuming MEMBERSOf = MEMBERSOF, i.e. ciphertext 'f' decodes to 'F'
addLetter('f','F')
partlyDecode(text,key,length=430)

'PROBLEMSELECTIONTHECONTESTPROBLEMSWILLBESELECTEDBYTHEINPERSONMEMBERSOFTHEINTERNATIONALJURYONTHEJURYMEETINGONAUGUSTCONTESTDAYSTHETWOCONTESTDAYSAREANDAUGUSTONEACHDAYTHEREWILLBEPROBLEMSFORHOURSEACHDAYTHESTUDENTSMUSTWRITETHEIRSOLUTIONSONAORUSLETTERPAPERSONLYTHESTANDARDDRAWINGTOOLSPENSSTRAIGHTEDGESETCANDPRINTEDDICTIONARIESAREALLOWEDUSINGANYELECTRONICTOOLORPRINTEDMATERIALISNOTALLOWEDTHESCRIPTSSHOULDNOTCONTAINANYTEXTNAMEORMARkFROMWHI'

In [412]:
# Assuming MARk = MARK, i.e. ciphertext 'k' decodes to 'K'
addLetter('k','K')
partlyDecode(text,key,length=1075)

'PROBLEMSELECTIONTHECONTESTPROBLEMSWILLBESELECTEDBYTHEINPERSONMEMBERSOFTHEINTERNATIONALJURYONTHEJURYMEETINGONAUGUSTCONTESTDAYSTHETWOCONTESTDAYSAREANDAUGUSTONEACHDAYTHEREWILLBEPROBLEMSFORHOURSEACHDAYTHESTUDENTSMUSTWRITETHEIRSOLUTIONSONAORUSLETTERPAPERSONLYTHESTANDARDDRAWINGTOOLSPENSSTRAIGHTEDGESETCANDPRINTEDDICTIONARIESAREALLOWEDUSINGANYELECTRONICTOOLORPRINTEDMATERIALISNOTALLOWEDTHESCRIPTSSHOULDNOTCONTAINANYTEXTNAMEORMARKFROMWHICHTHESTUDENTCANBEIDENTIFIEDTHELANGUAGEOFTHECONTESTISENGLISHSOLUTIONSWRITTENINOTHERLANGUAGESWILLNOTBEEVALUATEDBOTHCONTESTDAYSTHEREWILLBETHREEEXAMSINPERSONCOMPETITORSHAVETHEEXAMATTHEAUBGCAMPUSINBLAGOEVGRADFROMTOSOFIATIMETHEEXAMSTARTSATONLINESTUDENTSANDTEAMSFROMEUROPEASIAANDAFRICAMUSTTAKETHEEXAMBETWEENAMANDAMUTCSAMEASTHEINPERSONEXAMONLINESTUDENTSANDTEAMSFROMAMERICAMUSTTAKETHEEXAMBETWEENPMANDPMUTCALLMEMBERSOFEACHTEAMSHOULDSTARTATTHESAMETIMEWITHDOWNLOADINGANDPRINTINGTHEPROBLEMSOPENINGANDCLOSINGCEREMONYTHEOPENINGANDCLOSINGCEREMONIESWILLBEHELDINBLAGOEVGRADWEPLANTORECORD

In [414]:
# Assuming ORGANIzE = ORGANIZE, i.e. ciphertext 'z' decodes to 'Z'

addLetter('z','z')
# length=None keeps the whole text. length=-1 would slice off the final
# character and silently leave it out of the check below.
decoded = partlyDecode(text,key,length=None)

# Decoded symbols are upper case, so any symbol that differs from its
# upper-case form has not been decoded yet.
undecoded = sorted(set(x for x in decoded if x != x.upper()))
if not undecoded:
    print('All letters have been decoded')
else:
    print('Symbols still undecoded: %s' % undecoded)

All letters have been decoded


In [415]:
# Rebuild the key with its entries ordered by the first character of each ciphertext symbol.
key = {symbol: key[symbol] for symbol in sorted(key, key=lambda symbol: symbol[0])}

In [417]:
# Print the final key (ciphertext symbol -> decoded upper-case letter).
print("key = %s"%key)

key = {'a': 'N', 'b': 'B', 'c': 'J', 'd': 'D', 'e': 'E', 'f': 'F', 'g': 'I', 'h': 'H', 'i': 'G', 'j': 'C', 'k': 'K', 'l': 'O', 'm': 'M', 'n': 'A', 'o': 'L', 'p': 'S', 'r': 'T', 's': 'P', 't': 'R', 'u': 'U', 'v': 'V', 'w': 'W', 'x': 'X', 'y': 'Y', 'z': 'Z'}
