In [1]:
import os
import io
from random import shuffle
import operator

In [62]:
#Global variables
available_languages = [
    "Polish",
    "English"
]
labFolder = "Lab4"
tmpFile = "./tmp/file.txt"

class FrequencyAnalysis:
    """utility class for performing the frequency analysis over a text sample using available languages"""
    #static variables
    sample_postfix = ".txt"
    sequence_len_max = 5
    
    def __init__(self, sample_dir="/text_samples/", sample_lang=None):
        self.sample_lang=sample_lang
        self.sample_dir=sample_dir
        self.sample=None
        self.alphabet=None
        if sample_lang is not None:
            assert sample_lang in available_languages
            self._init_lang_sample()
        self.ciphertext=None
        
    def _init_lang_sample(self):
        self.sample=self._read_sample()
        self.alphabet=sorted(list(set(self.sample)))
        
    def _read_sample(self):
        fname = self.sample_dir+self.sample_lang+FrequencyAnalysis.sample_postfix
        file_path = "../"+labFolder+fname
        return FrequencyAnalysis.formatText(file_path=file_path)
    
    @staticmethod
    def builder():
        return FrequencyAnalysis()
    
    def withCiphertext(self,ciphertext):
        self.ciphertext=ciphertext
        return self
    
    def withLanguage(self,language):
        assert language in available_languages
        self.sample_lang=language
        self._init_lang_sample()
        return self
    
    @staticmethod
    def formatText(remove_nums="T", file_path="../tmp/file.txt"):
        global labFolder
        remove_sym="T"
        table_format="N"
        to_upper="N"
        print_file="N"
        replace_symbols="\"	–śèéá<>©âã£½’«»àæ—…\""
        replacements="\"  seea   aa     a   \""
        f_in=file_path
        f_out="../"+labFolder+"/text_out.txt"
        args=[remove_nums,remove_sym,table_format,to_upper,print_file,replace_symbols,replacements]
        cmd="python ../Lab2/readData.py \"%s\" \"%s\" "%(f_in,f_out)+" ".join(args)
        assert os.system(cmd)==0
        text=None
        with io.open(f_out,mode="r",encoding="utf-8") as f:
            text=f.read()
        return text
    
    @staticmethod
    def computeSymbolFrequencies(text):
        syms=dict()
        for s in text:
            if not s in syms:
                syms[s]=1
            else:
                syms[s]+=1
        return dict((x,syms[x]/len(text)) for x in syms)
    
    @staticmethod
    def computeSequenceFrequencies(text, sequence_len=2, top=10):
        assert sequence_len<FrequencyAnalysis.sequence_len_max
        assert sequence_len<len(text)
        
        i_max = len(text)
        sequences = dict()
        sequence_count = len(text)-sequence_len+1
        
        for i in range(i_max):
            if (i_max-i) > sequence_len:
                sequence = text[i:i+sequence_len]
                if not sequence in sequences:
                    sequences[sequence] = 1
                else:
                    sequences[sequence] += 1
        
        seq_vals = sorted(list(sequences.values()), reverse=True)
        seq_vals = set(seq_vals[:top])
        
        return sorted(((x,sequences[x]/sequence_count) for x in sequences if sequences[x] in seq_vals), \
                      key=operator.itemgetter(1), reverse=True)

    @staticmethod
    def matchValues(dict1,dict2):
        assert len(dict1)==len(dict2)
        _dict1=dict1
        _dict2=dict((x,dict2[x]) for x in dict2)
        matched = []
        
        diff = None
        y_pref = None
        def _match(x,y):
            nonlocal y_pref,diff,_dict1,_dict2
            if diff is None:
                diff = abs(_dict1[x]-_dict2[y])
                y_pref = y
            else:
                diff_next = abs(_dict1[x]-_dict2[y])
                if diff_next<diff:
                    diff=diff_next
                    y_pref=y
        
        for x in dict1:
            for y in dict2:
                _match(x,y)
            assert y_pref is not None
            matched.append((x,y_pref,diff))
            del dict2[y_pref]
            y_pref=None
            diff=None
        
        return matched
        
    
    @staticmethod
    def forSample(sample):
        with open(tmpFile,"w+") as f:
            f.write(sample)
        text = FrequencyAnalysis.formatText(fname=tmpFile)




In [63]:
a = {1:2,3:4}
dict(a)

{1: 2, 3: 4}

In [69]:
fa = FrequencyAnalysis()\
    .withLanguage("English")\
    .withCiphertext("")


In [70]:
fa.computeSequenceFrequencies(fa.sample, sequence_len=3)

[('the', 0.017501141378785574),
 ('and', 0.00790242417718653),
 ('ing', 0.004966389642590689),
 ('tha', 0.0038342903593394477),
 ('her', 0.0038008841509812145),
 ('qui', 0.0037749015444803664),
 ('eth', 0.0035187872804005775),
 ('hat', 0.0033666034423241813),
 ('ver', 0.002976864344811459),
 ('ent', 0.0029583053401679963)]

In [None]:
class MonoalphabeticEncoding:
    def __init__(self, alphabet, alphabet2):
        self.alphabet=alphabet
        self.alphabet2=alphabet2
        assert set(self.alphabet2)==set(self.alphabet) && len(self.alphabet2)==len(self.alphabet)
        
        self.alp=dict((alphabet[i],i) for i in range(len(alphabet)))
        self.alp2=dict((self.alphabet2[i],i) for i in range(len(self.alphabet2)))
    
    def encode(self,symbol):
        return self.alphabet2[self.alp[symbol]]
    
    def decode(self,symbol):
        return self.alphabet[self.alp2[symbol]]
    
    def decodeString(self,string):
        return "".join(map(self.decode,string))
    
    def encodeString(self,string):
        return "".join(map(self.encode,string))


In [3]:
#Copied from Lab3
frequencyEnglish = {
     't': 0.09081222484527532,
     'o': 0.07726025649205641,
     's': 0.06220251387736872,
     'h': 0.06718560581892427,
     'e': 0.12309704587507178,
     'r': 0.05673451158042493,
     'l': 0.03888215402284183,
     'c': 0.02468576532890959,
     'k': 0.008281758438078223,
     'm': 0.02813756141134435,
     'i': 0.06909334524341224,
     'a': 0.0821540228418299,
     'w': 0.026261723983921393,
     'y': 0.02170611880303707,
     'n': 0.06671345626236203,
     'v': 0.01023416065845722,
     'd': 0.042423275697058636,
     'u': 0.030440885599438524,
     'p': 0.015734064952466025,
     'f': 0.021208447648822817,
     'x': 0.0012696994831876474,
     'b': 0.014387800676322338,
     'g': 0.01817775792764627,
     'j': 0.001359025074969693,
     'q': 0.0011101894978625662,
     'z': 0.00044662795891022776
}

frequencyPolish = {
     'l': 0.04965014259221743,
     'i': 0.08535274041982889,
     't': 0.03302217581698328,
     'c': 0.044897068678022095,
     'z': 0.07704654895666131,
     'a': 0.10159111097258801,
     'e': 0.08437095793919182,
     'w': 0.0463931181723262,
     'o': 0.07815300222849039,
     'j': 0.019604481914943354,
     's': 0.05393570103944272,
     'k': 0.03722981501971357,
     'd': 0.03598310710779348,
     'r': 0.044694478642335084,
     'y': 0.03867911296732067,
     'n': 0.04977481338340944,
     'm': 0.030030076828375073,
     'b': 0.016581215228537144,
     'p': 0.02508999672739173,
     'u': 0.02114728295594446,
     'h': 0.011126868113886768,
     'g': 0.014539731022768003,
     'f': 0.0010752855740310742,
     'v': 3.116769779800215e-05
}

In [4]:
text = formatText()

In [5]:
freq = compute_symbol_frequencies(text)

In [6]:
matched = match(frequencyEnglish,freq)
matchedExact = dict((x,matched[x][0]) for x in matched if len(matched[x])==1)
matchedExact

{'e': 'e', 'a': 'n'}

In [7]:
matched

{'t': ['n', 'r'],
 'o': ['a', 'l', 'n', 'p'],
 's': ['a', 'g', 'l', 'p', 't'],
 'h': ['a', 'g', 'l', 'p'],
 'e': ['e'],
 'r': ['g', 'o', 't'],
 'l': ['d', 'h', 'j', 'o', 's', 'u'],
 'c': ['b', 'f', 'i', 'j', 'm', 's', 'u', 'w'],
 'k': ['b', 'c', 'i', 'k', 'q', 'v', 'w', 'x', 'y', 'z'],
 'm': ['f', 'j', 'm', 's', 'u'],
 'i': ['a', 'g', 'l', 'p'],
 'a': ['n'],
 'w': ['b', 'f', 'j', 'm', 's', 'u'],
 'y': ['b', 'f', 'i', 'j', 'm', 's', 'u', 'w', 'y'],
 'n': ['a', 'g', 'l', 'p'],
 'v': ['b', 'c', 'f', 'i', 'k', 'v', 'w', 'x', 'y', 'z'],
 'd': ['d', 'h', 'o'],
 'u': ['j', 'm', 's', 'u'],
 'p': ['b', 'f', 'i', 'v', 'w', 'y'],
 'f': ['b', 'f', 'i', 'j', 'm', 's', 'u', 'w', 'y'],
 'x': ['c', 'k', 'q', 'v', 'x', 'z'],
 'b': ['b', 'f', 'i', 'k', 'v', 'w', 'y'],
 'g': ['b', 'f', 'i', 'v', 'w', 'y'],
 'j': ['c', 'k', 'q', 'v', 'x', 'z'],
 'q': ['c', 'k', 'q', 'v', 'x', 'z'],
 'z': ['c', 'k', 'q', 'v', 'x', 'z']}

In [8]:
matchedFiltered = matchedExact
textPartDec="".join(map(lambda s: matchedFiltered[s] if s in matchedFiltered else "<"+s+">", text))
print(textPartDec[:400])

<s><t><l><b><o>e<m><p>e<o>e<j><r><g><l>n<r><h>e<j><l>n<r>e<p><r><s><t><l><b><o>e<m><p><w><g><o><o><b>e<p>e<o>e<j><r>e<d><b><y><r><h>e<g>n<s>e<t><p><l>n<m>e<m><b>e<t><p><l><f><r><h>e<g>n<r>e<t>n<n><r><g><l>n<n><o><c><u><t><y><l>n<r><h>e<c><u><t><y><m>ee<r><g>n<i><l>n<n><u><i><u><p><r><j><l>n<r>e<p><r><d><n><y><p><r><h>e<r><w><l><j><l>n<r>e<p><r><d><n><y><p><n><t>e<n>n<d><n><u><i><u><p><r><l>ne<n><j


In [31]:
def countSequenceOccurences(text, top=10):
    # 'no' 'as' 'of' 'be' 'me' 'we' 'us' 'or' ...
    diplets = {

    }
    # 'yes' 'and' 'for' 'you' 'try' ... 
    triplets = {

    }
    # 'than' 'then' 'they' 'your' 'fine' 'glad' 'none' 'been' ...
    quadlets = {

    }
    i_max = len(text)
    for i in range(i_max):
        if (i_max-i)>2:
            diplet = text[i:i+2]
            if not diplet in diplets:
                diplets[diplet] = 1
            else:
                diplets[diplet]+=1
        if (i_max-i)>3:
            triplet = text[i:i+3]
            if not triplet in triplets:
                triplets[triplet] = 1
            else:
                triplets[triplet]+=1
        if (i_max-i)>4:
            quadlet = text[i:i+4]
            if not quadlet in quadlets:
                quadlets[quadlet] = 1
            else:
                quadlets[quadlet]+=1
    
    dip_vals = sorted(list(diplets.values()), reverse=True)
    dip_vals = set(dip_vals[:top])
    
    
    trip_vals = sorted(list(triplets.values()), reverse=True)
    trip_vals = set(trip_vals[:top])
    
    quad_vals = sorted(list(quadlets.values()), reverse=True)
    quad_vals = set(quad_vals[:top])
    
    s1 = sum(diplets[x] for x in diplets if diplets[x] in dip_vals)
    s2 = sum(triplets[x] for x in triplets if triplets[x] in trip_vals)
    s3 = sum(quadlets[x] for x in quadlets if quadlets[x] in quad_vals)
    
    dip_out = sorted((x,diplets[x]/s1) for x in diplets if diplets[x] in dip_vals)
    trip_out = sorted((x,triplets[x]/s2) for x in triplets if triplets[x] in trip_vals)
    quad_out = sorted((x,quadlets[x]/s3) for x in quadlets if quadlets[x] in quad_vals)
    
    assert round(sum(list(zip(dip_out))[1]),3)==1
    assert round(sum(trip_out.values()),3)==1
    assert round(sum(quad_out.values()),3)==1
    
    return dip_out,trip_out,quad_out
    


In [32]:
def decode(text,matched, replace=False):
    return "".join(map(lambda s: matched[s] if s in matched else (" <"+s+"> " if not replace else "-"), text))

print(decode(text,matchedFiltered, False)[:500])

 <s>  <t>  <l>  <b>  <o> e <m>  <p> e <o> e <j>  <r>  <g>  <l> n <r>  <h> e <j>  <l> n <r> e <p>  <r>  <s>  <t>  <l>  <b>  <o> e <m>  <p>  <w>  <g>  <o>  <o>  <b> e <p> e <o> e <j>  <r> e <d>  <b>  <y>  <r>  <h> e <g> n <s> e <t>  <p>  <l> n <m> e <m>  <b> e <t>  <p>  <l>  <f>  <r>  <h> e <g> n <r> e <t> n <n>  <r>  <g>  <l> n <n>  <o>  <c>  <u>  <t>  <y>  <l> n <r>  <h> e <c>  <u>  <t>  <y>  <m> ee <r>  <g> n <i>  <l> n <n>  <u>  <i>  <u>  <p>  <r>  <j>  <l> n <r> e <p>  <r>  <d>  <n>  <y>  <p>


In [33]:
dip,trip,quad = countSequenceOccurences(text, top=15)

TypeError: unsupported operand type(s) for +: 'int' and 'tuple'

In [24]:
print(matchedExact)
print(dip)
print(trip)

{'e': 'e', 'a': 'n'}
{'la': 0.07240437158469945, 'ar': 0.06284153005464481, 'rh': 0.10792349726775956, 'he': 0.0942622950819672, 're': 0.05737704918032787, 'ep': 0.07377049180327869, 'pr': 0.06967213114754098, 'ga': 0.08060109289617487, 'et': 0.06147540983606557, 'er': 0.05327868852459016, 'te': 0.04918032786885246, 'en': 0.0546448087431694, 'na': 0.06420765027322405, 'de': 0.04918032786885246, 'nm': 0.04918032786885246}
{'stl': 0.046153846153846156, 'rgl': 0.043076923076923075, 'gla': 0.046153846153846156, 'lar': 0.043076923076923075, 'rhe': 0.2, 'epr': 0.07384615384615385, 'etp': 0.043076923076923075, 'gai': 0.05846153846153846, 'nad': 0.08615384615384615, 'hep': 0.043076923076923075, 'pru': 0.04923076923076923, 'rud': 0.04923076923076923, 'ude': 0.04923076923076923, 'dea': 0.055384615384615386, 'ear': 0.06153846153846154, 'erh': 0.052307692307692305}


In [None]:
#'la': 53, a->n => l->a? (la = an)

# 'rhe': 65 r->t, h->h (rhe = the)

# anthe <j> -> anthem? j->m

# 'dea': 18, a->n, e->e => d->t? (dea = ten)



# 'ga': 59 => g->r? (ga = rn)
# 'rh': 79 => h->o? (rh = do)

In [None]:
matchedFiltered = {
    'e': 'e',
    'a': 'n',
    'r': 't',
    'h': 'h',
}
decode(text,matchedFiltered,False)