# Contextual techniques


In [2]:
import itertools
import numpy as np
from math import log2

import adarith
import conarith

def probability_dict(x):
    """
    Produces the probability dictionary used to compress a file, based on
    the normalised frequency of each symbol.

    Parameters:
    -----------
    x: iterable of sortable symbols (e.g. the str read from a file)
    file data

    Returns:
    --------
    p: dict
    Alphabet and corresponding probability (frequency / total symbol count)
    frequencies: dict
    Alphabet and corresponding frequencies in file data

    Both dictionaries are keyed in sorted symbol order. Empty input yields
    two empty dictionaries.
    """

    # groupby only merges *adjacent* equal items, so the data is sorted first.
    # Counting with a generator avoids building a list per symbol.
    frequencies = {
        key: sum(1 for _ in group)
        for key, group in itertools.groupby(sorted(x))
    }
    n = sum(frequencies.values())
    p = {a: count / n for a, count in frequencies.items()}
    return p, frequencies

In [7]:
filenames = ['hamlet.txt', 'alice29.txt', 'asyoulik.txt', 'lcet10.txt', 'plrabn12.txt']


def H_stat(pr):
    """Return the i.i.d. entropy, in bits, of the probability dict ``pr``."""
    return -sum([pr[a]*log2(pr[a]) for a in pr])


# For each file: print its size, its i.i.d. entropy and its Markov-chain
# entropy, then run it through the contextual arithmetic coder and print the
# achieved rate together with the first 200 decoded symbols.
for file_name in filenames:

    # NOTE(review): the file is opened in text mode, so `data` is the decoded
    # text after universal-newline translation; len(data) below therefore
    # counts characters, which can differ from the on-disk size reported as
    # "bytes" -- confirm this is intended.
    with open(file_name) as file:
        data = file.read()

    p, freq = probability_dict(data)
    transition = conarith.transition_matrix(data)

    # Markov chain entropy: for every symbol, weight the entropy of its row
    # in `transition` by the symbol's own probability.
    # Rows are indexed by code point; presumably each row holds the
    # conditional next-symbol probabilities -- verify against conarith.
    H = 0
    for char in p:
        pxy = transition[ord(char)]
        for i in pxy:
            if i == 0:
                # Skip zero entries: 1/i is undefined and the term is taken as 0.
                continue
            H += p[char]*i*np.log2(1/i)

    print("***Properties of file {}: ***".format(file_name))
    print("File size:      {} bytes".format(len(data)))
    print("Static Entropy: {} bits".format(H_stat(p)))
    print("Markov Entropy: {} bits".format(H))

    # Contextual arithmetic
    # encode() returns its own transition table, which rebinds the
    # `transition` computed above, plus p0; both are handed back to decode().
    y, transition, p0 = conarith.encode(data)
    x = conarith.decode(y, transition, p0)

    # The rate is reported as len(y) per input symbol (presumably y is a
    # sequence of bits -- verify against conarith.encode).
    print("Contextual Arithmetic Compression rate for {}: {} bits/symbol".format(file_name, len(y)/len(data)))
    # Print the start of the decoded text so the round trip can be checked by eye.
    print(''.join(x[:200]))
    print('\n\n')
    

***Properties of file hamlet.txt: ***
File size:      207039 bytes
Static Entropy: 4.449863631694343 bits
Markov Entropy: 3.352987113263871 bits
Contextual Arithmetic Compression rate for hamlet.txt: 3.3531556856437676 bits/symbol
        HAMLET


        DRAMATIS PERSONAE


CLAUDIUS        king of Denmark. (KING CLAUDIUS:)

HAMLET  son to the late, and nephew to the present king.

POLONIUS        lord chamberlain. (LORD POLONI



***Properties of file alice29.txt: ***
File size:      148481 bytes
Static Entropy: 4.512876838738921 bits
Markov Entropy: 3.501779988066433 bits
Contextual Arithmetic Compression rate for alice29.txt: 3.5020575023066924 bits/symbol




                ALICE'S ADVENTURES IN WONDERLAND

                          Lewis Carroll

               THE MILLENNIUM FULCRUM EDITION 2.9




                            CHAPTER I

            



***Properties of file asyoulik.txt: ***
File size:      125179 bytes
Static Entropy: 4.808116220349888 bits
Markov Entropy: 3.41