# Source Coding

In [1]:
from filecmp import cmp
from os import stat
from json import load
from math import log2
from camzip import camzip
from camunzip import camunzip

In [2]:
# Coding schemes to compare; each name is handed to camzip as the method.
iid_methods = [
    'shannon_fano',
    'huffman',
    'arithmetic',
]

# Text files to run through every scheme, named relative to `path`.
filenames = [
    'hamlet.txt',
    'alice29.txt',
    'asyoulik.txt',
    'lcet10.txt',
    'plrabn12.txt',
]

# Directory prefix joined to each filename by plain string concatenation,
# so the trailing slash is required.
path = 'data/'

In [3]:
def H(pr):
    """Return the Shannon entropy, in bits, of a probability mapping.

    Args:
        pr: Mapping from symbol to probability. It is iterated over its
            keys and indexed by each key.

    Returns:
        ``-sum(p * log2(p))`` over the entries of ``pr``. Entries whose
        probability is exactly zero are skipped: they contribute nothing
        to the entropy by convention (0 * log 0 = 0), whereas ``log2(0)``
        would raise ``ValueError``. An empty mapping yields 0.

    Raises:
        ValueError: If any probability is negative (from ``log2``).
    """
    probabilities = (pr[a] for a in pr)
    # Only exact zeros are filtered, so negative values still fail loudly.
    return -sum(p * log2(p) for p in probabilities if p != 0)

def analyze_compression(file_path, method):
    """Print size, rate, entropy and round-trip results for one compressed file.

    Reads three companion files located next to ``file_path`` (presumably
    written by camzip/camunzip -- confirm against those modules):

    * ``file_path + '.cz' + method[0]``: the compressed file,
    * ``file_path + '.czp'``: a JSON object of per-symbol frequencies,
    * ``file_path + '.cuz'``: the decompressed file.

    Args:
        file_path: Path of the original, uncompressed file.
        method: Name of the coding method. Only its first character is
            used, as the last character of the compressed file's suffix.

    Returns:
        None. Every result is printed to standard output.
    """
    Nin = stat(file_path).st_size
    print(f'Length of original file: {Nin} bytes')
    Nout = stat(file_path + '.cz' + method[0]).st_size
    print(f'Length of compressed file: {Nout} bytes')
    if Nin > 0:
        print(f'Compression rate: {8.0*Nout/Nin} bits/byte')
    else:
        # A zero-length original has no defined rate; report that instead
        # of failing with ZeroDivisionError.
        print('Compression rate: undefined (empty file)')
    with open(file_path + '.czp', 'r') as fp:
        freq = load(fp)
    # Frequencies are normalised by the file size in bytes.
    # NOTE(review): assumes the stored values are byte counts that sum to
    # Nin -- confirm against the code that writes the .czp file.
    pf = {a: freq[a]/Nin for a in freq}
    print(f'Entropy: {H(pf)} bits per symbol')
    # shallow=False forces a byte-by-byte comparison. The default shallow
    # mode accepts files as equal when their stat signatures (type, size,
    # modification time) match, without reading the contents.
    if cmp(file_path, file_path + '.cuz', shallow=False):
        print('The two files are the same')
    else:
        print('The files are different')

## Coding of i.i.d. Sources

In [5]:
# Round-trip every file through every i.i.d. coding scheme and report on each.
for filename in filenames:
    print(f'\n\nCompressing file {filename}')
    # The input path depends only on the file, so build it once per file.
    file_path = path + filename
    for method in iid_methods:
        print(f'\nCompressing using {method}')
        camzip(method, file_path)
        # The compressed file's suffix is '.cz' plus the method's first letter.
        camunzip(f'{file_path}.cz{method[0]}')
        analyze_compression(file_path, method)
    print()
print('Done!')



Compressing file hamlet.txt

Compressing using shannon_fano
Length of original file: 207039 bytes
Length of compressed file: 124694 bytes
Compression rate: 4.818184013639942 bits/byte
Entropy: 4.449863631694343 bits per symbol
The two files are the same

Compressing using huffman
Length of original file: 207039 bytes
Length of compressed file: 115752 bytes
Compression rate: 4.47266457044325 bits/byte
Entropy: 4.449863631694343 bits per symbol
The two files are the same

Compressing using arithmetic
Length of original file: 207039 bytes
Length of compressed file: 115163 bytes
Compression rate: 4.449905573346085 bits/byte
Entropy: 4.449863631694343 bits per symbol
The two files are the same



Compressing file alice29.txt

Compressing using shannon_fano
Length of original file: 152089 bytes
Length of compressed file: 97103 bytes
Compression rate: 5.107693521556457 bits/byte
Entropy: 4.567680212177265 bits per symbol
The two files are the same

Compressing using huffman
Length of origin