# Kompresja danych
## 1. Kodowanie o stałej długości słowa

- **Jaka jest najkrótsza możliwa długość takiego kodu dla korpusu z dzisiejszych zajęć?**

Tekst zawiera 37 różnych znaków, więc wystarczy (z nadmiarem) kod o długości $\lceil \log_2 37 \rceil = 6$ bitów. Pozwala on zakodować maksymalnie $2^6 = 64$ znaki.

- **Ile wyniesie stopień kompresji w tym kodzie?**

Stopień kompresji: 1 - 6/8 = 1/4. Plik powinien być mniejszy o 1/4.

In [1]:
from bitarray import bitarray
import operator
import pickle
import os
import networkx as nx
from collections import defaultdict
from more_itertools import pairwise
from collections import Counter
from math import ceil, log2

In [2]:
def load_file(filename):
    """Return the full contents of the text file ``filename`` as a string.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(filename) as handle:
        contents = handle.read()
    return contents

    
# Names of the pickle files that test() writes the encoded text and the code
# table to, and then reads them back from.
TEST_TEXT_NAME = "text_test.pkl"
TEST_CODE_NAME = "code_test.pkl"

In [3]:
# Common interface shared by all coding methods

class Coder:
    """Base class giving every coder pickle-based persistence.

    Subclasses supply the actual ``create`` / ``encode`` / ``decode`` logic;
    this class only stores and restores an encoded text with its code table.
    """

    @staticmethod
    def save(text_enc, code, ftname, fcname):
        """Pickle ``text_enc`` into ``ftname`` and ``code`` into ``fcname``."""
        with open(ftname, "wb") as text_file, open(fcname, "wb") as code_file:
            for payload, sink in ((text_enc, text_file), (code, code_file)):
                pickle.dump(payload, sink)

    @staticmethod
    def load(ftname, fcname):
        """Return the ``(text_enc, code)`` pair unpickled from the two files.

        NOTE: unpickling can execute arbitrary code, so only load files that
        come from a trusted source (e.g. ones written by ``save``).
        """
        with open(ftname, "rb") as text_file, open(fcname, "rb") as code_file:
            return pickle.load(text_file), pickle.load(code_file)

In [4]:
# Fixed-length coding

class ConstLenCoder(Coder):
    """Coder in which every character gets a code word of the same length.

    A code is a dict mapping each character to a string of "0"/"1" digits,
    all of equal length.
    """

    @staticmethod
    def encode(text, code):
        """Encode ``text`` by concatenating the code words of its characters.

        Returns a ``bitarray``. Raises ``KeyError`` if ``text`` contains a
        character that is not a key of ``code``.
        """
        return bitarray("".join(code[letter] for letter in text))

    @staticmethod
    def decode(text_bin, code):
        """Decode a bit sequence produced by ``encode`` back into a string.

        The word length is taken from the first code word in ``code``. An
        empty ``text_bin`` decodes to ``""`` even when ``code`` is empty.
        Raises ``KeyError`` if a chunk of bits is not a known code word.
        """
        if len(text_bin) == 0:
            # Nothing to decode; also avoids indexing into an empty code table.
            return ""

        code_len = len(next(iter(code.values())))
        num2letter = {n: l for l, n in code.items()}
        chunks = (text_bin[i:i + code_len] for i in range(0, len(text_bin), code_len))
        return "".join(num2letter[chunk.to01()] for chunk in chunks)

    @staticmethod
    def create(char_counter):
        """Build a fixed-length code for the characters in ``char_counter``.

        Characters are numbered in ``most_common()`` order and each number is
        written in binary, zero-padded to ``ceil(log2(alphabet size))`` bits
        (at least one bit). Returns ``{}`` for an empty counter.
        """
        alphabet_size = len(char_counter)
        if alphabet_size == 0:
            # log2(0) is undefined; an empty alphabet has an empty code.
            return {}

        # A one-symbol alphabet would give log2(1) == 0; keep one bit per
        # character so that the encoded text still records its length.
        code_len = max(1, ceil(log2(alphabet_size)))
        return {
            char: format(i, "b").zfill(code_len)
            for i, (char, _) in enumerate(char_counter.most_common())
        }

In [5]:
# Test helpers

def test(coder, text_path):
    """Round-trip the text at ``text_path`` through ``coder`` and report.

    The text is encoded, saved to the test pickle files, loaded back and
    decoded. Prints whether the decoded text equals the original and the
    ratio of the encoded-text file size to the source file size (the code
    table file is not counted).
    """
    original = load_file(text_path)
    codebook = coder.create(Counter(original))

    coder.save(
        coder.encode(original, codebook),
        codebook,
        TEST_TEXT_NAME,
        TEST_CODE_NAME,
    )

    restored_bits, restored_code = coder.load(TEST_TEXT_NAME, TEST_CODE_NAME)
    restored = coder.decode(restored_bits, restored_code)

    encoded_size = os.path.getsize(TEST_TEXT_NAME)
    source_size = os.path.getsize(text_path)
    ratio = encoded_size / source_size

    print("Teksty są identyczne:", restored == original)
    print("Kompresja: ", ratio)

In [6]:
# Round-trip test of the fixed-length coder on the sample corpus
test(ConstLenCoder, "lab_huffman/norm_wiki_sample.txt")

Teksty są identyczne: True
Kompresja:  0.7500057697970542


## 2. Kodowanie Huffmana

In [22]:
class HuffmanCoder(Coder):
    """Variable-length prefix coder built from a Huffman tree.

    A code is a dict mapping each character to a string of "0"/"1" digits.
    """

    @staticmethod
    def encode(text, code):
        """Encode ``text`` by concatenating the code words of its characters.

        Returns a ``bitarray``. Raises ``KeyError`` if ``text`` contains a
        character that is not a key of ``code``.
        """
        return bitarray("".join(code[letter] for letter in text))

    @staticmethod
    def decode(text_bin, code):
        """Decode a bit sequence produced by ``encode`` back into a string.

        Bits are consumed left to right: the window ``[start:end)`` grows one
        bit at a time and, as soon as it equals a code word, the matching
        character is emitted and the window restarts. This is unambiguous
        only if no code word is a prefix of another. Trailing bits that do
        not complete a code word are ignored.
        """
        text = []
        start = 0
        num2letter = {n: l for l, n in code.items()}

        for end in range(1, len(text_bin) + 1):
            letter = num2letter.get(text_bin[start:end].to01())

            if letter is not None:
                text.append(letter)
                start = end

        return "".join(text)

    @staticmethod
    def create(char_counter):
        """Build a Huffman code from the frequencies in ``char_counter``.

        Args:
            char_counter: ``collections.Counter`` mapping characters to counts.

        Returns:
            Dict mapping each character to its code word. An empty counter
            gives ``{}``; a counter with a single character gives that
            character the one-bit code ``"0"``.
        """
        symbols = char_counter.most_common()

        if not symbols:
            return {}
        if len(symbols) == 1:
            # A one-node tree has no edges, so the path-based code would be
            # empty; use a single bit so every occurrence is still encoded.
            return {symbols[0][0]: "0"}

        # Each entry is (subtree, id of the subtree's root, total frequency),
        # kept sorted by descending frequency so pop() yields the rarest one.
        nodes = []
        for k, f in symbols:
            G = nx.DiGraph()
            G.add_node(k)
            nodes.append((G, k, f))

        # Internal nodes are labelled with consecutive integers; a symbol
        # equal to such an integer would collide with them (characters taken
        # from a str never do).
        root_id = 0

        while len(nodes) > 1:
            (n1, k1, f1), (n2, k2, f2) = nodes.pop(), nodes.pop()

            merged = nx.DiGraph()
            merged.add_node(root_id)
            merged = nx.compose(merged, n1)
            merged = nx.compose(merged, n2)
            merged.add_edge(root_id, k1, v="0")
            merged.add_edge(root_id, k2, v="1")

            nodes.append((merged, root_id, f1 + f2))
            nodes = sorted(nodes, key=lambda x: x[2], reverse=True)
            root_id += 1

        tree, root, _ = nodes[0]

        # A leaf's code word is the sequence of edge labels on the path from
        # the root down to that leaf.
        code = {}
        for node in tree:
            if tree.out_degree(node) != 0:
                continue
            path = nx.shortest_path(tree, root, node)
            code[node] = "".join(tree[v][u]["v"] for v, u in pairwise(path))

        # Debug aid: uncomment to draw the tree.
        # pos = nx.drawing.nx_agraph.graphviz_layout(tree, prog="dot")
        # nx.draw_networkx_edge_labels(tree, pos=pos)
        # nx.draw(tree, pos, with_labels=True)

        return code

In [24]:
# Round-trip test of the Huffman coder on the sample corpus
test(HuffmanCoder, "lab_huffman/norm_wiki_sample.txt")

Teksty są identyczne: True
Kompresja:  0.5386327536687799
