In [None]:
import math, queue
from collections import Counter

class TreeNode:
    """Node of a binary prefix-code tree.

    ``data`` is assumed to be a ``(frequency, character)`` tuple; nodes are
    ordered by comparing these tuples.
    """

    def __init__(self, left=None, right=None, data=None):
        self.left, self.right, self.data = left, right, data

    def __lt__(self, other):
        """Return True when this node's ``data`` sorts before ``other``'s."""
        return self.data < other.data

    def children(self):
        """Return the ``(left, right)`` pair of child nodes."""
        return self.left, self.right
    
def get_frequencies(fname):
    """Count how often each character occurs in a text file.

    Args:
        fname: Path of the file to read (opened in text mode).

    Returns:
        A ``Counter`` mapping each character, line terminators included,
        to its number of occurrences.
    """
    counts = Counter()
    # The context manager guarantees the handle is closed, even if reading
    # fails part-way; previously the file was never closed.
    with open(fname, 'r') as f:
        # Stream the file line by line rather than loading every line at once.
        for line in f:
            counts.update(line)
    return counts

# given a dictionary f mapping characters to frequencies, 
# create a prefix code tree using Huffman's algorithm
def make_huffman_tree(f):
    """Build a prefix-code tree from character frequencies (Huffman's algorithm).

    Args:
        f: Mapping from character to frequency.

    Returns:
        The root ``TreeNode`` of the tree. Leaves carry
        ``(frequency, character)``; internal nodes carry
        ``(sum of child frequencies, "")``.

    Raises:
        ValueError: If ``f`` is empty (there is no tree to build).
    """
    if not f:
        # Without this guard the final p.get() would block forever on an
        # empty queue.
        raise ValueError("cannot build a Huffman tree from an empty frequency table")

    p = queue.PriorityQueue()
    # Seed the heap with one leaf per character; these are the leaves of
    # the final tree.
    for c, freq in f.items():
        p.put(TreeNode(None, None, (freq, c)))

    # Repeatedly merge the two least-frequent subtrees under a new parent
    # until a single tree remains.
    while p.qsize() > 1:
        l = p.get()
        r = p.get()
        p.put(TreeNode(l, r, (l.data[0] + r.data[0], "")))

    # The one remaining item is the root of the tree.
    return p.get()

# perform a traversal on the prefix code tree to collect all encodings
def get_code(node, prefix="", code=None):
    """Collect the bit-string encoding of every leaf below ``node``.

    Going to a left child appends "0" to the prefix; going to a right child
    appends "1".

    Args:
        node: Root of the (sub)tree to traverse; needs ``left``, ``right``
            and ``data`` attributes, with ``data[1]`` holding the character.
        prefix: Bit string accumulated on the path from the root to ``node``.
        code: Dictionary to fill in. A fresh dictionary is created when
            omitted, so separate top-level calls never share results.

    Returns:
        The dictionary mapping each leaf character to its encoding. A tree
        consisting of a single leaf maps its character to the empty prefix.
    """
    # A mutable default ({}) would be shared across calls and leak codes
    # from previously traversed trees into later results.
    if code is None:
        code = {}
    if node.left is None and node.right is None:
        code[node.data[1]] = prefix
    if node.left is not None:
        get_code(node.left, prefix + "0", code)
    if node.right is not None:
        get_code(node.right, prefix + "1", code)
    return code
    
# given an alphabet and frequencies, compute the cost of a fixed length encoding
def fixed_length_cost(f):
    """Return the total number of bits a fixed-length code needs for the text.

    Every character is assigned ``ceil(log2(alphabet size))`` bits.

    Args:
        f: Mapping from character to frequency.

    Returns:
        The sum over all characters of ``bits per character * frequency``.
        An empty mapping costs 0; a one-character alphabet also costs 0
        because ``log2(1)`` is 0.
    """
    if not f:
        # log2(0) raises ValueError; an empty text simply costs nothing.
        return 0
    num_bits = math.ceil(math.log2(len(f)))
    return sum(num_bits * count for count in f.values())

# given a Huffman encoding and character frequencies, compute cost of a Huffman encoding
def huffman_cost(C, f):
    """Return the total number of bits needed to encode the text with code ``C``.

    Args:
        C: Mapping from character to its bit-string encoding.
        f: Mapping from character to frequency.

    Returns:
        The sum over all characters in ``f`` of
        ``encoding length * frequency``.
    """
    total = 0
    for symbol, count in f.items():
        total += len(C[symbol]) * count
    return total

# For each sample file, report the cost of a fixed-length encoding, the cost
# of a Huffman encoding, and the ratio of the two.
for fname in ['alice29.txt', 'asyoulik.txt', 'f1.txt', 'fields.c', 'grammar.lsp']:
    print(fname)
    # character -> number of occurrences in the file
    f = get_frequencies(fname)
    fc =fixed_length_cost(f)
    print("Fixed-length cost:  %d" % fc)

    # Build the Huffman tree, derive the per-character codes, and cost them
    # against the same frequency table.
    T = make_huffman_tree(f)
    C = get_code(T)
    hc = huffman_cost(C, f)
    print("Huffman cost:  %d" % hc)
    # Ratio of Huffman cost to fixed-length cost.
    h2f = hc/fc
    print("Huffman v.s. Fixed-length cost:  %f" % h2f)


**d)** Test your implementation of Huffman coding on the 5 given text
files, and fill out a table of the encoding cost of each file for
fixed-length and Huffman. Fill out a final column which gives the
ratio of Huffman coding cost to fixed-length coding cost. Do you see a
consistent trend? If so, what is it?

**Answer**: 

`alice29.txt`

Fixed-length cost:  1039367

Huffman cost:  676374

Huffman v.s. Fixed-length cost:  0.650756

`asyoulik.txt`

Fixed-length cost:  876253

Huffman cost:  606448

Huffman v.s. Fixed-length cost:  0.692092

`f1.txt`

Fixed-length cost:  1340

Huffman cost:  826

Huffman v.s. Fixed-length cost:  0.616418

`fields.c`

Fixed-length cost:  78050

Huffman cost:  56206

Huffman v.s. Fixed-length cost:  0.720128

`grammar.lsp`

Fixed-length cost:  26047

Huffman cost:  17356

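Huffman v.s. Fixed-length cost:  0.666334

**Trend**: For all five files, the Huffman cost is consistently about 62–72% of the fixed-length cost, i.e., Huffman coding saves roughly 28–38% of the bits.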



**e)** Suppose that we used Huffman coding on a document with alphabet $\Sigma$ in
  which every character had the same frequency. What is the expected
  cost of a Huffman encoding for the document? Is it consistent across
  documents?

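**Answer**: If all characters had equal frequency, then characters would be paired
arbitrarily, so we would build a balanced tree bottom up. The
 depth of any leaf would thus be logarithmic in the alphabet
 size: every codeword has length about $\log_2 |\Sigma|$, so a document of $n$ characters costs about $n \log_2 |\Sigma|$ bits, essentially the same as a fixed-length encoding. This is consistent across documents with the same alphabet size and length. Note that we said "expected," but in fact we have not provided any
 randomness (i.e., we might instead have said that all characters are equally likely to occur).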