# Huffman Codes

Learn about implementing Huffman encoding in Python by going through each item on this page.

In [None]:
message = "aardvark" # Sample text that the following cells will Huffman-encode

## Build a frequency table:

In [None]:
def make_freq_table(message):
  """Count how many times each symbol occurs in message.

  Args:
    message: The text to analyse. Any iterable of hashable symbols works.

  Returns:
    A plain dict mapping each distinct symbol to its number of
    occurrences, with keys in order of first appearance. An empty
    message gives an empty dict.
  """
  freq_table = {}
  for letter in message:
    # Single pass: bump the running tally instead of calling
    # message.count() for every character, which rescans the whole
    # message each time and is quadratic on long inputs.
    freq_table[letter] = freq_table.get(letter, 0) + 1

  return freq_table

In [None]:
# Try the function on the sample message.
make_freq_table(message)

In [None]:
# Another way:
def make_freq_table(message):
  """Return a dict mapping each distinct symbol of message to its count.

  Keys appear in the order in which the symbols first occur.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order.
  distinct = dict.fromkeys(message)
  table = {}
  for symbol in distinct:
    table[symbol] = message.count(symbol)
  return table

In [None]:
# Run the alternative implementation on the same message.
make_freq_table(message)

## Define a single node in the priority queue

In [None]:
def node(name, weight):
  """Build a childless tree node.

  The node is a dict holding the given name and weight; both child
  slots start out as None.
  """
  new_node = {'name': name, 'weight': weight}
  new_node['left-child'] = None
  new_node['right-child'] = None
  return new_node

In [None]:
# Example: a node named 'a' with weight 3.
node('a', 3)

## Make a priority queue

In [None]:
def make_q(freq_table):
  """Turn a frequency table into a list of nodes ordered by weight.

  The lightest node comes first and the heaviest last. Entries with
  equal weight keep the order they have in freq_table.
  """
  def by_weight(entry):
    # entry is a (name, weight) pair taken from freq_table.items()
    return entry[1]

  ordered = sorted(freq_table.items(), key=by_weight)
  q = []
  for name, weight in ordered:
    q.append(node(name, weight))
  return q

In [None]:
# Keep the table in a variable so the next cell can build a queue from it.
freq_table = make_freq_table(message)
print(freq_table)

In [None]:
# Build the weight-ordered list of nodes from the frequency table.
make_q(freq_table)

## Create a Huffman tree

In [None]:
# combines two nodes into a single node

def make_tree(left, right):
  """Join two nodes under a new parent node.

  The parent's name is the two names concatenated (left first) and its
  weight is the sum of the two weights.
  """
  combined_name = left['name'] + right['name']
  combined_weight = left['weight'] + right['weight']
  parent = node(combined_name, combined_weight)
  parent.update({'left-child': left, 'right-child': right})
  return parent

In [None]:
# Example: join two weight-1 leaves under a common parent.
left = node('a', 1)
right = node('b', 1)
make_tree(left, right)

In [None]:
# Inserts a node into the right place in a queue

def insert_into_tree(T, Q):
  """Place tree T into queue Q, keeping Q ordered by weight.

  T goes directly in front of the first entry whose weight is greater
  than or equal to T's weight, or at the end when there is no such
  entry. Q is modified in place; nothing is returned.
  """
  position = next(
    (idx for idx, entry in enumerate(Q) if entry['weight'] >= T['weight']),
    len(Q),  # no heavier-or-equal entry found: T belongs at the end
  )
  Q.insert(position, T)

In [None]:
def huffman(Q):
  """Reduce a weight-ordered queue of nodes to one Huffman tree.

  The two front entries are repeatedly merged with make_tree and the
  result is put back with insert_into_tree until one tree is left.
  An empty queue is returned unchanged; a queue with a single entry
  yields that entry. The list passed in is not modified.
  """
  size = len(Q)
  if size == 0:
    return Q
  if size == 1:
    return Q[0]

  while len(Q) > 1:
    # Slicing makes a new list, so the caller's queue stays intact.
    lightest, next_lightest, Q = Q[0], Q[1], Q[2:]
    T = make_tree(lightest, next_lightest)
    insert_into_tree(T, Q)

  return T

In [None]:
# Pick the text to encode. Uncomment one of the alternatives to experiment.
message = "aardvark"
# message = "mississippi"
# message = "huffman"
# message = "trees"
# message = "data"
# message = "compression"
#message = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum"

# Run the pipeline step by step, printing every intermediate result:
# frequency table -> weight-ordered queue of nodes -> Huffman tree.
freq_table = make_freq_table(message)
print(freq_table)
Q = make_q(freq_table)
print(Q)
T = huffman(Q)
print(T)

In [None]:
def print_tree(T, level=0):
  """Print tree T as an indented outline.

  Each node is shown as name:weight. Children are drawn beneath their
  parent, the left child first, indented according to level.
  """
  print(f'{T["name"]}:{T["weight"]}')
  for side in ('left-child', 'right-child'):
    child = T[side]
    if child is None:
      continue
    indent = '    ' * level
    print(indent, '|')
    # end='' keeps the child's own name:weight on the same line as the branch
    print(indent, '|___', end='')
    print_tree(child, level + 1)
    
# Draw the tree T, starting at indentation level 0.
print_tree(T, 0)

## Extract the codes

In [None]:
codes = {}
def make_codes(T, code=''):
  """Walk a Huffman tree and record the bit string of every leaf.

  Results are stored in the module-level ``codes`` dict, mapping each
  leaf's name to a string of '0' and '1' characters ('0' for a step to
  the left child, '1' for a step to the right child). Nothing is
  returned.

  Args:
    T: Root node of the (sub)tree to walk. A falsy value, such as the
      empty list huffman() returns for an empty queue, is ignored.
    code: Bits collected on the path from the overall root down to T.
  """
  if not T:
    # Nothing to encode (empty tree).
    return

  left, right = T['left-child'], T['right-child']

  # T is a leaf
  if left is None and right is None:
    # When the whole tree is one leaf (a message with a single distinct
    # symbol) the path is empty; give that symbol a 1-bit code so the
    # encoded message is not an empty string.
    codes[T['name']] = code or '0'
    return

  # walk left side
  if left is not None:
    make_codes(left, code + '0')

  # walk right side
  if right is not None:
    make_codes(right, code + '1')



In [None]:
# Fill the codes dict from the tree T, then show it.
make_codes(T)
print(codes)

In [None]:
# Translate every letter of the message to its bit string and concatenate
# the pieces. str.join builds the result in one go instead of growing the
# string with += on every loop iteration.
encoded_message = ''.join(codes[letter] for letter in message)

print(encoded_message)

## All at once

In [None]:
# Implement the Huffman Tree algorithm

def node(name, weight):
  """Return a fresh tree node (a dict) with the given name and weight.

  Both child entries are initialised to None.
  """
  fields = ('name', 'weight', 'left-child', 'right-child')
  values = (name, weight, None, None)
  return dict(zip(fields, values))

# # Create a queue of nodes
# Q = [node('a', 8), node('b', 10), node('c', 12), node('d', 15), node('e', 20), node('f', 35)]
# Q = [node('d', 1), node('k', 1), node('v', 1), node('r', 2), node('a', 3)]

def make_freq_table(message):
  """Count how many times each symbol occurs in message.

  Args:
    message: The text to analyse. Any iterable of hashable symbols works.

  Returns:
    A plain dict mapping each distinct symbol to its number of
    occurrences, with keys in order of first appearance. An empty
    message gives an empty dict.
  """
  freq_table = {}
  for letter in message:
    # One pass over the message: increment the tally rather than calling
    # message.count() per character, which rescans the entire message
    # every time and is quadratic on long inputs.
    freq_table[letter] = freq_table.get(letter, 0) + 1

  return freq_table

# freq_table = make_freq_table(message)
# print(freq_table)

def make_q(freq_table):
  """Create the node queue for a frequency table.

  One node is made per (name, weight) entry; the resulting list runs
  from the lowest weight to the highest.
  """
  weight_of = lambda pair: pair[1]  # pair is (name, weight)
  by_weight = sorted(freq_table.items(), key=weight_of)
  return [node(*pair) for pair in by_weight]


# q = make_q(freq_table)
# print(f'queue: {q}')


def make_tree(left, right):
  """Combine two subtrees into one tree and return its root.

  The root is named after both children (left name followed by right
  name) and weighs as much as the two children together.
  """
  merged = node(left['name'] + right['name'], left['weight'] + right['weight'])
  merged['left-child'], merged['right-child'] = left, right
  return merged


def insert_into_tree(T, Q):
  """Insert tree T into the weight-ordered queue Q, in place.

  T ends up immediately before the first entry that weighs at least as
  much as T, or at the very end if every entry is lighter.
  """
  slot = 0
  # Skip past every leading entry that is lighter than T.
  while slot < len(Q) and not (Q[slot]['weight'] >= T['weight']):
    slot += 1
  Q.insert(slot, T)
      

def huffman(Q):
  """Build a Huffman tree from a queue of nodes ordered by weight.

  While more than one entry remains, the first two are removed, merged
  with make_tree, and the merged tree is put back with
  insert_into_tree. Returns the final tree. A queue of length one
  returns its only entry, and an empty queue is returned as is.
  """
  if len(Q) <= 1:
    return Q[0] if len(Q) == 1 else Q

  while len(Q) > 1:
    T_left, T_right = Q[:2]
    Q = Q[2:]  # new list: the caller's queue is left untouched

    T = make_tree(T_left, T_right)
    insert_into_tree(T, Q)

  return T

codes = {}
def make_codes(T, code=''):
  """Walk a Huffman tree and record the bit string of every leaf.

  Results are stored in the module-level ``codes`` dict, mapping each
  leaf's name to a string of '0' and '1' characters ('0' for a step to
  the left child, '1' for a step to the right child). Nothing is
  returned.

  Args:
    T: Root node of the (sub)tree to walk. A falsy value, such as the
      empty list huffman() returns for an empty queue, is ignored.
    code: Bits collected on the path from the overall root down to T.
  """
  if not T:
    # Nothing to encode (empty tree).
    return

  left, right = T['left-child'], T['right-child']

  # T is a leaf
  if left is None and right is None:
    # When the whole tree is one leaf (a message with a single distinct
    # symbol) the path is empty; give that symbol a 1-bit code so the
    # encoded message is not an empty string.
    codes[T['name']] = code or '0'
    return

  # walk left side
  if left is not None:
    make_codes(left, code + '0')

  # walk right side
  if right is not None:
    make_codes(right, code + '1')
  

def print_tree(T, level=0):
  """Write an indented picture of tree T to standard output.

  The node's name:weight is printed first, followed by the left and
  then the right subtree, each one level deeper.
  """
  print(f'{T["name"]}:{T["weight"]}')
  children = [T['left-child'], T['right-child']]
  for child in children:
    if child is not None:
      margin = '    ' * level
      print(margin, '|')
      # no newline here: the recursive call finishes the line
      print(margin, '|___', end='')
      print_tree(child, level + 1)


# Sample inputs. Each assignment overwrites the one before it, so only the
# last message is the one that actually gets processed below.
message = "aardvark"
message = "mississippi"
message = "firstthreeodds"
message = "huffman"
message = "trees"
message = "data"
message = "compression"
message = "abcdefghijklmnopqrstuvwxyz lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum"
# Frequency table -> queue of nodes -> tree, printing each intermediate result.
freq_table = make_freq_table(message)
print(freq_table)
Q = make_q(freq_table)
print(Q)
T = huffman(Q)
print(T)
# Draw the tree, then derive and show the code of every symbol.
print_tree(T)
make_codes(T)
print(codes)

## Scriptures

In [None]:
# Download the text file into the working directory (-s: silent, -O: keep the
# remote file name), then show its first 20 lines.
!curl -s -O https://byui-cse.github.io/cse280-course/scriptures.txt
!head -20 scriptures.txt

In [None]:
# Compress a copy so that scriptures.txt itself stays available, then compare
# the byte counts of the original and the gzip-compressed copy.
!cp scriptures.txt scriptures.tmp
!gzip scriptures.tmp
!wc -c scriptures.txt      # prints the byte count
!wc -c scriptures.tmp.gz


What is the compression ratio for gzip? Here it is measured as the fraction of the original file size that compression saves.

In [None]:
# Fraction of space saved: (original size - compressed size) / original size.
# NOTE(review): the two constants are presumably the byte counts printed by
# the wc commands above -- confirm they still match the downloaded file.
(6285497 - 1824084) / 6285497

In [None]:

# Read the whole text file into a single string.
with open('scriptures.txt') as f:
  message = f.read()


# a better way to make a freq table
# Counter counts the occurrences of each character of the message.
from collections import Counter
freq_table = Counter(message)

print(freq_table)

In [None]:
# Build the weight-ordered queue of nodes from the character counts.
Q = make_q(freq_table)
print(Q)

In [None]:
# Merge the queue into a single Huffman tree.
T = huffman(Q)

In [None]:
# Start from an empty dict so codes from the earlier examples are dropped,
# then fill it from the new tree.
codes = {}
make_codes(T)
print(codes)

In [None]:
# calculate compression ratio
# Compares three encodings of the message: fixed-width codes that are just
# wide enough for the alphabet, 8 bits per symbol, and the Huffman codes.

# How many unique symbols?
unique_symbols = len(freq_table)
print(f'{unique_symbols} unique symbols\n')

# How many bits would it take to represent that many unique symbols using fixed encoding?
# At least one bit is needed even when there is only one distinct symbol.
from math import log2, ceil
bits_per_symbol_fixed = max(1, ceil(log2(unique_symbols)))
print(f'{bits_per_symbol_fixed} bits per symbol')


# How many total bits would it take to represent the scriptures?
total_bits_fixed = bits_per_symbol_fixed * len(message)
print(f'Total bits fixed: {total_bits_fixed}')
# Round up: a partly filled final byte still has to be stored.
print(f'Total bytes fixed: {(total_bits_fixed + 7) // 8}\n')


# How many total bits to represent the scriptures using the original encoding of 8-bits per symbol?
# NOTE(review): len(message) counts characters; it equals the byte count only
# if the text is pure ASCII -- confirm for scriptures.txt.
total_bits_ascii = 8 * len(message)
print('Using 8-bits per symbol (ASCII):')
print(f'Total bits: {total_bits_ascii}')
print(f'Total bytes: {len(message)}\n')


# How many bits using our variable length encoding?
# Each symbol costs (length of its code) bits every time it occurs.
total_bits_variable = sum(
  len(codes[symbol]) * count for symbol, count in freq_table.items()
)

bits_per_symbol_variable = total_bits_variable / len(message)

print(f'{bits_per_symbol_variable:.2f} average bits per symbol (variable)')
print(f'Total bits variable: {total_bits_variable}\n')
# The label uses the computed width rather than a hard-coded 7 so it stays
# correct for inputs with a different alphabet size.
print(f'Compression (compared with {bits_per_symbol_fixed}-bit fixed): {(total_bits_fixed - total_bits_variable) / total_bits_fixed}')
print(f'Compression (compared with 8-bit ASCII): {(total_bits_ascii - total_bits_variable) / total_bits_ascii}')

