# Assignment 8 -  Multimedia Lab
### Name - Anirban Dey
### Roll -  002111001108

# Problem Description

Implement the Huffman coding algorithm and the Shannon–Fano coding algorithm.

# Documentation

In [None]:
class Node:
    """Binary-tree node holding a symbol and its (possibly summed) frequency.

    Attributes are assigned in ``__init__``:
      num   -- the number whose frequency is measured
      val   -- the frequency, or a sum of frequencies
      left  -- left child, initially ``None``
      right -- right child, initially ``None``
    """

    def __init__(self, num, val):
        self.num, self.val = num, val
        # A fresh node starts without children.
        self.left = self.right = None

    def __lt__(self, other):
        """Order nodes by ``num`` only; ``val`` takes no part in the comparison."""
        return self.num < other.num

In [None]:
from collections import Counter
from typing import List,Dict
import os
import time
import heapq
import bitarray as ba
import pickle
import tarfile
import struct

# assert bitarray.test().wasSuccessful()

class Huffman():
    """Byte-oriented Huffman compressor / decompressor.

    Layout of a compressed file written by ``encode``: a sequence of
    sections, each prefixed with its size as an 8-byte big-endian integer.

      1. the encoded bit stream, zero-padded to a whole number of bytes
      2. the pickled ``{symbol: code}`` dictionary
      3. the number of valid bits in section 1 (8-byte big-endian integer)

    Section 3 lets ``decode`` ignore the padding bits. Files that contain
    only sections 1 and 2 are still accepted; for those every stored bit
    is decoded.
    """

    # Width in bytes of the size prefix in front of every section.
    _SECTION_HEADER_SIZE = 8

    def __init__(self):
        pass

    def encode(self, input_path: str, output_path: str):
        """Compress ``input_path`` into ``output_path`` and print statistics.

        input_path : uncompressed file
        output_path : compressed file
        """
        init_time = time.time()
        data_list: List[int] = self.getDataListFromFile(input_path)
        huffman_code = self.generateHuffmanCode(data_list)

        # Convert each code string to a bitarray once, not once per input byte.
        code_bits = {symbol: ba.bitarray(code) for symbol, code in huffman_code.items()}
        encoded_bitarray = ba.bitarray()
        for num in data_list:
            encoded_bitarray.extend(code_bits[num])

        # tobytes() zero-pads the last byte, so the exact bit count is stored
        # as a third section; without it the padding could decode as symbols.
        sections = [
            encoded_bitarray.tobytes(),
            pickle.dumps(huffman_code),
            len(encoded_bitarray).to_bytes(self._SECTION_HEADER_SIZE, byteorder="big"),
        ]
        with open(output_path, "wb") as f:
            f.write(self._pack_sections(sections))

        finish_time = time.time()
        original_size = os.path.getsize(input_path)
        compressed_size = os.path.getsize(output_path)

        # Average bits per encoded symbol (frequency-weighted); 0 for empty input.
        average_code_length = len(encoded_bitarray) / len(data_list) if data_list else 0.0
        compression_ratio = original_size / compressed_size if compressed_size else 0.0
        # An empty input cannot be shrunk; avoid dividing by zero.
        efficiency = (1 - (compressed_size / original_size)) * 100 if original_size else 0.0

        print("Time taken to compress file (in seconds) : ", finish_time - init_time )
        print("Size of input file (in bytes) : ",original_size)
        print("Size of output file (in bytes) : ",compressed_size)
        print("Average Code Length : ",average_code_length)
        print("Compression Ratio : ",compression_ratio)
        print(f"Efficiency : {efficiency}%")

    def decode(self, input_path: str, output_path: str):
        """Restore the original bytes of a file produced by ``encode``.

        input_path : compressed file
        output_path : decompressed file

        Raises ValueError if the file is truncated or has fewer than the two
        mandatory sections.
        """
        with open(input_path, "rb") as f:
            sections = self._unpack_sections(f.read())
        if len(sections) < 2:
            raise ValueError(f"{input_path!r} is not a valid compressed file")

        encoded_bitarray = ba.bitarray()
        encoded_bitarray.frombytes(sections[0])

        # NOTE(security): pickle can execute arbitrary code while loading.
        # Only decode files that come from a trusted source.
        huffman_code = pickle.loads(sections[1])

        # Two-section files carry no bit count: decode every stored bit.
        bit_length = None
        if len(sections) > 2:
            bit_length = int.from_bytes(sections[2], byteorder="big")

        decoded = self._decode_bits(encoded_bitarray, huffman_code, bit_length)
        with open(output_path, "wb") as of:
            of.write(decoded)

    def _decode_bits(self, bits, huffman_code: Dict[int, str], bit_length=None) -> bytes:
        """Translate a sequence of bits back into bytes using ``huffman_code``.

        bits : sliceable sequence of truthy/falsy bit values
        huffman_code : mapping of byte value -> code string of '0'/'1'
        bit_length : number of leading bits that are real data; ``None``
                     means all of them

        Bits left over at the end that do not form a complete code are ignored.
        """
        if bit_length is not None:
            bits = bits[:bit_length]

        # Reverse table built once; setdefault keeps the first symbol should
        # two symbols ever share a code.
        symbol_for_code = {}
        for symbol, code in huffman_code.items():
            symbol_for_code.setdefault(code, symbol)

        decoded = bytearray()
        current_code = ""
        for bit in bits:
            current_code += "1" if bit else "0"
            if current_code in symbol_for_code:
                decoded += struct.pack('B', symbol_for_code[current_code])
                current_code = ""
        return bytes(decoded)

    @staticmethod
    def _pack_sections(sections) -> bytes:
        """Concatenate byte strings, each prefixed with its 8-byte big-endian size."""
        parts = []
        for content in sections:
            parts.append(len(content).to_bytes(8, byteorder='big'))
            parts.append(content)
        return b"".join(parts)

    @staticmethod
    def _unpack_sections(blob: bytes) -> List[bytes]:
        """Split data produced by ``_pack_sections`` back into its sections.

        Raises ValueError when a size prefix or a section body is cut short.
        """
        sections = []
        index = 0
        while index < len(blob):
            header = blob[index:index + 8]
            if len(header) < 8:
                raise ValueError("truncated section header in merged data")
            size = int.from_bytes(header, byteorder='big')
            index += 8

            content = blob[index:index + size]
            if len(content) < size:
                raise ValueError("truncated section content in merged data")
            index += size
            sections.append(content)
        return sections

    def merge_files(self, file_list: List[str], merged_filename: str):
        """Write every file in ``file_list`` into ``merged_filename``.

        Each file's content is preceded by its size as an 8-byte big-endian
        integer, in the order given by ``file_list``.
        """
        contents = []
        for filename in file_list:
            with open(filename, 'rb') as file:
                contents.append(file.read())

        with open(merged_filename, 'wb') as merged_file:
            merged_file.write(self._pack_sections(contents))

    def separate_files(self, merged_filename: str, output_dir: str):
        """Split a file written by ``merge_files`` into ``tmp_file<N>.bin`` files.

        The files are created in ``output_dir`` and numbered from 1 in the
        order they were merged. Returns the list of created paths.
        Raises ValueError if the merged file is truncated.
        """
        with open(merged_filename, 'rb') as merged_file:
            sections = self._unpack_sections(merged_file.read())

        separated_files = []
        for file_number, file_content in enumerate(sections, start=1):
            output_filename = os.path.join(output_dir, f"tmp_file{file_number}.bin")
            with open(output_filename, 'wb') as output_file:
                output_file.write(file_content)
            separated_files.append(output_filename)
        return separated_files

    def getDataListFromFile(self, file_path: str) -> List[int]:
        """Return the content of ``file_path`` as a list of byte values (0-255)."""
        with open(file_path, "rb") as f:
            # Iterating over bytes yields ints, so one read replaces a per-byte loop.
            return list(f.read())

    def isLeafNode(self, node: "Node"):
        """Return True when ``node`` exists and has neither a left nor a right child."""
        return node is not None and node.left is None and node.right is None

    def dfs(self, node: "Node", cur_str: str, huffman_code: Dict[int, str]):
        """Walk the tree below ``node`` and record the code of every leaf.

        ``cur_str`` is the path from the root so far ('0' = left, '1' = right).
        Results are stored in ``huffman_code`` as ``{node.num: path}``.
        """
        if node is None:
            return

        if self.isLeafNode(node):
            huffman_code[node.num] = cur_str
            return

        self.dfs(node.left, cur_str + "0", huffman_code)
        self.dfs(node.right, cur_str + "1", huffman_code)

    def generateHuffmanCode(self, data_list: List[int]) -> Dict[int, str]:
        """Build the Huffman code table for the values in ``data_list``.

        Returns a dict mapping each distinct value to its code string of
        '0'/'1'. An empty input yields an empty dict. An input with a single
        distinct value yields the one-bit code "0", because an empty code
        would encode nothing.
        """
        # calculate frequency distribution of numbers
        freqMap = Counter(data_list)
        if not freqMap:
            return {}
        if len(freqMap) == 1:
            (only_symbol,) = freqMap
            return {only_symbol: "0"}

        pq = []
        for char, freq in freqMap.items():
            heapq.heappush(pq, (freq, Node(num=char, val=freq)))

        while len(pq) > 1:
            # The heap pops in non-decreasing frequency order, so top1 never
            # has a larger frequency than top2 and no swap is required.
            top1 = heapq.heappop(pq)
            top2 = heapq.heappop(pq)

            # Internal nodes use key -1: they do not correspond to any byte value.
            sum_val = top1[0] + top2[0]
            newNode = Node(num=-1, val=sum_val)
            newNode.left = top1[1]
            newNode.right = top2[1]

            heapq.heappush(pq, (sum_val, newNode))

        root_of_tree = pq[0][1]
        huffman_code: Dict[int, str] = {}
        self.dfs(root_of_tree, "", huffman_code)
        return huffman_code
    

In [None]:

# Round-trip demo: compress a text file, then restore it from the compressed file.
hf = Huffman()
hf.encode(input_path="test_input.txt",output_path="test_compressed_file.bin")
# The restored file holds the same bytes as the .txt input, so it is written
# with a .txt extension (it was previously written to a .png path).
hf.decode(input_path="test_compressed_file.bin",output_path="test_decompressed_file.txt")
