In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  

# CMP 3002 
## Hash Tables

## Hash Table

- Data structure that organizes data using hash functions to support fast insertion and search

- Two kinds of hash tables:
    - hash set
    - hash map

- A hash set is an implementation of a set: a collection that avoids storing repeated values
- Hash map allows us to store key-value (k,v) pairs
    - Can't have duplicate keys


### Hash tables

- Use a hash function to map keys to buckets
- When we insert a new key, the hash function decides which bucket the key should be assigned to
- When we search for a key, the hash table will use the same hash function to find the bucket

By hashing the key into an index, we map each key directly to the location in memory where its value is stored, so we can read the value quickly.


### Hash functions

- Function that can be used to map data of any size to fixed-size values
- A hash function is usually a one-way function (it can't be inverted)
- Used to index hash tables 
- Cryptographic applications



### Hash functions

- The hash function is the most important component of a hash table
- Example: $F(x) = x \bmod 5$
- We need to pick a function with a wide range to avoid collisions 
- The function should assign the key to the bucket in a uniform manner
- Ideally a one-to-one mapping between the key and the bucket
- Hash functions are usually not perfect and there is a tradeoff between the number of buckets and the capacity of a bucket


### Hash functions - collisions

- Collisions are inevitable
- We need an algorithm to solve the following questions:

    - how do we organize values in the same bucket?
    - what happens if the bucket has too many keys assigned?
    - how do we search a target value in a bucket?

## Complexity Analysis

Assuming $N$ keys in total:

- Space complexity is $O(N)$
- Search is $O(1)$ on average, depending on the design of the table. In the worst case (all keys in the same bucket) it can be $O(N)$


### Exercise 1

Design a HashSet without using any built-in hash table libraries.

Implement HashSet class:

- void add(key) Inserts the value key into the HashSet.
- bool contains(key) Returns whether the value key exists in the HashSet or not.
- void remove(key) Removes the value key from the HashSet. If key does not exist in the HashSet, do nothing.
 

In [4]:
class Node:
    """Singly linked list node holding one value."""

    def __init__(self, value, next_node=None):
        self.value = value          # payload stored in this node
        self.next_node = next_node  # following node, or None at the tail


class Bucket:
    """Hash-set bucket: a linked chain of unique values.

    The chain hangs off a sentinel head node, so insertion and deletion
    never need a special case for an empty chain or for the first element.
    """

    def __init__(self):
        # Sentinel only: traversals start at head.next_node, so the 0 stored
        # here is never compared against and is not a member of the bucket.
        self.head = Node(0)

    def insert(self, value):
        """Add `value` at the front of the chain unless it is already present."""
        if not self.exists(value):
            # The new node takes over the current first element as its
            # successor (None when the chain is empty).
            self.head.next_node = Node(value, self.head.next_node)

    def delete(self, value):
        """Unlink the node holding `value`; do nothing if it is absent."""
        prev = self.head
        curr = self.head.next_node
        while curr:
            if curr.value == value:
                prev.next_node = curr.next_node
                return  # values are unique, so one unlink is enough
            prev = curr
            curr = curr.next_node

    def exists(self, value):
        """Return True if `value` is stored in this bucket, else False."""
        curr = self.head.next_node
        while curr:
            if curr.value == value:
                return True
            curr = curr.next_node
        return False

In [5]:
class HashSet(object):
    """Set of integer keys implemented with separate chaining.

    Each key is sent to one of `keyRange` buckets by `key % keyRange`; the
    bucket object is responsible for storing the key and checking duplicates.

    NOTE: buckets are created from whatever `Bucket` is bound to when the set
    is constructed, and they must provide `insert`, `delete` and `exists`.
    A later cell in this notebook defines another class that is also named
    `Bucket` (with `get`/`update`/`remove`), so build HashSet instances
    before running that cell.
    """

    def __init__(self, keyRange=769):
        """
        Create an empty set.

        Args:
            keyRange: number of buckets. Defaults to 769, the value that was
                previously hard-coded.

        Raises:
            ValueError: if `keyRange` is not positive (there would be no
                bucket to hash into).
        """
        if keyRange <= 0:
            raise ValueError("keyRange must be a positive integer")
        self.keyRange = keyRange
        self.bucketArray = [Bucket() for _ in range(self.keyRange)]

    def _hash(self, key):
        """Return the bucket index for `key`.

        Python's `%` yields a result in [0, keyRange) for a positive
        `keyRange`, so negative integer keys map to a valid index too.
        """
        return key % self.keyRange

    def add(self, key):
        """Insert `key` into the set."""
        self.bucketArray[self._hash(key)].insert(key)

    def remove(self, key):
        """Remove `key` from the set."""
        self.bucketArray[self._hash(key)].delete(key)

    def contains(self, key):
        """Return whether `key` is in the set."""
        return self.bucketArray[self._hash(key)].exists(key)
    
# We want to store numbers from 0 to 1,000,000
# 1. store them in an array
# 2. store them in a hashset with keyRange=10,000

### Exercise 2

Design a HashMap without using any built-in hash table libraries.

Implement the HashMap class:

- HashMap() initializes the object with an empty map.
- void put(int key, int value) inserts a (key, value) pair into the HashMap. If the key already exists in the map, update the corresponding value.
- int get(int key) returns the value to which the specified key is mapped, or -1 if this map contains no mapping for the key.
- void remove(key) removes the key and its corresponding value if the map contains the mapping for the key.

In [3]:
class Bucket:
    """Hash-map bucket: a list of (key, value) tuples with unique keys."""

    def __init__(self):
        self.bucket = []  # list of (key, value) tuples, in insertion order

    def get(self, key):
        """Return the value stored for `key`, or -1 if `key` is absent."""
        for k, v in self.bucket:
            if k == key:
                return v
        return -1

    def update(self, key, value):
        """Store `value` under `key`, overwriting any existing entry."""
        for i, (k, _) in enumerate(self.bucket):
            if key == k:
                self.bucket[i] = (key, value)
                return
        # Key not present yet: add a new pair at the end.
        self.bucket.append((key, value))

    def remove(self, key):
        """Delete the entry for `key`; do nothing if `key` is absent."""
        for i, (k, _) in enumerate(self.bucket):
            if key == k:
                del self.bucket[i]
                # Stop right away: keys are unique, and continuing to index
                # a list that just shrank would run past its end.
                return

In [None]:
class HashMap(object):
    """Integer-keyed map implemented with separate chaining.

    Each key is sent to one of `n` buckets by `key % n`; the bucket object
    stores the (key, value) pairs that land in it. Buckets are created from
    the `Bucket` class and must provide `get`, `update` and `remove`.
    """

    def __init__(self, n=2000):
        """
        Create an empty map.

        Args:
            n: number of buckets. Defaults to 2000, the value that was
                previously hard-coded.

        Raises:
            ValueError: if `n` is not positive (there would be no bucket to
                hash into).
        """
        if n <= 0:
            raise ValueError("n must be a positive integer")
        self.n = n
        self.hash_table = [Bucket() for _ in range(self.n)]

    def _hash(self, key):
        """Return the bucket index for `key` (always in [0, n) for ints)."""
        return key % self.n

    def put(self, key, value):
        """Insert the (key, value) pair, replacing the value if `key` exists."""
        self.hash_table[self._hash(key)].update(key, value)

    def get(self, key):
        """Return the value mapped to `key`, as reported by its bucket."""
        return self.hash_table[self._hash(key)].get(key)

    def remove(self, key):
        """Remove `key` and its value if the map contains it."""
        self.hash_table[self._hash(key)].remove(key)