# Heaps

### 9.1 Compute the running median

In [None]:
import heapq

# Sample input for the running-median demo; it is passed to
# running_median() at the end of this cell.
array = [2, 1, 5, 7, 2, 0, 5]

def get_median(min_heap, max_heap):
    """Return the median of the values held in the two heaps.

    ``min_heap`` stores values as-is, so its root is its smallest value.
    ``max_heap`` stores negated values, so negating its root yields its
    largest value.

    When one heap is longer, its root is the median. When both have the
    same length, the median is the mean of the two roots (a float).

    Raises:
        IndexError: if both heaps are empty.
    """
    upper_size, lower_size = len(min_heap), len(max_heap)

    if upper_size == lower_size:
        # Subtracting the stored (negated) root adds the real value.
        return (min_heap[0] - max_heap[0]) / 2
    if upper_size > lower_size:
        return min_heap[0]
    return -max_heap[0]
    
def add(num, min_heap, max_heap):
    """Insert ``num`` into whichever heap matches its side of the median.

    The very first value goes into ``min_heap``. After that, values
    greater than the current median are pushed onto ``min_heap``; all
    others are pushed, negated, onto ``max_heap``.

    Both heaps are modified in place; nothing is returned.
    """
    # Nothing stored yet: there is no median to compare against.
    if not min_heap and not max_heap:
        heapq.heappush(min_heap, num)
        return

    if num > get_median(min_heap, max_heap):
        target, item = min_heap, num
    else:
        # max_heap keeps negated values so heapq can act as a max-heap.
        target, item = max_heap, -num
    heapq.heappush(target, item)
        
def rebalance(min_heap, max_heap):
    """Move one root across when the heap sizes differ by more than one.

    The root of the longer heap is popped, negated (``max_heap`` stores
    negated values) and pushed onto the other heap. At most one element
    is moved per call. Both heaps are modified in place.
    """
    size_gap = len(min_heap) - len(max_heap)

    if size_gap > 1:
        heapq.heappush(max_heap, -heapq.heappop(min_heap))
    elif size_gap < -1:
        heapq.heappush(min_heap, -heapq.heappop(max_heap))
        
def print_median(min_heap, max_heap):
    """Print the current median of the two heaps to standard output."""
    current = get_median(min_heap, max_heap)
    print(current)
    
def running_median(array):
    """Print the median of ``array`` after each successive element.

    Two heaps are kept: one for the upper half of the values seen so far
    and one (holding negated values) for the lower half. Each element is
    inserted, the heaps are rebalanced, and the median is printed.
    """
    upper_half, lower_half = [], []

    for value in array:
        add(value, upper_half, lower_half)
        rebalance(upper_half, lower_half)
        print_median(upper_half, lower_half)
        
# Run the demo: prints one median per element of the sample array.
running_median(array)

### 9.2 Find most similar websites

In [None]:
# Sample visit log of (site, user) tuples, in the form top_pairs()
# unpacks when it groups users by site.
array = [('google.com', 1), ('google.com', 3), ('google.com', 5), 
 ('pets.com', 1), ('pets.com', 2), ('yahoo.com', 6),
 ('yahoo.com', 2), ('yahoo.com', 3), ('yahoo.com', 4), ('yahoo.com', 5),
 ('wikipedia.org', 4), ('wikipedia.org', 5), ('wikipedia.org', 6), 
 ('wikipedia.org', 7), ('bing.com', 1), ('bing.com', 3), ('bing.com', 5),
 ('bing.com', 6)]

In [None]:
# Jaccard index
def compute_similarity(a, b, visitors):
    """Return the Jaccard index of the visitor sets of sites ``a`` and ``b``.

    ``visitors`` maps a site to the set of its visitors. The result is
    the size of the intersection divided by the size of the union.

    Raises:
        ZeroDivisionError: if both visitor sets are empty.
    """
    first, second = visitors[a], visitors[b]
    shared = first & second
    combined = first | second
    return len(shared) / len(combined)

In [None]:
import heapq
from collections import defaultdict

def top_pairs(log, k):
    """Return the ``k`` most similar pairs of sites found in ``log``.

    Args:
        log: iterable of ``(site, user)`` tuples, one per visit.
        k: maximum number of pairs to return.

    Returns:
        A list of ``(site_a, site_b)`` tuples ordered from most to least
        similar, where similarity is ``compute_similarity`` over the
        sites' visitor sets. Ties are broken by comparing the site-name
        tuples. The list has fewer than ``k`` entries when the log
        contains fewer than ``k`` distinct site pairs, and is empty when
        ``k`` is not positive.
    """
    if k <= 0:
        return []

    # Group the distinct users seen on each site.
    visitors = defaultdict(set)
    for site, user in log:
        visitors[site].add(user)

    sites = list(visitors)

    # Score every unordered pair of distinct sites exactly once.
    scored = (
        (compute_similarity(first, second, visitors), (first, second))
        for i, first in enumerate(sites)
        for second in sites[i + 1:]
    )

    # nlargest keeps only k candidates at a time and returns them sorted
    # best-first. No placeholder entries are needed, so a log with fewer
    # than k pairs never yields bogus ('', '') results.
    best = heapq.nlargest(k, scored)
    return [pair for _, pair in best]
    
# Run the demo: ask for the single most similar pair of sites in the log.
top_pairs(array, 1)