# Exercise 6: Data Streams [100 pts]

## 1. Querying Users [30 pts]

In [None]:
from collections import defaultdict

# Tally how often each query string shows up in the incoming event stream.
query_counts = defaultdict(int)  # query string -> number of occurrences

# Consume events from the dispatcher until the remaining-event budget hits zero.
# NOTE(review): `dispatcher` and `max_evs` are not defined in this cell —
# presumably set up earlier in the notebook; confirm before running.
for ev in dispatcher.launch():
    # NOTE(review): assumes ev['msg'] carries the query text — verify against click-feeder.py
    query_counts[ev['msg']] += 1

    max_evs -= 1
    if max_evs == 0:
        break

# Group the queries by the number of times each one was observed.
frequency_buckets = defaultdict(list)  # occurrence count -> queries seen that many times
for seen_query, times_seen in query_counts.items():
    frequency_buckets[times_seen].append(seen_query)

# Report, in ascending order of frequency, how many distinct queries fall in each bucket.
for freq, bucket in sorted(frequency_buckets.items()):
    print(f"Queries seen {freq} time(s): {len(bucket)} queries")


Strategy: The simplest and most accurate method here would be to process every event during the 12-minute window, since it's feasible (only 6000 events, and memory is not a constraint). We can then tally query frequencies using a dictionary.

## 2. Bloom Filter [40 pts]

### [10 pts] Create a Bloom Filter, approximately 1000-2000 bits in size, for detecting bad words (i.e., AFINN of -4 or -5). It should be designed to run in Spark.

In [None]:
import base64
import hashlib

class BloomFilter:
    """Fixed-size Bloom filter over strings, backed by salted MD5 hashes.

    The filter keeps ``size`` bits (stored as a list of 0/1 ints) and sets
    ``num_hashes`` positions per added word. Membership checks can yield
    false positives but never false negatives.
    """

    def __init__(self, size=2048, num_hashes=3):
        """Create an empty filter.

        Args:
            size: Number of bits in the filter; must be at least 1.
            num_hashes: Number of bit positions derived per word; must be
                at least 1.

        Raises:
            ValueError: If ``size`` or ``num_hashes`` is smaller than 1.
        """
        # A zero-sized filter would divide by zero when hashing, and a filter
        # with no hash functions would report every word as present.
        if size < 1:
            raise ValueError("size must be a positive number of bits")
        if num_hashes < 1:
            raise ValueError("num_hashes must be at least 1")
        self.size = size
        self.num_hashes = num_hashes
        self.bit_vector = [0] * size

    def _hashes(self, word):
        """Return the ``num_hashes`` bit positions for ``word``.

        Each position is the MD5 digest of ``"<word>_<i>"`` reduced modulo
        the filter size, so the i-th salt acts as the i-th hash function.
        """
        return [
            int(hashlib.md5(f"{word}_{i}".encode()).hexdigest(), 16) % self.size
            for i in range(self.num_hashes)
        ]

    def add(self, word):
        """Set every bit position associated with ``word``."""
        for idx in self._hashes(word):
            self.bit_vector[idx] = 1

    def check(self, word):
        """Return True if ``word`` may be in the filter, False if it is definitely absent."""
        return all(self.bit_vector[idx] for idx in self._hashes(word))

    def to_base64(self):
        """Serialize the bit vector to a Base64 string.

        Bits are packed most-significant-bit first, eight per byte. When the
        bit count is not a multiple of 8 the final byte is zero-padded on the
        right, so bit ``i`` always lands in byte ``i // 8`` at offset ``i % 8``.
        """
        packed = bytearray((len(self.bit_vector) + 7) // 8)
        for position, bit in enumerate(self.bit_vector):
            if bit:
                packed[position // 8] |= 0x80 >> (position % 8)
        return base64.b64encode(packed).decode()


### [10 pts] The bit vector should be placed in HDFS as a Base64-encoded text file and loaded into Spark from HDFS.

In [None]:
# Placeholder for the word list; fill in before running.
bad_words = [...]  # List of ~65 words with AFINN scores -4 or -5

# Populate the filter with the lowercased form of every bad word.
bf = BloomFilter(num_hashes=3, size=2048)
for entry in bad_words:
    normalized = entry.lower()
    bf.add(normalized)

# Persist the Base64-encoded bit vector as a local text file.
with open("bloomfilter.b64", "w") as out_file:
    out_file.write(bf.to_base64())

I ran the following command to put it in HDFS:

        hdfs dfs -put bloomfilter.b64 /user/brianfarrell/bloomfilter.b64


### [15 pts] Integrate the Bloom Filter into Spark such that every arriving sentence is examined and passed along if none of the words in the sentence are bad words. Sentences that do contain bad words should be suppressed.

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import base64

# Initialize Spark; the second StreamingContext argument is the batch duration in seconds
sc = SparkContext(appName="DrunkSpeechFilter")
ssc = StreamingContext(sc, 1)

# Load Bloom filter from HDFS. The path must match the destination used by the
# `hdfs dfs -put` upload (/user/brianfarrell/bloomfilter.b64); the file holds a
# single Base64 line, so the first collected record is the whole payload.
b64_string = sc.textFile("hdfs:///user/brianfarrell/bloomfilter.b64").collect()[0]

# Expand each decoded byte into its 8 bits, most significant bit first, so bit i
# of the filter sits at index i. Deriving the length from the payload (instead
# of zero-filling to a hard-coded 2048) keeps the bits aligned for any size.
bit_array = list(''.join(f"{byte:08b}" for byte in base64.b64decode(b64_string)))
bit_vector = list(map(int, bit_array))

# Reconstruct BloomFilter logic
import hashlib

def get_hashes(word, size=2048, num_hashes=3):
    """Return the Bloom filter bit positions for ``word``.

    The i-th position is the MD5 digest of ``"<word>_<i>"``, read as a
    big-endian integer and reduced modulo ``size``.
    """
    positions = []
    for salt in range(num_hashes):
        digest = hashlib.md5(f"{word}_{salt}".encode()).digest()
        positions.append(int.from_bytes(digest, "big") % size)
    return positions

def is_bad(word):
    """Return True when every Bloom filter bit for the lowercased word is set.

    A True result means the word is possibly a bad word (false positives
    can occur); False means it is definitely not in the filter.
    """
    for position in get_hashes(word.lower()):
        if not bit_vector[position]:
            return False
    return True

def contains_bad_words(sentence):
    """Return True if any word in ``sentence`` is flagged by the Bloom filter.

    The sentence is split on whitespace and each token has leading and
    trailing punctuation removed before the lookup, so "word!" or "(word)"
    is tested as "word". Internal punctuation such as hyphens is preserved.
    Tokens that consist solely of punctuation are skipped.
    """
    import string  # local import keeps this function self-contained in the cell

    for token in sentence.split():
        word = token.strip(string.punctuation)
        # Skip empty remnants: hashing "" could only produce a false positive.
        if word and is_bad(word):
            return True
    return False

# Input DStream: text lines received over a TCP socket on localhost:9999
lines = ssc.socketTextStream("localhost", 9999)


def _is_clean(sentence):
    """Return True when no word in the sentence is flagged as bad."""
    return not contains_bad_words(sentence)


# Pass along only the sentences in which no bad word was detected
clean_sentences = lines.filter(_is_clean)

# Print the surviving sentences of each batch
clean_sentences.pprint()

# Start the streaming computation and block until it terminates
ssc.start()
ssc.awaitTermination()

## 3. Counting Unique Users [30 pts]

### Modify read_stdin.py to implement the HyperLogLog algorithm. Increase the number of senders and decrease the (μ,σ) of the delay between queries until the receiver can no longer keep up! Draw a graph of the estimated number of users as a function of elapsed time.


In [None]:
#!/usr/bin/env python3

import sys
import time
import hyperloglog

# Initialize HyperLogLog with a target relative error of ~1%
# NOTE(review): the memory footprint depends on how many registers the library
# allocates for this error rate — measure it rather than assuming a figure.
hll = hyperloglog.HyperLogLog(0.01)

# Track elapsed time with a monotonic clock so samples are unaffected by
# wall-clock adjustments while the stream is running
start_time = time.monotonic()
interval = 1  # seconds between measurements
next_tick = start_time + interval

# (elapsed seconds, estimated unique users) samples, kept for graphing later
log = []

print("Elapsed(s)\tEstimated Unique Users", flush=True)

try:
    for line in sys.stdin:
        try:
            user, query, timestamp = line.strip().split('\t')
        except ValueError:
            continue  # skip malformed lines

        hll.add(user)

        now = time.monotonic()
        if now >= next_tick:
            elapsed = int(now - start_time)
            est_count = len(hll)
            print(f"{elapsed}\t{est_count}", flush=True)
            log.append((elapsed, est_count))
            # Skip every tick that was missed while the input stalled, so a
            # single late event yields one sample instead of a burst of
            # back-to-back catch-up samples.
            while next_tick <= now:
                next_tick += interval

    # Input is exhausted: emit a final sample so the last partial interval
    # (or a run shorter than one interval) still appears in the output.
    elapsed = int(time.monotonic() - start_time)
    est_count = len(hll)
    if not log or log[-1] != (elapsed, est_count):
        print(f"{elapsed}\t{est_count}", flush=True)
        log.append((elapsed, est_count))

except KeyboardInterrupt:
    print("Streaming stopped by user.")
    sys.exit(0)


Command: python3 click-feeder.py | python3 read_stdin.py