# Part 1: Kafka Producer

In [1]:
import os
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [2]:
import datetime, time, random, string

def one_station(name):
    """Endlessly generate simulated daily weather for a single station.

    Each station gets its own randomly perturbed monthly temperature targets
    and monthly chances of rain starting, plus one chance of rain stopping.
    The simulation starts on 2000-01-01 and advances one day per item.

    Args:
        name: station identifier, passed through unchanged in every item.

    Yields:
        (date, name, temp, raining) where date is a "YYYY-MM-DD" string,
        temp is a float and raining is a bool.
    """
    # Temperature profile: one station-wide bias plus a small per-month jitter.
    baseline_temps = [27,31,44,58,70,79,83,81,74,61,46,32]
    temp_bias = (random.random()-0.5) * 30
    monthly_targets = []
    for base in baseline_temps:
        monthly_targets.append(base + temp_bias + (random.random()-0.5) * 5)

    # Rain profile: per-month chance that a dry day turns rainy, and a single
    # chance that a rainy day turns dry.
    baseline_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    rain_bias = (random.random()-0.5) * 0.1
    rain_start_chance = []
    for base in baseline_rain:
        rain_start_chance.append(base + rain_bias + (random.random() - 0.5) * 0.2)
    rain_stop_chance = 0.2 + random.random() * 0.2

    # Simulation state carried from one day to the next.
    current_day = datetime.date(2000, 1, 1)
    one_day = datetime.timedelta(days=1)
    temp = monthly_targets[0]
    raining = False

    while True:
        month_idx = current_day.month - 1

        # Drift 20% of the way towards this month's target, plus daily noise.
        noise = (random.random()-0.5) * 20
        temp = temp * 0.8 + monthly_targets[month_idx] * 0.2 + noise

        if temp < 32:
            # Below 32 degrees the station never reports rain.
            raining = False
        elif raining:
            if random.random() < rain_stop_chance:
                raining = False
        elif random.random() < rain_start_chance[month_idx]:
            raining = True

        yield (current_day.strftime("%Y-%m-%d"), name, temp, raining)

        current_day += one_day
        
def all_stations(count=10, sleep_sec=1):
    """Interleave the daily reports of several simulated stations, forever.

    Stations are named with the first `count` uppercase ASCII letters. Each
    pass yields one report per station (in name order) and then sleeps.

    Args:
        count: number of stations; must not exceed 26 (one per letter).
        sleep_sec: seconds to pause after each full pass over the stations.

    Yields:
        The next item produced by each station's one_station() generator.
    """
    assert count <= 26
    generators = [one_station(letter) for letter in string.ascii_uppercase[:count]]
    while True:
        for generator in generators:
            yield next(generator)
        # Pace the stream: one simulated day per station per sleep interval.
        time.sleep(sleep_sec)

In [3]:
# Recreate the "stations" and "stations-json" topics from scratch so every
# run of the notebook starts with empty topics.
admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

# Deletion finishes asynchronously on the broker, so a topic can still be
# reported as existing for a while after delete_topics() returns. Retry the
# creation for a bounded time instead of failing on the first attempt; if the
# topic never goes away, the original TopicAlreadyExistsError is re-raised.
time.sleep(1)
for topic in ("stations", "stations-json"):
    for attempts_left in range(9, -1, -1):
        try:
            # 6 partitions, replication factor 1
            admin.create_topics([NewTopic(topic, 6, 1)])
            break
        except TopicAlreadyExistsError:
            if attempts_left == 0:
                raise
            time.sleep(1)
admin.list_topics()

deleted


['stations-json', 'stations', '__consumer_offsets']

In [4]:
import weather_pb2
from kafka import KafkaAdminClient, KafkaProducer
import threading
import json
import random

def produce():
    """Publish every simulated weather report to Kafka in two encodings.

    Reads reports from all_stations(15) and, for each one, sends a protobuf
    message to the "stations" topic and a JSON message to the "stations-json"
    topic. Both messages are keyed by the station name. The loop runs for as
    long as the generator keeps yielding.
    """
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], retries=10, acks="all")

    for date, station, degrees, raining in all_stations(15):
        # Protobuf encoding of the report.
        proto_payload = weather_pb2.Report(
            date=date,
            station=station,
            degrees=degrees,
            raining=raining,
        ).SerializeToString()

        # JSON encoding of the same report; the rain flag is stored as 0/1.
        record = {
            "date": date,
            "station": station,
            "degrees": degrees,
            "raining": int(bool(raining)),
        }
        json_payload = json.dumps(record).encode("utf-8")

        # Same key for both topics.
        message_key = station.encode("utf-8")
        producer.send("stations", key=message_key, value=proto_payload)
        producer.send("stations-json", key=message_key, value=json_payload)

# Run the producer in the background so the following cells can execute while
# it keeps publishing. produce() loops for as long as its generator yields, so
# the thread is marked as a daemon: a non-daemon thread that never finishes
# would prevent the interpreter from exiting cleanly.
producer_thread = threading.Thread(target=produce, name="weather-producer", daemon=True)
producer_thread.start()

# Part 2: Kafka Consumer

In [5]:
import os, json

# Start from a clean slate: remove any partition-<N>.json checkpoint files
# (N = 0..5) left in the working directory by an earlier run.
stale_checkpoints = [f"partition-{number}.json" for number in range(6)]
for stale_path in filter(os.path.exists, stale_checkpoints):
    os.remove(stale_path)

In [6]:
def load_partition(partition_num):
    """Return the saved checkpoint for a partition, or a fresh one.

    Args:
        partition_num: partition number; selects partition-<N>.json in the
            current working directory.

    Returns:
        The parsed JSON content of the checkpoint file when it exists,
        otherwise a new dict with the partition number, offset 0 and an
        empty "stations" mapping.
    """
    checkpoint_path = f"partition-{partition_num}.json"
    if not os.path.exists(checkpoint_path):
        # Nothing saved yet: start from the beginning of the partition.
        return {
            "partition": partition_num,
            "offset": 0,
            "stations": {}
        }
    with open(checkpoint_path, "r") as checkpoint_file:
        return json.load(checkpoint_file)
def save_partition(partition):
    """Atomically write a partition checkpoint to partition-<N>.json.

    The data is first written to a sibling "<path>.tmp" file and then moved
    over the real path with os.replace(). Opening the real path for writing
    directly would truncate the previous checkpoint before the new one is
    complete, so a failure mid-write would leave a corrupt file behind.

    Args:
        partition: JSON-serializable dict; its "partition" entry selects the
            file name.

    Raises:
        Whatever open(), json.dump() or os.replace() raise. In that case the
        previously saved checkpoint (if any) is left untouched.
    """
    path = f"partition-{partition['partition']}.json"
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w") as file:
            json.dump(partition, file)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; keep the old checkpoint.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

In [7]:
def _record_report(stations, report):
    """Fold one weather report into the per-station running statistics.

    A station seen for the first time gets a new entry seeded from the report.
    For a known station the report only counts when its date is later than the
    station's last recorded date, so re-delivered or older reports are ignored.
    Dates are compared as strings.
    """
    stats = stations.get(report.station)
    if stats is None:
        stations[report.station] = {
            "sum": report.degrees,
            "count": 1,
            "avg": report.degrees,
            "start": report.date,
            "end": report.date
        }
        return

    if report.date > stats["end"]:
        stats["sum"] += report.degrees
        stats["count"] += 1
        stats["avg"] = stats["sum"] / stats["count"]
        stats["end"] = report.date


def consume(part_nums=[], iterations=10):
    """Consume "stations" partitions and keep per-station statistics on disk.

    For every assigned partition the saved checkpoint is loaded and the
    consumer seeks to the saved offset. Each polled batch is then folded into
    the statistics, and the checkpoint (statistics plus next offset, in one
    file) is saved once per partition per batch.

    Args:
        part_nums: partition numbers of the "stations" topic to consume.
            The default list is only read, never mutated.
        iterations: number of poll() calls to make before returning.
    """
    consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])
    try:
        consumer.assign([TopicPartition("stations", p) for p in part_nums])

        # PART 1: initialization - resume each partition where it left off
        states = {}
        for partition_num in part_nums:
            state = load_partition(partition_num)
            states[partition_num] = state
            consumer.seek(TopicPartition("stations", partition_num), state["offset"])

        # PART 2: process batches
        for _ in range(iterations):
            batch = consumer.poll(1000)  # timeout_ms
            for topic_partition, messages in batch.items():
                if not messages:
                    continue
                state = states[topic_partition.partition]
                for message in messages:
                    report = weather_pb2.Report()
                    report.ParseFromString(message.value)
                    _record_report(state["stations"], report)
                    state["offset"] = message.offset + 1
                # One write per partition per batch; the offset and the
                # statistics it corresponds to always land in the same file.
                save_partition(state)
    finally:
        # Release the consumer's resources even if processing fails.
        consumer.close()
    print("exiting")

# Run the consumers in two rounds. Each round starts three threads, one per
# pair of partitions, and waits for all of them before the next round begins.
for i in range(2):
    print("ROUND", i)
    workers = [
        threading.Thread(target=consume, args=(partition_pair, 30))
        for partition_pair in ([0,1], [2,3], [4,5])
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


ROUND 0
exiting
exiting
exiting
ROUND 1
exiting
exiting
exiting


In [8]:
# Dump every checkpoint file written by the consumers. The files are written
# without a trailing newline, so the JSON objects appear back to back.
!cat partition*.json

{"partition": 0, "offset": 51, "stations": {"N": {"sum": 1907.0000481762952, "count": 51, "avg": 37.39215780737834, "start": "2000-01-01", "end": "2000-02-20"}}}{"partition": 1, "offset": 102, "stations": {"E": {"sum": 1052.364224888139, "count": 51, "avg": 20.63459264486547, "start": "2000-01-01", "end": "2000-02-20"}, "O": {"sum": 1809.8934323964365, "count": 51, "avg": 35.48810651757719, "start": "2000-01-01", "end": "2000-02-20"}}}{"partition": 2, "offset": 174, "stations": {"F": {"sum": 1669.36521184176, "count": 58, "avg": 28.78215882485793, "start": "2000-01-01", "end": "2000-02-27"}, "I": {"sum": 2445.689321747112, "count": 58, "avg": 42.16705727150193, "start": "2000-01-01", "end": "2000-02-27"}, "J": {"sum": 2909.9734211451746, "count": 58, "avg": 50.17195553698577, "start": "2000-01-01", "end": "2000-02-27"}}}{"partition": 3, "offset": 174, "stations": {"D": {"sum": 1652.0992905319451, "count": 58, "avg": 28.484470526412846, "start": "2000-01-01", "end": "2000-02-27"}, "G": 