# Part 1: Kafka Producer

In [1]:
import datetime, time, random, string

def one_station(name):
    # temp pattern
    month_avg = [27,31,44,58,70,79,83,81,74,61,46,32]
    shift = (random.random()-0.5) * 30
    month_avg = [m + shift + (random.random()-0.5) * 5 for m in month_avg]
    
    # rain pattern
    start_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    shift = (random.random()-0.5) * 0.1
    start_rain = [r + shift + (random.random() - 0.5) * 0.2 for r in start_rain]
    stop_rain = 0.2 + random.random() * 0.2

    # day's state
    today = datetime.date(2000, 1, 1)
    temp = month_avg[0]
    raining = False
    
    # gen weather
    while True:
        # choose temp+rain
        month = today.month - 1
        temp = temp * 0.8 + month_avg[month] * 0.2 + (random.random()-0.5) * 20
        if temp < 32:
            raining=False
        elif raining and random.random() < stop_rain:
            raining = False
        elif not raining and random.random() < start_rain[month]:
            raining = True

        yield (today.strftime("%Y-%m-%d"), name, temp, raining)

        # next day
        today += datetime.timedelta(days=1)
        
def all_stations(count=10, sleep_sec=1):
    assert count <= 26
    stations = []
    for name in string.ascii_uppercase[:count]:
        stations.append(one_station(name))
    while True:
        for station in stations:
            yield next(station)
        time.sleep(sleep_sec)

In [2]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

time.sleep(1)
admin.create_topics([NewTopic("stations", 6, 1)])
admin.create_topics([NewTopic("stations-json", 6, 1)])
admin.list_topics()

deleted


['stations-json', 'stations', '__consumer_offsets']

In [3]:
# Building protobuf file
! python3 -m grpc_tools.protoc -I=. --python_out=. stations.proto

In [3]:
import json, threading
from stations_pb2 import *

def produce():
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], retries=10, acks='all')
    
    for date, station, degrees, raining in all_stations(15):
        # protobuf
        r = Report(date=date, station=station, degrees=degrees, raining=raining)
        value1 = r.SerializeToString()
        key = bytes(station, "utf-8")
        producer.send("stations", value=value1, key=key)
        
        # JSON
        value2 = {"date":date, "station":station, "degrees":degrees, "raining":raining}
        
        if raining == False:
            value2["raining"] = 0
        else:
            value2["raining"] = 1
            
        value2 = bytes(json.dumps(value2), "utf-8")
        producer.send("stations-json", value=value2, key=key)
        
# never join thread because we want it to run forever
threading.Thread(target=produce, args=()).start()

# Part 2: Kafka Consumer

In [4]:
import os, json

for partition in range(6):
    path = f"partition-{partition}.json"
    if os.path.exists(path):
        os.remove(path)

In [5]:
# Loading and saving a partition
def load_partition(partition_num):
    path = f"partition-{partition_num}.json"
    if os.path.exists(path):
        print("TRUE")
        with open(path, "r") as file:
            return json.load(file)
    else:
        return {"offset":None, "partition":partition_num}
    
def save_partition(partition):
    path = f"partition-{partition['partition']}.json"
    with open(path, "w") as file:
        json.dump(partition, file)

In [6]:
def consume(part_nums=[], iterations=10):
    consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])
    consumer.assign([TopicPartition("stations", part_num) for part_num in part_nums])

    # PART 1: initialization
    partitions = {} # key=partition num, value=snapshot dict
    for partition_num in part_nums:
        res = load_partition(partition_num)
        if res["offset"] == None:
            consumer.seek_to_beginning()
        else:
            offset = res["offset"]
            consumer.seek(TopicPartition("stations", partition_num), offset)
        save_partition(res)
        partitions[partition_num] = load_partition(partition_num)
        
    # PART 2: process batches
    for i in range(iterations):
        batch = consumer.poll(1000) # 1s timeout
        for topic, messages in batch.items():
            temp_count = 0
            temp_sum = 0
            for msg in messages:
                r = Report.FromString(msg.value)
                partition = msg.partition
                station_key = r.station
                temp = r.degrees
                current_offset = consumer.position(topic)
                
                temp_count += 1
                temp_sum += temp
                # avg = temp_sum / temp_count
                current_date = r.date

                if station_key not in partitions[partition]:
                    partitions[partition][station_key] = {}
                    partitions[partition][station_key]["start"] = current_date
                    partitions[partition][station_key]["count"] = temp_count
                    partitions[partition][station_key]["sum"] = temp_sum
                    partitions[partition][station_key]["avg"] = 0
                else:
                    previous_date = partitions[partition][station_key]["end"]
                    if current_date <= previous_date:
                        continue
                    
                partitions[partition][station_key]["count"] += temp_count
                partitions[partition][station_key]["sum"] += temp_sum
                partitions[partition][station_key]["avg"] = partitions[partition][station_key]["sum"] / partitions[partition][station_key]["count"]
                partitions[partition][station_key]["end"] = current_date
                partitions[partition]["offset"] = current_offset
                
        for part_num in partitions:
            save_partition(partitions[partition])
    print("exiting")

for i in range(2):
    print("ROUND", i)
    t1 = threading.Thread(target=consume, args=([0,1], 30))
    t2 = threading.Thread(target=consume, args=([2,3], 30))
    t3 = threading.Thread(target=consume, args=([4,5], 30))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()

ROUND 0
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE


Exception in thread Thread-6 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_226/1061262091.py", line 49, in consume
NameError: name 'current_offset' is not defined
Exception in thread Thread-7 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_226/1061262091.py", line 49, in consume
NameError: name 'current_offset' is not defined
Exception in thread Thread-8 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._targ

ROUND 1
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE


Exception in thread Thread-9 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_226/1061262091.py", line 49, in consume
Exception in thread Thread-10 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
NameError: name 'current_offset' is not defined
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_226/1061262091.py", line 49, in consume
NameError: name 'current_offset' is not defined
Exception in thread Thread-11 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._ta

In [9]:
!cat partition*.json

{"offset": 61, "partition": 0, "N": {"start": "2000-01-01", "end": "2000-03-01", "avg": 54.239378098600966, "count": 1, "sum": 54.239378098600966}}{"offset": 124, "partition": 1, "E": {"start": "2000-01-01", "end": "2000-03-02", "avg": 29.84173081380031, "count": 1, "sum": 29.84173081380031}, "O": {"start": "2000-01-01", "end": "2000-03-02", "avg": 33.29131050256177, "count": 2, "sum": 66.58262100512354}}{"offset": 180, "partition": 2, "F": {"start": "2000-01-01", "end": "2000-02-29", "avg": 31.970912694630492, "count": 1, "sum": 31.970912694630492}, "I": {"start": "2000-01-01", "end": "2000-02-29", "avg": 33.619682135409555, "count": 2, "sum": 67.23936427081911}, "J": {"start": "2000-01-01", "end": "2000-02-29", "avg": 30.661771334919717, "count": 3, "sum": 91.98531400475915}}{"offset": 189, "partition": 3, "D": {"start": "2000-01-01", "end": "2000-03-03", "avg": 50.80060022340461, "count": 1, "sum": 50.80060022340461}, "G": {"start": "2000-01-01", "end": "2000-03-03", "avg": 45.35397