# Part 1: Kafka Producer

In [1]:
import datetime, time, random, string
import os, json

def one_station(name):
    # temp pattern
    month_avg = [27,31,44,58,70,79,83,81,74,61,46,32]
    shift = (random.random()-0.5) * 30
    month_avg = [m + shift + (random.random()-0.5) * 5 for m in month_avg]
    
    # rain pattern
    start_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    shift = (random.random()-0.5) * 0.1
    start_rain = [r + shift + (random.random() - 0.5) * 0.2 for r in start_rain]
    stop_rain = 0.2 + random.random() * 0.2

    # day's state
    today = datetime.date(2000, 1, 1)
    temp = month_avg[0]
    raining = False
    
    # gen weather
    while True:
        # choose temp+rain
        month = today.month - 1
        temp = temp * 0.8 + month_avg[month] * 0.2 + (random.random()-0.5) * 20
        if temp < 32:
            raining=False
        elif raining and random.random() < stop_rain:
            raining = False
        elif not raining and random.random() < start_rain[month]:
            raining = True

        yield (today.strftime("%Y-%m-%d"), name, temp, raining)

        # next day
        today += datetime.timedelta(days=1)
        
def all_stations(count=10, sleep_sec=1):
    assert count <= 26
    stations = []
    for name in string.ascii_uppercase[:count]:
        stations.append(one_station(name))
    while True:
        for station in stations:
            yield next(station)
        time.sleep(sleep_sec)

In [2]:
stations = []
for name in string.ascii_uppercase[:15]:
    stations.append(one_station(name))

In [3]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

time.sleep(1)
admin.create_topics([NewTopic("stations", 6, 1)])
admin.create_topics([NewTopic("stations-json", 6, 1)])
admin.list_topics()

deleted


['animals-json', 'stations-json', 'stations', '__consumer_offsets']

In [4]:
from weather_pb2 import *
import threading

def produce():
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], acks="all", retries=10)
    
    for date, station, degrees, raining in all_stations(15):
        key= bytes(station, "utf-8")
        r = Report(date=date, station=station, degrees=degrees, raining=raining)
        value = r.SerializeToString()
        producer.send("stations", value=value, key=key)
        
        # JSON
        value = {"date": date, "station": station, "degrees": degrees, "raining":int(raining)}
        value = bytes(json.dumps(value), "utf-8")
        producer.send("stations-json", value=value, key=key)
        
        time.sleep(1)
        
threading.Thread(target=produce).start()

# Part 2: Kafka Consumer

In [5]:
for partition in range(6):
    path = f"partition-{partition}.json"
    if os.path.exists(path):
        os.remove(path)

In [6]:
def load_partition(partition_num):
    path = f"partition-{partition_num}.json"
    if os.path.exists(path):
        with open(path, "r") as file:
            return json.load(file)
    else:
        return {"offset": 0, "partition": partition_num}

def save_partition(partition):
    path = f"partition-{partition['partition']}.json"
    with open(path, "w") as file:
        json.dump(partition, file)

In [7]:
def consume(part_nums=[], iterations=10):
    consumer = KafkaConsumer(group_id="g1", bootstrap_servers=["kafka:9092"])
    tps = [TopicPartition("stations", part_num) for part_num in part_nums]
    consumer.assign(tps)
    consumer.seek_to_beginning()
    
    # PART 1: initialization
    partitions = {} # key=partition num, value=snapshot dict
    for i in range(len(part_nums)): 
        part_num = part_nums[i]
        mydict = load_partition(part_num)
        partitions[part_num] = mydict
        consumer.seek(tps[i], mydict["offset"])

    # PART 2: process batches
    for i in range(iterations):
        batch = consumer.poll(1000) # 1s timeout
        for topic, messages in batch.items():
            for msg in messages:
                s = Report.FromString(msg.value)
                # mydict = partitions[topic.partition]
                num = part_nums[ord(s.station)%2]
                mydict = partitions[num]
                if s.station not in mydict:
                    stadict = {"avg": 0, "count": 0, "end": s.date, "start": s.date, "sum": 0}
                else:
                    stadict = mydict[s.station]
                    if s.date <= stadict["end"]:
                        continue
                stadict["end"] = s.date
                stadict["count"] += 1
                stadict["sum"] += s.degrees
                stadict["avg"] = stadict["sum"]/stadict["count"]
                mydict[s.station] = stadict
                mydict["offset"] = consumer.position(tps[ord(s.station)%2])
                save_partition(mydict)
    print("exiting")

for i in range(2):
    print("ROUND", i)
    t1 = threading.Thread(target=consume, args=([0,1], 30))
    t2 = threading.Thread(target=consume, args=([2,3], 30))
    t3 = threading.Thread(target=consume, args=([4,5], 30))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()

ROUND 0
exiting
exiting
exiting
ROUND 1
exiting
exiting
exiting


In [8]:
!cat partition*.json

{"offset": 3, "partition": 0, "N": {"avg": 53.91339665538313, "count": 3, "end": "2000-01-03", "start": "2000-01-01", "sum": 161.7401899661494}}{"offset": 7, "partition": 1, "E": {"avg": 36.95468450176835, "count": 4, "end": "2000-01-04", "start": "2000-01-01", "sum": 147.8187380070734}, "O": {"avg": 34.333848287725004, "count": 3, "end": "2000-01-03", "start": "2000-01-01", "sum": 103.00154486317501}}{"offset": 12, "partition": 2, "D": {"avg": 15.646386582870454, "count": 4, "end": "2000-01-04", "start": "2000-01-01", "sum": 62.585546331481815}, "F": {"avg": 14.010186869221542, "count": 4, "end": "2000-01-04", "start": "2000-01-01", "sum": 56.04074747688617}, "J": {"avg": 21.305592117879247, "count": 4, "end": "2000-01-04", "start": "2000-01-01", "sum": 85.22236847151699}}{"offset": 12, "partition": 3, "G": {"avg": 16.95708444884187, "count": 4, "end": "2000-01-04", "start": "2000-01-01", "sum": 67.82833779536747}, "I": {"avg": 29.13131648095225, "count": 4, "end": "2000-01-04", "star