In [0]:
from kafka import KafkaProducer, KafkaConsumer
from kafka.errors import KafkaError
import json

def _encode_json(value):
    """Serialize a message payload to UTF-8 encoded JSON bytes."""
    return json.dumps(value).encode('utf-8')


# Producer connected to the local broker; every value handed to send()
# is passed through _encode_json before it goes on the wire.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=_encode_json,
)

# Path of the JSON file holding the events to publish.
# Presumably the file contains a JSON array with one object per event — verify
# against the data generator.
json_file_path = "../data/random_events.json"

try:
    # Explicit encoding so reading does not depend on the platform default.
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    for message in data:
        try:
            # send() itself can raise a KafkaError (e.g. a metadata timeout), so
            # it sits inside the try together with the blocking get().
            future = producer.send('test-topic', message)

            # Block until the broker acknowledges the record (or 10 s elapse)
            # so each message's outcome is reported individually.
            record_metadata = future.get(timeout=10)
            print(f'Message sent: {record_metadata.topic}, partition: {record_metadata.partition}, offset: {record_metadata.offset}')
        except KafkaError as e:
            # Report the failure and carry on with the remaining messages.
            print(f'Error sending message: {e}')
finally:
    # Close the producer even when loading the file or sending fails with an
    # unexpected exception, so it is not leaked.
    producer.close()

In [0]:
import pandas as pd
from collections import defaultdict

def _decode_utf8(raw):
    """Decode a raw message value as UTF-8; falsy payloads (None, b'') become None."""
    if not raw:
        return None
    return raw.decode('utf-8')


# NOTE(review): no group_id is passed. Per the kafka-python documentation,
# offset commits are disabled when group_id is None, so enable_auto_commit may
# have no effect here — confirm whether a group_id is intended.
consumer = KafkaConsumer(
    'test-topic',  # topic to consume messages from
    bootstrap_servers=['localhost:9092'],  # list of Kafka brokers to connect to
    auto_offset_reset='earliest',  # start from the oldest message when no offset is stored
    enable_auto_commit=True,  # ask the client to commit consumed offsets automatically
    value_deserializer=_decode_utf8,  # turn message bytes into a UTF-8 string (or None)
)

# Running engagement counters keyed by (post_id, post_timestamp, platform).
aggregated_data = defaultdict(lambda: {"likes": 0, "comments": 0, "shares": 0})
batch_size = 50   # processed messages between two CSV snapshots
batch_count = 0   # messages processed since the last snapshot
csv_file = "social_media_engagement_bronze.csv"

# Maps an incoming event_type to the counter it increments; any other
# event_type leaves the counters untouched.
_EVENT_COUNTERS = {"like": "likes", "comment": "comments", "share": "shares"}


def _write_snapshot(aggregates, path):
    """Write every aggregate row to ``path`` as CSV and return the DataFrame.

    The file is overwritten on each call with one row per
    (post_id, post_timestamp, platform) key currently held in ``aggregates``.
    """
    frame = pd.DataFrame([
        {"post_id": post_id, "post_timestamp": timestamp, "platform": platform, **counts}
        for (post_id, timestamp, platform), counts in aggregates.items()
    ])
    frame.to_csv(path, index=False)
    print(f"Updated CSV saved: {path}")
    return frame


try:
    # No consumer timeout is configured in this block, so the loop is expected
    # to run until the kernel is interrupted.
    for message in consumer:
        raw_value = message.value

        if raw_value is None:
            print("Received an empty message. Skipping...")
            continue

        try:
            print(f"Raw message: {raw_value}")

            # Values normally arrive as strings; anything else is used as-is.
            event = json.loads(raw_value) if isinstance(raw_value, str) else raw_value

            post_id = event.get("post_id")
            platform = event.get("platform")
            post_timestamp = event.get("timestamp")
            event_type = event.get("event_type")

            # Aggregate event counts
            key = (post_id, post_timestamp, platform)

            # Only string event types can match; the isinstance guard keeps
            # unhashable values from raising in the dict lookup.
            counter = _EVENT_COUNTERS.get(event_type) if isinstance(event_type, str) else None
            if counter is not None:
                aggregated_data[key][counter] += 1

            batch_count += 1

            # Save to CSV in batches
            if batch_count >= batch_size:
                df = _write_snapshot(aggregated_data, csv_file)

                # Reset batch counter only after a successful write
                batch_count = 0

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}, Raw message: {raw_value}")

        except Exception as e:
            # Best-effort stream processing: report the bad message and keep consuming.
            print(f"Unexpected error: {e}, Raw message: {raw_value}")

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop this cell.
    print("Consumer interrupted; writing any pending aggregates before exit.")

finally:
    try:
        # Flush events received since the last snapshot so a partial batch is not lost.
        if batch_count > 0:
            df = _write_snapshot(aggregated_data, csv_file)
            batch_count = 0
    finally:
        # Always close the consumer, even if the final write fails.
        consumer.close()