In [1]:
import json
import time
import pandas as pd
from confluent_kafka import SerializingProducer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry.json_schema import JSONSerializer

In [2]:
# JSON Schema (draft-07) for a single trip event, kept as a string so it can be
# handed to the JSON serializer. Every property also accepts null; only the pickup
# and dropoff timestamps are required, and both are typed as strings (not datetimes).
trip_schema = """
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "UberTripEvent",
  "type": "object",
  "properties": {
    "VendorID": {"type": ["integer", "null"]},
    "tpep_pickup_datetime": {"type": ["string", "null"]},
    "tpep_dropoff_datetime": {"type": ["string", "null"]},
    "passenger_count": {"type": ["integer", "null"]},
    "trip_distance": {"type": ["number", "null"]},
    "RatecodeID": {"type": ["integer", "null"]},
    "store_and_fwd_flag": {"type": ["string", "null"]},
    "PULocationID": {"type": ["integer", "null"]},
    "DOLocationID": {"type": ["integer", "null"]},
    "payment_type": {"type": ["integer", "null"]},
    "fare_amount": {"type": ["number", "null"]},
    "extra": {"type": ["number", "null"]},
    "mta_tax": {"type": ["number", "null"]},
    "tip_amount": {"type": ["number", "null"]},
    "tolls_amount": {"type": ["number", "null"]},
    "improvement_surcharge": {"type": ["number", "null"]},
    "total_amount": {"type": ["number", "null"]},
    "congestion_surcharge": {"type": ["number", "null"]},
    "Airport_fee": {"type": ["number", "null"]},
    "cbd_congestion_fee": {"type": ["number", "null"]}
  },
  "required": ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
}
"""

In [3]:

# Endpoints of the Kafka broker and the Schema Registry (both on localhost).
bootstrap_servers = 'localhost:9092'
schema_registry_url = 'http://localhost:8081'

# Registry client, and a JSON serializer built from trip_schema plus that client.
schema_registry_conf = dict(url=schema_registry_url)
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
json_serializer = JSONSerializer(trip_schema, schema_registry_client)

# Producer settings: keys go through a UTF-8 string serializer,
# values go through the JSON serializer created above.
producer_conf = {'bootstrap.servers': bootstrap_servers}
producer_conf['key.serializer'] = StringSerializer('utf-8')
producer_conf['value.serializer'] = json_serializer

producer = SerializingProducer(producer_conf)

In [5]:
# Read the trip parquet file into a DataFrame, then preview its first rows.
trip_data_path = r'D:\Just Data\Uber Real-Time Analytics Pipeline\yellow_tripdata_2025-01.parquet'
df = pd.read_parquet(trip_data_path)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0


In [6]:
# Turn the first row into a plain dict to inspect what a single record looks like.
first_row = df.iloc[0]
record = first_row.to_dict()
record

{'VendorID': 1,
 'tpep_pickup_datetime': Timestamp('2025-01-01 00:18:38'),
 'tpep_dropoff_datetime': Timestamp('2025-01-01 00:26:59'),
 'passenger_count': 1.0,
 'trip_distance': 1.6,
 'RatecodeID': 1.0,
 'store_and_fwd_flag': 'N',
 'PULocationID': 229,
 'DOLocationID': 237,
 'payment_type': 1,
 'fare_amount': 10.0,
 'extra': 3.5,
 'mta_tax': 0.5,
 'tip_amount': 3.0,
 'tolls_amount': 0.0,
 'improvement_surcharge': 1.0,
 'total_amount': 18.0,
 'congestion_surcharge': 2.5,
 'Airport_fee': 0.0,
 'cbd_congestion_fee': 0.0}

In [None]:
topic = 'uber_trips'


def delivery_report(err, msg):
    """Print the outcome of one produce request.

    Args:
        err: Error reported for the delivery, or None when it succeeded.
        msg: Message object; its key(), topic() and partition() accessors
            are used to build the printed line.
    """
    # Success path first; anything with a non-None error falls through.
    if err is None:
        print(f"Record {msg.key()} successfully produced to {msg.topic()} [{msg.partition()}]")
        return
    print(f"Delivery failed for record {msg.key()}: {err}")

def _to_json_safe(value):
    """Convert one DataFrame cell into a value the trip schema and JSON encoding accept.

    Args:
        value: A single cell taken from a row dict.

    Returns:
        None for missing values (NaN/NaT/None), an ISO-8601 string for
        pandas Timestamps, and the value unchanged otherwise.
    """
    # Missing values must be sent as null: NaN is not valid JSON and NaT is not a string.
    if pd.isna(value):
        return None
    # The schema types the datetime fields as strings, so Timestamps are rendered as text.
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    return value


# Stream the DataFrame one row at a time, keyed by the row's index label.
for i, row in df.iterrows():  # was `iterows`, which raises AttributeError
    record = {field: _to_json_safe(value) for field, value in row.to_dict().items()}

    producer.produce(topic=topic, key=str(i), value=record, on_delivery=delivery_report)
    # Serve pending delivery callbacks without blocking.
    producer.poll(0)

    # Pace the stream at roughly one event per second.
    time.sleep(1)

    # Periodically block until everything queued so far has been delivered.
    if i % 10 == 0:
        producer.flush()

# Deliver whatever is still queued after the last row.
producer.flush()