# Produce NY taxi trips to Kafka

## Setup

In [16]:
import os
from confluent_kafka import SerializingProducer, DeserializingConsumer
from confluent_kafka.serialization import StringSerializer, StringDeserializer
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4
import sys, random
import csv, json
import time
from datetime import datetime 

In [17]:
# Kafka broker address(es), read from the environment and used below as the
# 'bootstrap.servers' setting. Fail immediately if the variable is missing.
BOOTSTRAP_SERVERS = os.environ.get('BOOTSTRAP_SERVERS')
assert BOOTSTRAP_SERVERS is not None, 'BOOTSTRAP_SERVERS must be set'


# Input CSV files; the relative paths are resolved against the current
# working directory.
TRIP_CSV = "trip_data_12.csv"
FARE_CSV = "trip_fare_12.csv"

# Fail immediately if either input file is absent.
assert os.path.exists(TRIP_CSV), f'{TRIP_CSV} file not found'
assert os.path.exists(FARE_CSV), f'{FARE_CSV} file not found'

# Destination topics for the trip and fare records respectively.
TRIP_BRONZE_TOPIC = 'trips_bronze'
FARE_BRONZE_TOPIC = 'fares_bronze'

# Settings handed to SerializingProducer; message keys and values are
# serialized as UTF-8 strings.
PRODUCER_CONFIG = {
    'bootstrap.servers': BOOTSTRAP_SERVERS,
    'partitioner': 'murmur2_random',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer':  StringSerializer('utf_8')
}

## Utility functions

In [33]:
def get_topics():
    """
    Return the `topics` attribute of the metadata that
    AdminClient.list_topics() reports for the broker(s) at BOOTSTRAP_SERVERS.
    """
    client = AdminClient({"bootstrap.servers": BOOTSTRAP_SERVERS})
    cluster_metadata = client.list_topics()
    return cluster_metadata.topics


def delivery_report(err, msg):
    """
    Delivery callback for produced messages.
    - err: error object, falsy when the message was delivered
    - msg: the message the report refers to (not used here)

    Prints a line only for failed deliveries; successes are silent.
    """
    if not err:
        return
    print("Message delivery failed: {}".format(err))


def get_timestamp(before_secs=0, marker=None):
    """
    Return a UTC timestamp string formatted as "%Y-%m-%dT%H:%M:%SZ".
    - before_secs: number, seconds to subtract from the reference time
    - marker: number or None, reference time as seconds since the epoch;
      None means "now" (time.time())

    The trailing "Z" designates UTC, so the epoch value is converted with
    time.gmtime() rather than a local-time conversion; otherwise the string
    would be off by the host's UTC offset on any machine not running in UTC.
    """
    reference = time.time() if marker is None else marker
    utc_struct = time.gmtime(reference - before_secs)
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_struct)


def convert_to_trip(row, include_timestamp=False):
    """
    Build a trip dict from one positional CSV row.
    - row: sequence, read by index for positions 0..13
    - include_timestamp: bool, when true add a "timestamp" key holding
      get_timestamp()

    Values are copied from the row unchanged; a row with fewer than 14
    entries raises IndexError.
    """
    columns = (
        "medallion",
        "hack_license",
        "vendor_id",
        "rate_code",
        "store_and_fwd_flag",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_time_in_secs",
        "trip_distance",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
    )
    # Index explicitly (rather than zip) so short rows still raise IndexError.
    trip = {name: row[position] for position, name in enumerate(columns)}

    if include_timestamp:
        trip["timestamp"] = get_timestamp()

    return trip


def convert_to_fare(row, include_timestamp=False):
    """
    Build a fare dict from one positional CSV row.
    - row: sequence, read by index for positions 0..10
    - include_timestamp: bool, when true add a "timestamp" key holding
      get_timestamp()

    Values are copied from the row unchanged; a row with fewer than 11
    entries raises IndexError.
    """
    fare = {
        "medallion": row[0],
        "hack_license": row[1],
        "vendor_id": row[2],
        "pickup_datetime": row[3],
        "payment_type": row[4],
        "fare_amount": row[5],
        "surcharge": row[6],
        "mta_tax": row[7],
        "tip_amount": row[8],
        "tolls_amount": row[9],
        "total_amount": row[10],
    }

    # Only compute the timestamp when it is requested, and share the helper
    # convert_to_trip uses so both record types are stamped the same way.
    if include_timestamp:
        fare["timestamp"] = get_timestamp()

    return fare

## Producer

In [19]:
def num_rows(csv_path):
    """
    Count the records csv.reader yields for the file at csv_path.
    - csv_path: path of the CSV file to read

    Every record is counted, including a header line if the file has one.
    """
    total = 0
    with open(csv_path) as handle:
        for _record in csv.reader(handle):
            total += 1
    return total

In [37]:
# Record counts of both CSV files (every record num_rows sees, header included).
num_rows(TRIP_CSV), num_rows(FARE_CSV)

(13971119, 13971119)

In [41]:
def produce_nyc_taxi_data(start_row=0, num_messages=2000_000):
    """
    Produces messages to trip and fare topics

    TRIP_CSV and FARE_CSV are read in lockstep (data row i of one file is
    paired with data row i of the other) after skipping one header line in
    each. Every pair yields one JSON message on TRIP_BRONZE_TOPIC and one on
    FARE_BRONZE_TOPIC, both carrying the same synthetic "timestamp".

    - start_row: int, zero-based index of the first data row to produce;
      earlier rows are read and discarded
    - num_messages: int, maximum number of row pairs to produce (one message
      per topic for each pair)

    Reading stops early when either file runs out of rows. The count printed
    at the end is the number of row pairs actually produced.
    """
    producer = SerializingProducer(PRODUCER_CONFIG)
    limit = start_row + num_messages
    marker_timestamp = time.time()
    produced = 0

    # newline="" is what the csv module expects so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(TRIP_CSV, newline="") as f_trip, open(FARE_CSV, newline="") as f_fare:
        trip_reader = csv.reader(f_trip)
        fare_reader = csv.reader(f_fare)

        # skip headers (the default keeps an empty file from raising)
        next(trip_reader, None)
        next(fare_reader, None)

        paired_rows = zip(trip_reader, fare_reader)
        for row_index, (trip_row, fare_row) in enumerate(paired_rows):
            if row_index >= limit:
                break
            if row_index < start_row:
                # Honour start_row by discarding the rows that precede it.
                continue

            # Back-date each pair by 7 seconds per remaining row so the
            # timestamps increase row by row and end just before
            # marker_timestamp.
            before_secs = max(1, (limit - row_index) * 7)
            timestamp = get_timestamp(before_secs, marker_timestamp)

            trip = convert_to_trip(trip_row)
            trip['timestamp'] = timestamp

            fare = convert_to_fare(fare_row)
            fare['timestamp'] = timestamp

            producer.produce(TRIP_BRONZE_TOPIC, value=json.dumps(trip), on_delivery=delivery_report)
            producer.produce(FARE_BRONZE_TOPIC, value=json.dumps(fare), on_delivery=delivery_report)
            # Serve delivery callbacks without blocking.
            producer.poll(0)

            produced += 1
            # Drain the local queue periodically so it cannot grow unbounded.
            if produced % 1000 == 0:
                producer.flush()

    producer.flush()
    print(f"Produced {produced} messages")

In [None]:
# Run the producer with its default arguments (start_row=0, num_messages=2000_000).
produce_nyc_taxi_data()

: 

### Cleanup

In [None]:
# admin_client = AdminClient({"bootstrap.servers":BOOTSTRAP_SERVERS})
# admin_client.delete_topics(topics=[""])