# Produce NY taxi trips to Kafka

## Setup

In [1]:
import os
from confluent_kafka import SerializingProducer, DeserializingConsumer
from confluent_kafka.serialization import StringSerializer, StringDeserializer
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4
import sys, random
import csv, json
import time
from datetime import datetime 

In [36]:
# The Kafka bootstrap servers are read from the environment so that no broker
# address is hard-coded in the notebook.
BOOTSTRAP_SERVERS = os.environ.get('BOOTSTRAP_SERVERS')
assert BOOTSTRAP_SERVERS is not None, 'BOOTSTRAP_SERVERS must be set'


# Input CSV files; the paths are relative to the current working directory.
TRIP_CSV = "trip_data_12.csv"
FARE_CSV = "trip_fare_12.csv"

# Fail early if either input file is missing.
assert os.path.exists(TRIP_CSV), f'{TRIP_CSV} file not found'
assert os.path.exists(FARE_CSV), f'{FARE_CSV} file not found'

# Destination topics for the trip and fare records.
TRIP_BRONZE_TOPIC = 'trips_bronze'
FARE_BRONZE_TOPIC = 'fares_bronze'

# Configuration handed to SerializingProducer; message keys and values are
# serialized as UTF-8 strings.
PRODUCER_CONFIG = {
    'bootstrap.servers': BOOTSTRAP_SERVERS,
    'partitioner': 'murmur2_random',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer':  StringSerializer('utf_8')
}

## Utility functions

In [1]:
def get_topics():
    """Return the ``topics`` attribute of the metadata listed by an AdminClient.

    The client is created on each call and connects to ``BOOTSTRAP_SERVERS``.
    """
    client = AdminClient({'bootstrap.servers': BOOTSTRAP_SERVERS})
    cluster_metadata = client.list_topics()
    return cluster_metadata.topics

def delivery_report(err, msg):
    """Delivery callback: print failed deliveries, stay silent on success.

    Args:
        err: Delivery error; any falsy value means the delivery succeeded.
        msg: The message the report refers to (unused).
    """
    if not err:
        return
    print('Message delivery failed: {}'.format(err))

def get_timestamp():
    """Return the current time as an ISO-8601 UTC string, e.g. ``2013-12-01T08:15:30Z``.

    The trailing ``Z`` designates UTC, so the time is taken from
    ``time.gmtime()`` instead of the machine's local timezone; formatting local
    time with a ``Z`` suffix would mislabel it on any host not running in UTC.
    """
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    

def convert_to_trip(row, include_timestamp=False):
    """Turn a positional trip row into a dict keyed by column name.

    Args:
        row: Indexable sequence holding at least 14 values, in the column order
            listed below. Shorter rows raise ``IndexError``.
        include_timestamp: When true, a ``timestamp`` entry produced by
            ``get_timestamp()`` is added.

    Returns:
        A dict with one entry per trip column (values are taken from ``row``
        unchanged), plus ``timestamp`` when requested.
    """
    columns = (
        "medallion",
        "hack_license",
        "vendor_id",
        "rate_code",
        "store_and_fwd_flag",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_time_in_secs",
        "trip_distance",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
    )
    # Index by position (rather than zip) so that a short row still fails loudly.
    record = {name: row[position] for position, name in enumerate(columns)}

    if include_timestamp:
        record['timestamp'] = get_timestamp()

    return record

def convert_to_fare(row, include_timestamp=False):
    """Turn a positional fare row into a dict keyed by column name.

    Args:
        row: Indexable sequence holding at least 11 values, in the column order
            used below. Shorter rows raise ``IndexError``.
        include_timestamp: When true, a ``timestamp`` entry produced by
            ``get_timestamp()`` is added.

    Returns:
        A dict with one entry per fare column (values are taken from ``row``
        unchanged), plus ``timestamp`` when requested.
    """
    fare = {
        "medallion": row[0],
        "hack_license": row[1],
        "vendor_id": row[2],
        "pickup_datetime": row[3],
        "payment_type": row[4],
        "fare_amount": row[5],
        "surcharge": row[6],
        "mta_tax": row[7],
        "tip_amount": row[8],
        "tolls_amount": row[9],
        "total_amount": row[10],
    }

    # Use the shared helper (as convert_to_trip does) and only when a timestamp
    # is actually requested, instead of formatting one on every call.
    if include_timestamp:
        fare['timestamp'] = get_timestamp()

    return fare

## Producer

In [33]:
def num_rows(csv_path):
    """Count the rows of the CSV file at ``csv_path``.

    Every row the csv reader yields is counted, so a header row is included in
    the total. The file is streamed; rows are not kept in memory.
    """
    total = 0
    with open(csv_path) as handle:
        for _ in csv.reader(handle):
            total += 1
    return total

In [37]:
# Row counts of both files; each count includes the header row.
num_rows(TRIP_CSV), num_rows(FARE_CSV)

(13971119, 13971119)

In [4]:
def produce_nyc_taxi_data():
    """Stream the trip and fare CSV files to their bronze Kafka topics in lockstep.

    Row *i* of ``TRIP_CSV`` and row *i* of ``FARE_CSV`` are read together, given
    the same ``timestamp`` value and produced as JSON strings to
    ``TRIP_BRONZE_TOPIC`` and ``FARE_BRONZE_TOPIC`` respectively. Streaming
    stops as soon as either file runs out of rows.

    The producer is flushed after every 1,000 row pairs. After every 1,000,000
    row pairs progress is printed and the function sleeps for 60 seconds.
    Note that the counter (and therefore the printed figure) counts row pairs:
    each unit corresponds to two Kafka messages, one per topic.
    """
    producer = SerializingProducer(PRODUCER_CONFIG)
    counter = 0

    # newline='' is what the csv module asks for when it is given a file object.
    with open(TRIP_CSV, newline='') as f_trip, open(FARE_CSV, newline='') as f_fare:
        trip_reader = csv.reader(f_trip)
        fare_reader = csv.reader(f_fare)

        # Skip headers; the default keeps an empty file from raising StopIteration.
        next(trip_reader, None)
        next(fare_reader, None)

        for trip_row, fare_row in zip(trip_reader, fare_reader):
            # One timestamp per pair so both records carry the same value.
            timestamp = get_timestamp()
            trip = convert_to_trip(trip_row)
            trip['timestamp'] = timestamp

            fare = convert_to_fare(fare_row)
            fare['timestamp'] = timestamp

            producer.produce(TRIP_BRONZE_TOPIC, value=json.dumps(trip), on_delivery=delivery_report)
            producer.produce(FARE_BRONZE_TOPIC, value=json.dumps(fare), on_delivery=delivery_report)
            # Serve delivery callbacks without blocking.
            producer.poll(0)

            # Count first, then check: checking before incrementing made the
            # very first pair trigger a flush, a "Produced 0 messages" line and
            # a 60 second sleep.
            counter += 1

            if counter % 1000 == 0:
                producer.flush()

            if counter % 1000_000 == 0:
                print(f"Produced {counter} messages")
                time.sleep(60)

    producer.flush()
    print(f"Produced {counter} messages")

In [None]:
# Streams both CSV files to Kafka; returns once either file has been read to the end.
produce_nyc_taxi_data()

In [32]:
def produce_trips(trip_csv,producer_config, topic,limit=-1, header=True, delay=0.5):
    """Produce the rows of a trip CSV file to a Kafka topic as JSON messages.

    Args:
        trip_csv: Path of the trip CSV file.
        producer_config: Configuration dict handed to ``SerializingProducer``.
        topic: Name of the destination topic.
        limit: Maximum number of messages to produce. A negative value (the
            default) never matches the running count, i.e. no limit.
        header: When true, the first row is treated as a header and skipped.
        delay: Seconds to sleep after each message. Defaults to 0.5, the pause
            that used to be hard-coded; pass 0 to produce without pausing.

    A full local producer queue (``BufferError``) does not abort the run: the
    condition is reported on stderr, delivery callbacks are served for up to a
    second to free space, and the same message is retried. The producer is
    flushed once the loop has finished.
    """
    n = 0
    p = SerializingProducer(producer_config)
    # newline='' is what the csv module asks for when it is given a file object.
    with open(trip_csv, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        if header:
            # The default keeps an empty file from raising StopIteration.
            next(csv_reader, None)
        for row in csv_reader:
            if n == limit:
                break
            payload = json.dumps(convert_to_trip(row))
            p.poll(0)
            while True:
                try:
                    p.produce(topic, value=payload, on_delivery=delivery_report)
                    break
                except BufferError:
                    sys.stderr.write('%% Local producer queue is full (%d messages awaiting delivery): try again\n' % len(p))
                    # Give queued messages time to be delivered before retrying.
                    p.poll(1)
            if delay > 0:
                time.sleep(delay)
            n = n + 1

    p.flush()
    

In [9]:
# limit=-1 never equals the running message count, so the whole file is produced.
produce_trips(TRIP_CSV,PRODUCER_CONFIG,TRIP_BRONZE_TOPIC,limit=-1)

Produced 0 messages
Produced 100 messages
Produced 200 messages
Produced 300 messages
Produced 400 messages
Produced 500 messages
Produced 600 messages
Produced 700 messages
Produced 800 messages
Produced 900 messages
Produced 1000 messages
Produced 1100 messages
Produced 1200 messages
Produced 1300 messages
Produced 1400 messages
Produced 1500 messages
Produced 1600 messages
Produced 1700 messages
Produced 1800 messages
Produced 1900 messages
Produced 2000 messages
Produced 2100 messages
Produced 2200 messages
Produced 2300 messages
Produced 2400 messages
Produced 2500 messages
Produced 2600 messages
Produced 2700 messages
Produced 2800 messages
Produced 2900 messages
Produced 3000 messages
Produced 3100 messages
Produced 3200 messages
Produced 3300 messages
Produced 3400 messages
Produced 3500 messages
Produced 3600 messages
Produced 3700 messages
Produced 3800 messages
Produced 3900 messages
Produced 4000 messages
Produced 4100 messages
Produced 4200 messages
Produced 4300 messages


### Cleanup

In [None]:
# admin_client = AdminClient({"bootstrap.servers":BOOTSTRAP_SERVERS})
# admin_client.delete_topics(topics=[TRIP_BRONZE_TOPIC, FARE_BRONZE_TOPIC])