In [1]:
from pathlib import Path
from datetime import datetime
from collections import namedtuple
import pandas as pd
import pytz

In [2]:
# Timezone objects shared by the datetime conversion helpers in this notebook.
UTC = pytz.utc
# NOTE(review): despite the name, this is the 'US/Eastern' zone object rather
# than a fixed EST offset.
EST = pytz.timezone('US/Eastern')

## Read in Yellow Taxi Trip Records

Reference (TLC data dictionary for yellow taxi trip records): http://www.nyc.gov/html/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [3]:
# Names assigned positionally to the first 19 columns of the yellow-taxi CSV
# by read_yellow_trip(); the order here must match the column order in the file.
YELLOW_COLS = [
    'vendor_id', 'pickup_datetime', 'dropoff_datetime',
    'passenger_count', 'trip_distance',
    'pickup_lon', 'pickup_lat',
    'rate_code_id', 'store_and_fwd_flag',
    'dropoff_lon', 'dropoff_lat',
    'payment_type', 'fare_amount',
    'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount'
]


def read_yellow_trip(csv_pth, chunksize=100000):
    """Lazily yield a yellow-taxi trip CSV as successive DataFrame chunks.

    The file's own header row is consumed and replaced (``header=0`` combined
    with ``names``): the first 19 columns are relabelled with ``YELLOW_COLS``.
    Datetime columns are not parsed (``parse_dates=False``); callers receive
    them exactly as pandas infers them from the raw text.

    Parameters
    ----------
    csv_pth
        Path (or any other input accepted by ``pandas.read_csv``) of the CSV.
    chunksize : int, default 100000
        Maximum number of rows in each yielded DataFrame.

    Yields
    ------
    pandas.DataFrame
        One chunk of at most ``chunksize`` rows with columns ``YELLOW_COLS``.

    Notes
    -----
    This is a generator, so the file is not opened until the first chunk is
    requested.
    """
    df_reader = pd.read_csv(
        csv_pth,
        header=0, index_col=False, usecols=range(19),
        chunksize=chunksize, parse_dates=False,
        names=YELLOW_COLS
    )
    # ``yield from`` (rather than a manual re-yield loop) also forwards
    # ``close()`` to the reader, so a consumer that stops early releases the
    # reader deterministically instead of waiting for garbage collection.
    yield from df_reader

In [4]:
def make_nyc_dt(dt_str):
    """Convert a naive ``'YYYY-MM-DD HH:MM:SS'`` string to an aware UTC datetime.

    The string is parsed as a naive datetime, attached to the module-level
    ``EST`` zone (``pytz.timezone('US/Eastern')``) and then converted to the
    module-level ``UTC`` zone.

    Raises whatever ``datetime.strptime`` raises when ``dt_str`` does not match
    the expected format.
    """
    naive_local = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S')
    # NOTE(review): localize() is called without ``is_dst``, so pytz's default
    # handling of ambiguous / non-existent DST wall-clock times applies.
    aware_local = EST.localize(naive_local)
    return aware_local.astimezone(UTC)

In [5]:
import psycopg2
from pgcopy import CopyManager, Replace
from io import BytesIO, StringIO

In [6]:
# Open the PostgreSQL connection used by the bulk-load cell below; only the
# database name is supplied here.
conn = psycopg2.connect(database='liang-bo.wang_project1')

In [None]:
%%time
# Bulk-copy helper targeting the 'taxi_trips' table: a leading 'taxi_type'
# column followed by every yellow-taxi column.
mgr = CopyManager(conn, 'taxi_trips', ['taxi_type', *YELLOW_COLS])

# Stream the monthly CSV chunk by chunk so only one chunk of records is held
# in memory at a time.
for df in read_yellow_trip('../raw_trip_data/yellow_tripdata_2016-06.csv'):
    records = []
    for raw_trip in df.itertuples(index=False, name='YellowTaxiTrip'):
        # Timestamps go through make_nyc_dt(); the flag is sent as bytes.
        pickup_dt = make_nyc_dt(raw_trip.pickup_datetime)
        dropoff_dt = make_nyc_dt(raw_trip.dropoff_datetime)
        flag_bytes = raw_trip.store_and_fwd_flag.encode()
        converted_trip = raw_trip._replace(
            pickup_datetime=pickup_dt,
            dropoff_datetime=dropoff_dt,
            store_and_fwd_flag=flag_bytes,
        )
        # Prepend the taxi_type value expected as the first copied column.
        records.append((b'YELLOW',) + tuple(converted_trip))
    # Each chunk is copied inside its own ``with conn`` block.
    with conn:
        mgr.copy(records, BytesIO)

In [None]:
# Close the database connection opened earlier in the notebook.
conn.close()