In [7]:
import os
import time

import pandas as pd
from sqlalchemy import create_engine

In [8]:
# Load only the first 100 rows of the CSV as a small preview sample
df_preview = pd.read_csv(
    '~/downloaded-data/yellow_tripdata_2021-01.csv',
    nrows=100,
)

In [9]:
# Parse the pickup and dropoff timestamp columns into pandas datetime values
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df_preview[ts_col] = pd.to_datetime(df_preview[ts_col])

In [10]:
# Create the SQLAlchemy engine for the Postgres database run by Docker.
# The connection URL can be overridden through the NY_TAXI_DB_URL environment
# variable, so real credentials never have to be written into the notebook;
# when the variable is unset, the original local development URL is used.
engine = create_engine(
    os.environ.get('NY_TAXI_DB_URL', 'postgresql://root:root@localhost/ny_taxi')
)

In [11]:
# Build the CREATE TABLE statement pandas generates for the preview frame
# against this engine, then print it for inspection
yellow_taxi_ddl = pd.io.sql.get_schema(df_preview, name='yellow_taxi_data', con=engine)
print(yellow_taxi_ddl)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [12]:
# Create (or replace) the Postgres table from a zero-row slice of the preview
# frame, so only the column definitions are written and no data rows
df_preview.iloc[:0].to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [13]:
# Read the CSV lazily in chunks of 100,000 rows so that each insert into the
# database stays small instead of loading and writing the whole file at once.
# store_and_fwd_flag is pinned to a text dtype so pandas does not have to infer
# its type separately for each part of the file.
# NOTE(review): the warning captured in the original run, which echoes
# `for df in df_iter:`, looks like pandas' mixed-type DtypeWarning for this
# column — confirm it no longer appears after this change.
df_iter = pd.read_csv(
    '~/downloaded-data/yellow_tripdata_2021-01.csv',
    iterator=True,
    chunksize=100_000,
    dtype={'store_and_fwd_flag': str},
)

In [14]:
# Append every chunk from df_iter to the yellow_taxi_data table and report how
# long each chunk took. df_iter is consumed by this loop: to run the cell
# again, recreate df_iter first, otherwise the loop body never executes.
for df in df_iter:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the system clock and can jump.
    t_start = time.perf_counter()
    # Convert the timestamp columns so they are stored as datetimes
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df.to_sql('yellow_taxi_data', con=engine, if_exists='append')
    t_end = time.perf_counter()
    print(f'Inserted chunk. Took {t_end - t_start:.2f} sec.')
print('Finished insertion')

Inserted chunk. Took 6.37 sec.
Inserted chunk. Took 6.35 sec.
Inserted chunk. Took 6.37 sec.
Inserted chunk. Took 6.39 sec.
Inserted chunk. Took 6.35 sec.
Inserted chunk. Took 6.34 sec.
Inserted chunk. Took 6.33 sec.
Inserted chunk. Took 6.32 sec.
Inserted chunk. Took 6.31 sec.
Inserted chunk. Took 6.39 sec.
Inserted chunk. Took 6.39 sec.
Inserted chunk. Took 6.40 sec.


  for df in df_iter:


Inserted chunk. Took 6.37 sec.
Inserted chunk. Took 3.88 sec.
Finished insertion
