# Check Pandas Installation

In [2]:
import pandas as pd

# The import binds the module to the alias `pd` only; the bare name `pandas`
# is never defined here, so the version has to be read through the alias.
print(pd.__version__)

2.2.1


# Load the NYC Taxi Data

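Source: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz (the cells below read the decompressed file, `green_tripdata_2019-10.csv`, from the working directory)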

Load only the first few rows (say, 100) to preview the data.

In [3]:
# Read just the first 100 rows: enough to inspect the columns without
# loading the whole file.
df = pd.read_csv(
    'green_tripdata_2019-10.csv',
    nrows=100,
)
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.50,0.5,0.00,0.0,,0.3,19.30,2,1,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.80,5.0,3.25,0.5,0.00,0.0,,0.3,9.05,2,1,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.50,21.5,0.50,0.5,0.00,0.0,,0.3,22.80,2,1,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.90,5.5,0.50,0.5,0.00,0.0,,0.3,6.80,2,1,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.50,0.5,2.26,0.0,,0.3,13.56,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2019-10-01 00:02:53,2019-10-01 00:14:32,N,1,126,74,1,3.10,12.0,0.50,0.5,0.00,0.0,,0.3,13.30,1,1,0.0
96,2,2019-10-01 00:18:45,2019-10-01 00:29:23,N,1,42,74,1,1.64,9.5,0.50,0.5,0.00,0.0,,0.3,10.80,2,1,0.0
97,2,2019-10-01 00:41:32,2019-10-01 00:52:51,N,1,75,42,1,3.17,11.5,0.50,0.5,1.50,0.0,,0.3,14.30,1,1,0.0
98,2,2019-10-01 00:36:54,2019-10-01 00:54:20,N,1,92,179,1,5.48,19.5,0.50,0.5,0.00,0.0,,0.3,20.80,2,1,0.0


In [4]:
# Convert both trip timestamp columns to pandas datetimes (the schema shown
# further down maps them to TIMESTAMP).
for ts_col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Connect to your DB using SQLAlchemy

In [8]:
import os

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Build the connection URL from environment variables so real credentials never
# have to be committed with the notebook. Every setting falls back to the value
# of the local tutorial database, so with no variables set the engine targets
# postgresql://root:root@localhost:5432/ny_taxi exactly as before.
# URL.create() also takes care of escaping special characters in the password.
db_url = URL.create(
    drivername="postgresql",
    username=os.environ.get("PG_USER", "root"),
    password=os.environ.get("PG_PASSWORD", "root"),
    host=os.environ.get("PG_HOST", "localhost"),
    port=int(os.environ.get("PG_PORT", "5432")),
    database=os.environ.get("PG_DATABASE", "ny_taxi"),
)
engine = create_engine(db_url)

In [10]:
# Verify that the database is reachable. The context manager closes the
# connection again, so this check does not leave a connection open for the
# rest of the session.
with engine.connect() as conn:
    print(conn)

<sqlalchemy.engine.base.Connection at 0x15547fe10>

In [11]:
# DDL: build the CREATE TABLE statement pandas derives from the frame's
# columns and dtypes, then print it for inspection.
ddl = pd.io.sql.get_schema(df, name='green_trip_data', con=engine)
print(ddl)


CREATE TABLE green_trip_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




# Load the Data into the Database

In [49]:
# Create a chunked reader that yields DataFrames of up to 100,000 rows each.
# The two timestamp columns are parsed while reading so that every chunk --
# including the first one, which is used to create the table and is inserted
# before the ingestion loop runs -- already carries datetime columns. Without
# this, the table is created from string columns and the timestamps of the
# first chunk are written as plain text.
df_iter = pd.read_csv(
    'green_tripdata_2019-10.csv',
    iterator=True,
    chunksize=100000,
    parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime'],
)

In [50]:
# Show the `engine` attribute of the chunked reader, i.e. the parser backend
# pandas selected for this file.
print(df_iter.engine)

c


In [51]:
# Pull the next chunk from the reader (the first one, at this point) and
# rebind `df` to it; each further call to next() yields the following chunk.
# The shape is displayed as the cell's result.
df = next(df_iter)
df.shape

(100000, 20)

In [52]:
# Schema creation: a zero-row slice keeps the column names and dtypes while
# dropping every data row.
df.iloc[:0]

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge


In [53]:
# Write the zero-row frame with if_exists='replace': an existing table of
# that name is replaced and no data rows are inserted.
header_only = df.head(n=0)
header_only.to_sql(name='green_taxi_data', con=engine, if_exists='replace')

0

In [54]:
# Append the rows of the chunk currently held in `df` to the table;
# the %time magic reports how long the insert took.
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

CPU times: user 2.73 s, sys: 172 ms, total: 2.91 s
Wall time: 4.89 s


1000

In [16]:
# Verify the schema

In [55]:
# Read back a ten-row sample to confirm the insert landed in the table.
query01 = """
SELECT * FROM green_taxi_data LIMIT 10;
"""

pd.read_sql(sql=query01, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,...,0.5,0.5,0.0,0.0,,0.3,19.3,2,1.0,0.0
1,1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.8,...,3.25,0.5,0.0,0.0,,0.3,9.05,2,1.0,0.0
2,2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.5,...,0.5,0.5,0.0,0.0,,0.3,22.8,2,1.0,0.0
3,3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.9,...,0.5,0.5,0.0,0.0,,0.3,6.8,2,1.0,0.0
4,4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,...,0.5,0.5,2.26,0.0,,0.3,13.56,1,1.0,0.0
5,5,2,2019-10-01 00:35:01,2019-10-01 00:43:40,N,1,65,49,1,1.47,...,0.5,0.5,1.86,0.0,,0.3,11.16,1,1.0,0.0
6,6,1,2019-10-01 00:28:09,2019-10-01 00:30:49,N,1,7,179,1,0.6,...,0.5,0.5,1.0,0.0,,0.3,6.3,1,1.0,0.0
7,7,2,2019-10-01 00:28:26,2019-10-01 00:32:01,N,1,41,74,1,0.56,...,0.5,0.5,0.0,0.0,,0.3,5.8,2,1.0,0.0
8,8,2,2019-10-01 00:14:01,2019-10-01 00:26:16,N,1,255,49,1,2.42,...,0.5,0.5,0.0,0.0,,0.3,11.8,2,1.0,0.0
9,9,1,2019-10-01 00:03:03,2019-10-01 00:17:13,Y,1,130,131,1,3.4,...,0.5,0.5,2.85,0.0,,0.3,17.15,1,1.0,0.0


In [56]:
from time import time

# Ingest every chunk still left in the reader. Iterating the reader directly
# ends the loop when the file is exhausted, which replaces the manual
# next()/StopIteration handling and keeps an error raised by the conversion or
# the insert from being mistaken for end-of-data.
t_start = time()
for df in df_iter:
    # handling date & time with pandas
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')
    t_end = time()
    print('Inserted chunk in...', t_end-t_start)

    # Restart the clock here so the next measurement also covers the time
    # spent reading the following chunk from disk.
    t_start = time()

print("Finished ingesting data into the database")

Inserted chunk in... 5.406062126159668
Inserted chunk in... 5.4556379318237305


  df = next(df_iter)


Inserted chunk in... 5.329927206039429
Inserted chunk in... 3.258514165878296
Finished ingesting data into the database


In [57]:
# Read the whole table back (the query has no LIMIT, so every row is fetched)
# to inspect the first and last rows after ingestion.
query01 = """
SELECT * FROM green_taxi_data;
"""

pd.read_sql(sql=query01, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,0,2.0,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1.0,112,196,1.0,5.88,...,0.50,0.5,0.00,0.00,,0.3,19.30,2.0,1.0,0.0
1,1,1.0,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1.0,43,263,1.0,0.80,...,3.25,0.5,0.00,0.00,,0.3,9.05,2.0,1.0,0.0
2,2,1.0,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1.0,255,228,2.0,7.50,...,0.50,0.5,0.00,0.00,,0.3,22.80,2.0,1.0,0.0
3,3,1.0,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1.0,181,181,1.0,0.90,...,0.50,0.5,0.00,0.00,,0.3,6.80,2.0,1.0,0.0
4,4,2.0,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1.0,97,188,1.0,2.52,...,0.50,0.5,2.26,0.00,,0.3,13.56,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476381,476381,,2019-10-31 23:30:00,2019-11-01 00:00:00,,,65,102,,7.04,...,2.75,0.5,0.00,0.00,,0.0,32.82,,,
476382,476382,,2019-10-31 23:03:00,2019-10-31 23:24:00,,,129,136,,0.00,...,2.75,0.5,0.00,6.12,,0.0,49.20,,,
476383,476383,,2019-10-31 23:02:00,2019-10-31 23:23:00,,,61,222,,3.90,...,2.75,0.5,0.00,0.00,,0.0,26.36,,,
476384,476384,,2019-10-31 23:42:00,2019-10-31 23:56:00,,,76,39,,3.08,...,2.75,0.5,0.00,0.00,,0.0,18.48,,,


In [58]:
# Count the ingested rows. An aggregate without GROUP BY always produces
# exactly one row, so no LIMIT clause is needed.
query02 = """
SELECT COUNT(*) FROM green_taxi_data;
"""

pd.read_sql(query02, con=engine)

Unnamed: 0,count
0,476386
