In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
import sqlite3
from sqlite3 import Error
from datetime import date, datetime, timezone, timedelta
from dateutil.parser import parse

import os
import glob
import re
import csv

In [2]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored in ``db_file``.

    A failure reported by sqlite3 is printed rather than raised, so the
    caller receives ``None`` instead of an exception in that case.

    :param db_file: database file
    :return: Connection object or None
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        # Report the problem and fall through to the None result below.
        print(exc)

    return None

In [3]:
# Open the SQLite database file used by every later cell.
# NOTE: create_connection prints the error and returns None on failure, so a
# None here will surface as an AttributeError in the cells that use `connection`.
connection = create_connection("NYC_YellowCab_data.sqlite3")

In [4]:
# DDL for the trip table. The column list matches, name for name, the columns
# named in the INSERT statement of the trip-data load cell. IF NOT EXISTS lets
# this cell be re-run without failing once the table is already present.
qry_tbl = '''
CREATE TABLE IF NOT EXISTS nyc_tripdata (
      VendorID INTEGER
    , tpep_pickup_datetime TIMESTAMP
    , tpep_dropoff_datetime TIMESTAMP
    , passenger_count INTEGER
    , trip_distance REAL
    , RatecodeID INTEGER
    , store_and_fwd_flag TEXT
    , PULocationID INTEGER
    , DOLocationID INTEGER
    , payment_type INTEGER
    , fare_amount REAL
    , extra REAL
    , mta_tax REAL
    , tip_amount REAL
    , tolls_amount REAL
    , improvement_surcharge REAL
    , total_amount REAL
    , congestion_surcharge REAL
);
'''

# Execute the DDL on a fresh cursor. The execute() call is the cell's last
# expression, which is why the notebook echoes a sqlite3.Cursor repr below.
cursor = connection.cursor()
cursor.execute(qry_tbl)

<sqlite3.Cursor at 0x29a0bf820a0>

In [12]:
# Bulk-load every "<YYYY>_Yellow_Taxi_Trip_Data.csv" file in the working
# directory into nyc_tripdata.
# NOTE: this cell appends rows; re-running it loads the same files again.

# The INSERT statement is identical for every file, so build it once.
sql_insert_records = "INSERT INTO nyc_tripdata (VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

# glob returns matches in arbitrary order; sorting keeps the load order
# (and therefore the printed log) reproducible between runs.
for trip_file in sorted(glob.glob(r'[0-9][0-9][0-9][0-9]_Yellow_Taxi_Trip_Data.csv')):
    print(f"Processing file {trip_file}")
    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly; the with-block guarantees the file
    # handle is closed even if the insert fails.
    with open(trip_file, encoding='utf-8', newline='') as fh:
        txt_records = csv.reader(fh)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(txt_records, None)
        # The connection context manager commits when the insert succeeds and
        # rolls back when it raises, so a half-loaded file is never left
        # pending for a later commit to persist.
        with connection:
            # Rows are streamed straight from the reader, never held in memory.
            cursor = connection.executemany(sql_insert_records, txt_records)
    print(f"Inserted {cursor.rowcount} rows")

Processing file 2019_Yellow_Taxi_Trip_Data.csv
Inserted 84399019 rows
Processing file 2020_Yellow_Taxi_Trip_Data.csv
Inserted 24648499 rows


In [13]:
# DDL for the zone lookup table. Its four columns are the ones named in the
# INSERT statement of the zone-load cell. IF NOT EXISTS lets this cell be
# re-run without failing once the table is already present.
qry_tbl2 = '''
CREATE TABLE IF NOT EXISTS taxi_zone_lookup (
      LocationID INTEGER
    , Borough TEXT
    , Zone TEXT
    , service_zone TEXT
);
'''

# Execute the DDL on a fresh cursor. The execute() call is the cell's last
# expression, which is why the notebook echoes a sqlite3.Cursor repr below.
cursor = connection.cursor()
cursor.execute(qry_tbl2)

<sqlite3.Cursor at 0x29a0bf828f0>

In [14]:
# Load the zone lookup CSV into taxi_zone_lookup.
# NOTE: this cell appends rows; re-running it loads the file again.
zone_file = "taxi_zone_lookup.csv"

sql_insert_records = "INSERT INTO taxi_zone_lookup (LocationID, Borough, Zone, service_zone) VALUES(?, ?, ?, ?)"

# newline='' is what the csv module requires so that newlines embedded in
# quoted fields are parsed correctly; the with-block guarantees the file
# handle is closed even if the insert fails.
with open(zone_file, encoding='utf-8', newline='') as fh:
    print(f"Processing file {zone_file}")
    txt_records = csv.reader(fh)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(txt_records, None)
    # Commit when the insert succeeds, roll back when it raises.
    with connection:
        cursor = connection.executemany(sql_insert_records, txt_records)
print(f"Inserted {cursor.rowcount} rows")

Processing file taxi_zone_lookup.csv
Inserted 265 rows


In [15]:
# Release the database connection now that both loads have been committed.
# Any cell run after this one must open a new connection first.
connection.close()