In [1]:
import sqlite3
import pandas as pd
import time
from urllib.parse import urlencode

In [2]:
# Constants
BASE_URL = "https://data.cityofnewyork.us/resource/h9gi-nx95.csv"
DB_PATH = "nyc_crashes.db"
TABLE_NAME = "crashes"
BATCH_SIZE = 50000

In [51]:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

total_fetched = 0
batch_num = 0

In [52]:
# Get max crash_date already in DB (or None if empty)
cursor.execute(f"SELECT MAX(crash_date) FROM {TABLE_NAME}")
max_date_in_db = cursor.fetchone()[0]
if max_date_in_db:
    max_date_in_db = pd.to_datetime(max_date_in_db).date()
print(f"Max crash_date in DB: {max_date_in_db}")

while True:
    offset = batch_num * BATCH_SIZE
    print(f"\nFetching batch {batch_num + 1}, records {offset + 1} to {offset + BATCH_SIZE} ...")

    params = {
        "$order": "crash_date DESC",
        "$limit": BATCH_SIZE,
        "$offset": offset
    }
    query_string = urlencode(params)
    url = f"{BASE_URL}?{query_string}"

    try:
        df = pd.read_csv(
            url,
            parse_dates=["crash_date"],
            dtype={"crash_time": str, "zip_code": str}
        )
        if df.empty:
            print("No more data to fetch, exiting.")
            break
            
        # Normalize column names: lowercase and replace spaces with underscores
        df.columns = [col.lower().replace(" ", "_") for col in df.columns]

        # Check batch max date vs DB max date
        batch_max_date = df["crash_date"].max().date()
        print(f"Batch max crash_date: {batch_max_date}")

        if max_date_in_db and batch_max_date <= max_date_in_db:
            print("Reached data already in database. Stopping fetch.")
            break
            
        if batch_num == 0:
            # Create table if not exists based on this batch's schema
            columns_with_types = []
            for col, dtype in df.dtypes.items():
                if pd.api.types.is_integer_dtype(dtype):
                    col_type = "INTEGER"
                elif pd.api.types.is_float_dtype(dtype):
                    col_type = "REAL"
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    col_type = "TEXT"
                else:
                    col_type = "TEXT"
                columns_with_types.append(f"{col} {col_type}")

            create_table_sql = f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} ({', '.join(columns_with_types)});"
            cursor.execute(create_table_sql)
            conn.commit()

            df.to_sql(TABLE_NAME, conn, if_exists="append", index=False)
            print(f"Inserted first {len(df)} records.")
        else:
            df.to_sql(TABLE_NAME, conn, if_exists="append", index=False)
            print(f"Inserted {len(df)} records.")

        total_fetched += len(df)
        batch_num += 1

        time.sleep(1)  # Be polite to the API

    except pd.errors.EmptyDataError:
        print("No more data to fetch, exiting.")
        break
    except Exception as e:
        print(f"Error fetching batch at offset {offset}: {e}")
        break

Max crash_date in DB: 2023-03-27

Fetching batch 1, records 1 to 50000 ...
Batch max crash_date: 2025-05-30
Inserted first 50000 records.

Fetching batch 2, records 50001 to 100000 ...
Batch max crash_date: 2024-10-25
Inserted 50000 records.

Fetching batch 3, records 100001 to 150000 ...
Batch max crash_date: 2024-04-10
Inserted 50000 records.

Fetching batch 4, records 150001 to 200000 ...
Batch max crash_date: 2023-09-28
Inserted 50000 records.

Fetching batch 5, records 200001 to 250000 ...
Batch max crash_date: 2023-03-27
Reached data already in database. Stopping fetch.


In [53]:
print(f"\nFinished. Total records inserted: {total_fetched}")
conn.close()


Finished. Total records inserted: 200000
