In [1]:
import sqlite3
import pandas as pd
import time
from urllib.parse import urlencode

In [2]:
# --- Configuration -----------------------------------------------------------
# Local SQLite file and the table that receives the downloaded rows.
DB_PATH = "nyc_crashes.db"
TABLE_NAME = "crashes"

# Remote CSV endpoint that is paged through, and the number of rows requested per page.
BASE_URL = "https://data.cityofnewyork.us/resource/h9gi-nx95.csv"
BATCH_SIZE = 50_000

In [3]:
# Running counters: index of the next batch to request and rows stored so far.
batch_num = 0
total_fetched = 0

# Open the SQLite database file and keep a cursor for issuing raw SQL statements.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [None]:
# Download the dataset page by page and append every page to the SQLite table.
#
# Uses names defined in earlier cells: BASE_URL, BATCH_SIZE, TABLE_NAME, conn,
# cursor, and the running counters batch_num / total_fetched.
#
# The loop stops when a page comes back empty or when any error occurs. The
# counters are only advanced after a page has been inserted, so the next
# iteration (or a re-run of this cell without resetting the counters) requests
# the offset that follows the last stored page.


def _quote_identifier(name):
    """Return `name` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _sqlite_column_type(dtype):
    """Map a pandas dtype to the SQLite column type used when creating the table."""
    if pd.api.types.is_integer_dtype(dtype):
        return "INTEGER"
    if pd.api.types.is_float_dtype(dtype):
        return "REAL"
    # Datetimes, strings and every other dtype are declared as TEXT.
    return "TEXT"


while True:
    offset = batch_num * BATCH_SIZE
    print(f"\nFetching batch {batch_num + 1}, records {offset + 1} to {offset + BATCH_SIZE} ...")

    params = {
        # crash_date alone is not a total order when several rows share a date,
        # so ":id" is added as a tiebreaker; without it consecutive
        # $limit/$offset pages may overlap or skip rows.
        "$order": "crash_date DESC, :id",
        "$limit": BATCH_SIZE,
        "$offset": offset,
    }
    url = f"{BASE_URL}?{urlencode(params)}"

    try:
        df = pd.read_csv(
            url,
            parse_dates=["crash_date"],
            # Read these as strings so pandas does not infer a numeric dtype for them.
            dtype={"crash_time": str, "zip_code": str},
        )
        if df.empty:
            print("No more data to fetch, exiting.")
            break

        # Normalize column names: lowercase and replace spaces with underscores
        df.columns = [col.lower().replace(" ", "_") for col in df.columns]

        is_first_batch = batch_num == 0
        if is_first_batch:
            # Create the table (if it does not exist yet) from this batch's schema.
            # Identifiers are quoted because the column names come from the
            # remote CSV header rather than from this notebook.
            column_defs = ", ".join(
                f"{_quote_identifier(col)} {_sqlite_column_type(dtype)}"
                for col, dtype in df.dtypes.items()
            )
            create_table_sql = (
                f"CREATE TABLE IF NOT EXISTS {_quote_identifier(TABLE_NAME)} ({column_defs});"
            )
            cursor.execute(create_table_sql)
            conn.commit()

        df.to_sql(TABLE_NAME, conn, if_exists="append", index=False)
        if is_first_batch:
            print(f"Inserted first {len(df)} records.")
        else:
            print(f"Inserted {len(df)} records.")

        total_fetched += len(df)
        batch_num += 1

        time.sleep(1)  # Be polite to the API

    except pd.errors.EmptyDataError:
        # The response had no parsable content at all: treat it as end of data.
        print("No more data to fetch, exiting.")
        break
    except Exception as e:
        # Deliberate best-effort: report the failure and stop, keeping the rows
        # stored so far. batch_num was not advanced for the failed batch.
        print(f"Error fetching batch at offset {offset}: {e}")
        break


Fetching batch 5, records 200001 to 250000 ...
Inserted 50000 records.

Fetching batch 6, records 250001 to 300000 ...
Inserted 50000 records.

Fetching batch 7, records 300001 to 350000 ...
Inserted 50000 records.

Fetching batch 8, records 350001 to 400000 ...
Inserted 50000 records.

Fetching batch 9, records 400001 to 450000 ...
Inserted 50000 records.

Fetching batch 10, records 450001 to 500000 ...
Inserted 50000 records.

Fetching batch 11, records 500001 to 550000 ...
Inserted 50000 records.

Fetching batch 12, records 550001 to 600000 ...
Inserted 50000 records.

Fetching batch 13, records 600001 to 650000 ...
Inserted 50000 records.

Fetching batch 14, records 650001 to 700000 ...
Inserted 50000 records.

Fetching batch 15, records 700001 to 750000 ...
Inserted 50000 records.

Fetching batch 16, records 750001 to 800000 ...
Inserted 50000 records.

Fetching batch 17, records 800001 to 850000 ...
Inserted 50000 records.

Fetching batch 18, records 850001 to 900000 ...
Inserte

In [None]:
# Report how many rows were stored in total, then release the database handle.
summary_line = f"\nFinished. Total records inserted: {total_fetched}"
print(summary_line)
conn.close()