# Lab 8: Data Loading Simulation
This notebook simulates **three methods for loading data** into DuckDB:

- ✅ Append (no duplicates)
- ✅ Truncate and reload
- ✅ Incremental (insert or update)

It loads CPI data from the Philadelphia Fed API and runs all three methods once per day over a simulated date range, timing each run.

In [11]:
!pip3 install duckdb pandas openpyxl requests

1414.58s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [12]:
import duckdb
import openpyxl
import requests
import load_inc
import pandas as pd
from datetime import datetime, timedelta
import time
import os
import load_append
import load_trunc


usage: ipykernel_launcher.py [-h] --pull_date PULL_DATE
ipykernel_launcher.py: error: the following arguments are required: --pull_date


SystemExit: 2

In [None]:
# === Simulation Configuration ===
# Bounds of the simulated pull-date range; move end_date later for longer tests.
start_date, end_date = datetime(2004, 1, 1), datetime(2004, 3, 1)

# One independent list of elapsed-time samples per loading method.
append_times, trunc_times, inc_times = [], [], []


In [None]:
# === Run Loaders Over Date Range ===
def _timed_load(label, loader, pull_date_str, timings):
    """Run one loader for one pull date and record how long it took.

    Args:
        label: Short method name used in the error message (e.g. "append").
        loader: Module/object exposing ``run(pull_date_str)``.
        pull_date_str: Pull date formatted as ``YYYY-MM-DD``.
        timings: List that receives the elapsed seconds of a successful run.

    A failing run is reported and skipped (no timing is recorded) so that one
    bad date does not abort the whole simulation.
    """
    try:
        # perf_counter is monotonic, so the measured interval is unaffected
        # by system clock adjustments (unlike time.time()).
        t0 = time.perf_counter()
        loader.run(pull_date_str)
        timings.append(time.perf_counter() - t0)
    except Exception as e:
        print(f"[{label} error] {pull_date_str}: {e}")


# (error label, loader module, list collecting its timings), in run order.
_loaders = (
    ("append", load_append, append_times),
    ("trunc", load_trunc, trunc_times),
    ("incremental", load_inc, inc_times),
)

current_date = start_date
while current_date <= end_date:  # end_date itself is included
    pull_date_str = current_date.strftime("%Y-%m-%d")
    print(f"\n📅 Running loaders for: {pull_date_str}")

    for _label, _loader, _timings in _loaders:
        _timed_load(_label, _loader, pull_date_str, _timings)

    current_date += timedelta(days=1)


In [None]:
# === Preview Data from Each Table ===
def view_table(db_path, table_name):
    """Return the first 10 rows of ``table_name``, ordered by ``date``.

    Args:
        db_path: Path of the DuckDB database file to open.
        table_name: Name of the table to query. It is interpolated directly
            into the SQL text, so pass only trusted, hard-coded identifiers.

    Returns:
        The result of ``fetchdf()`` for the preview query.
    """
    con = duckdb.connect(db_path)
    try:
        query = f"SELECT * FROM {table_name} ORDER BY date LIMIT 10"
        return con.execute(query).fetchdf()
    finally:
        # Always release the connection, even if the query fails
        # (e.g. the table does not exist yet).
        con.close()

# (heading, database file, table name) for each loading method's output.
_previews = (
    ("\n📊 Sample from Append Table:", "economic_data_append.duckdb", "economic_data_append"),
    ("\n📊 Sample from Trunc Table:", "economic_data_trunc.duckdb", "economic_data_trunc"),
    ("\n📊 Sample from Incremental Table:", "economic_data_inc.duckdb", "economic_data_inc"),
)

# Print each heading, then render the corresponding 10-row preview.
for _heading, _db_file, _table in _previews:
    print(_heading)
    display(view_table(_db_file, _table))


In [None]:
# === Compare Method Performance ===
def summarize_times(label, times):
    """Format the mean of ``times`` as a one-line summary.

    Args:
        label: Name of the loading method shown at the start of the line.
        times: Sequence of elapsed times in seconds; may be empty.

    Returns:
        ``"<label> avg: <mean>s over <n> runs"`` with the mean shown to four
        decimal places, or ``"<label> avg: n/a over 0 runs"`` when ``times``
        is empty (for example, when every run of that method failed).
    """
    if not times:
        # Avoid ZeroDivisionError when no run produced a timing.
        return f"{label} avg: n/a over 0 runs"
    avg = sum(times) / len(times)
    return f"{label} avg: {avg:.4f}s over {len(times)} runs"

print("\n⏱️ Performance Summary:")
# One summary line per method, in the same order the loaders are run.
for _label, _samples in (
    ("Append", append_times),
    ("Trunc", trunc_times),
    ("Incremental", inc_times),
):
    print(summarize_times(_label, _samples))


In [None]:
# === Check for Duplicates in Append Table ===
# List every `date` value that occurs more than once in the append table.
con = duckdb.connect("economic_data_append.duckdb")
try:
    dup_check = con.execute("""
        SELECT date, COUNT(*) as count
        FROM economic_data_append
        GROUP BY date
        HAVING COUNT(*) > 1
    """).fetchdf()
finally:
    # Close the connection even if the query fails, so the database file
    # is not left held open by this kernel.
    con.close()

if dup_check.empty:
    print("\n✅ No duplicates in append table.")
else:
    print("\n⚠️ Duplicates found in append table:")
    display(dup_check)
