# Data Ingestion Validation

This notebook validates the data ingestion process by:
- Loading data from DuckDB
- Checking row counts per table
- Verifying date ranges
- Checking for missing values
- Displaying sample records
- Basic statistics (min/max dates, counts by month)

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling applied to every figure drawn after this cell
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Location of the DuckDB file (relative path)
db_path = Path('../data/nyc_mobility.duckdb')

# Check for the file BEFORE connecting: duckdb.connect() creates a new, empty
# database when the path does not exist, so testing existence afterwards would
# always report True and hide a missing or mis-located database.
db_exists = db_path.exists()
print(f"Database exists: {db_exists}")
if not db_exists:
    raise FileNotFoundError(
        f"DuckDB file not found at {db_path.resolve()} - "
        "check the path or load the data before running this notebook"
    )

# Connect to DuckDB
conn = duckdb.connect(str(db_path))
print(f"Connected to DuckDB at: {db_path}")

## 1. Check Available Tables

First, let's see what tables were created by DLT.

In [None]:
# List every table in the raw_data schema, sorted by table name
tables_df = conn.execute("""
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_schema = 'raw_data'
    ORDER BY table_name
""").df()

print("Available tables in raw_data schema:")
if tables_df.empty:
    # Make the "nothing loaded" case explicit instead of showing an empty frame
    print("  (none - the raw_data schema has no tables)")
else:
    # display() renders the frame as a rich table rather than plain text
    display(tables_df)

## 2. Row Counts Per Table

Check how many records were loaded into each table.

In [None]:
# Display label -> fully qualified table name.
# Insertion order is also the order in which the counts are printed.
tables_to_count = {
    'Yellow Taxi': 'raw_data.yellow_taxi',
    'FHV Taxi': 'raw_data.fhv_taxi',
    'CitiBike Trips': 'raw_data.trips',
    'Hourly Weather': 'raw_data.hourly_weather',
}

# Label -> formatted row count, or a short failure message when the count
# could not be obtained.
row_counts = {}

for label, table_name in tables_to_count.items():
    try:
        # table_name comes from the fixed mapping above (never from user
        # input), so interpolating it into the SQL text is safe.
        count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        row_counts[label] = f"{count:,}"
    except duckdb.CatalogException:
        # Raised when the table (or its schema) does not exist
        row_counts[label] = "Table not found"
    except duckdb.Error as exc:
        # Any other database failure is reported as such rather than being
        # mislabelled as a missing table
        row_counts[label] = f"Query failed ({type(exc).__name__})"

print("\nRow Counts:")
for table, count in row_counts.items():
    print(f"  {table}: {count}")

## 3. Yellow Taxi Data Validation

In [None]:
# Column names and types of raw_data.yellow_taxi, in table column order
yellow_schema = conn.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = 'raw_data' AND table_name = 'yellow_taxi'
    ORDER BY ordinal_position
""").df()

print("Yellow Taxi Schema:")
# Rich display, consistent with how the sample frame below is shown
display(yellow_schema)

# Sample records (LIMIT without ORDER BY: an arbitrary five rows, not a random sample)
print("\nSample Yellow Taxi Records:")
yellow_sample = conn.execute("SELECT * FROM raw_data.yellow_taxi LIMIT 5").df()
display(yellow_sample)

In [None]:
# Queries for this cell, defined up front so the execution steps below read linearly.

# Whole-table summary: pickup-date range, trip count and two averages.
# NOTE(review): the alias avg_fare is computed from total_amount - confirm that
# this is the intended column for a "fare" figure.
yellow_stats_sql = """
    SELECT 
        MIN(tpep_pickup_datetime) as min_date,
        MAX(tpep_pickup_datetime) as max_date,
        COUNT(*) as total_trips,
        AVG(trip_distance) as avg_distance,
        AVG(total_amount) as avg_fare
    FROM raw_data.yellow_taxi
"""

# Trip counts grouped by calendar year and month of the pickup timestamp.
yellow_monthly_sql = """
    SELECT 
        EXTRACT(YEAR FROM tpep_pickup_datetime) as year,
        EXTRACT(MONTH FROM tpep_pickup_datetime) as month,
        COUNT(*) as trip_count
    FROM raw_data.yellow_taxi
    GROUP BY year, month
    ORDER BY year, month
"""

# Step 1: overall statistics
yellow_stats = conn.execute(yellow_stats_sql).df()
print("Yellow Taxi Statistics:")
display(yellow_stats)

# Step 2: per-month trip counts
yellow_monthly = conn.execute(yellow_monthly_sql).df()
print("\nYellow Taxi Monthly Breakdown:")
display(yellow_monthly)

## 4. FHV Taxi Data Validation

In [None]:
# Column names and types of raw_data.fhv_taxi, in table column order
fhv_schema = conn.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = 'raw_data' AND table_name = 'fhv_taxi'
    ORDER BY ordinal_position
""").df()

print("FHV Taxi Schema:")
# Rich display, consistent with how the sample frame below is shown
display(fhv_schema)

# Sample records (LIMIT without ORDER BY: an arbitrary five rows, not a random sample)
print("\nSample FHV Records:")
fhv_sample = conn.execute("SELECT * FROM raw_data.fhv_taxi LIMIT 5").df()
display(fhv_sample)

In [None]:
# Queries for this cell, defined up front so the execution steps below read linearly.

# Whole-table summary: pickup-date range and trip count.
fhv_stats_sql = """
    SELECT 
        MIN(pickup_datetime) as min_date,
        MAX(pickup_datetime) as max_date,
        COUNT(*) as total_trips
    FROM raw_data.fhv_taxi
"""

# Trip counts grouped by calendar year and month of the pickup timestamp.
fhv_monthly_sql = """
    SELECT 
        EXTRACT(YEAR FROM pickup_datetime) as year,
        EXTRACT(MONTH FROM pickup_datetime) as month,
        COUNT(*) as trip_count
    FROM raw_data.fhv_taxi
    GROUP BY year, month
    ORDER BY year, month
"""

# Step 1: overall statistics
fhv_stats = conn.execute(fhv_stats_sql).df()
print("FHV Taxi Statistics:")
display(fhv_stats)

# Step 2: per-month trip counts
fhv_monthly = conn.execute(fhv_monthly_sql).df()
print("\nFHV Taxi Monthly Breakdown:")
display(fhv_monthly)

## 5. CitiBike Data Validation

In [None]:
# Column names and types of raw_data.trips (the CitiBike table), in table column order
citibike_schema = conn.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = 'raw_data' AND table_name = 'trips'
    ORDER BY ordinal_position
""").df()

print("CitiBike Schema:")
# Rich display, consistent with how the sample frame below is shown
display(citibike_schema)

# Sample records (LIMIT without ORDER BY: an arbitrary five rows, not a random sample)
print("\nSample CitiBike Records:")
citibike_sample = conn.execute("SELECT * FROM raw_data.trips LIMIT 5").df()
display(citibike_sample)

In [None]:
# Date range and statistics (note: CitiBike column names vary by year)
# Candidate names for the trip-start column, in order of preference.
date_col_candidates = ('started_at', 'starttime')

# Resolve the column from the catalog instead of running each query and
# swallowing whatever exception it raises: a genuine query failure is then
# no longer mistaken for "column not present".
trips_columns = {
    row[0].lower()  # unquoted SQL identifiers match case-insensitively
    for row in conn.execute("""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = 'raw_data' AND table_name = 'trips'
    """).fetchall()
}
date_col = next(
    (name for name in date_col_candidates if name in trips_columns),
    None,
)

if date_col is None:
    # Covers both a missing table (no columns listed) and an unknown column name
    print("Unable to determine date column")
    citibike_stats = None
else:
    # date_col is one of the fixed candidates above, so interpolating it into
    # the SQL text is safe.
    citibike_stats = conn.execute(f"""
        SELECT
            MIN({date_col}) as min_date,
            MAX({date_col}) as max_date,
            COUNT(*) as total_trips
        FROM raw_data.trips
    """).df()

if citibike_stats is not None:
    print("CitiBike Statistics:")
    display(citibike_stats)

    # Monthly breakdown: trip counts per calendar year and month
    citibike_monthly = conn.execute(f"""
        SELECT
            EXTRACT(YEAR FROM {date_col}) as year,
            EXTRACT(MONTH FROM {date_col}) as month,
            COUNT(*) as trip_count
        FROM raw_data.trips
        GROUP BY year, month
        ORDER BY year, month
    """).df()

    print("\nCitiBike Monthly Breakdown:")
    display(citibike_monthly)

## 6. Weather Data Validation

In [None]:
# Column names and types of raw_data.hourly_weather, in table column order
weather_schema = conn.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = 'raw_data' AND table_name = 'hourly_weather'
    ORDER BY ordinal_position
""").df()

print("Weather Schema:")
# Rich display, consistent with how the sample frame below is shown
display(weather_schema)

# Sample records (LIMIT without ORDER BY: an arbitrary five rows, not a random sample)
print("\nSample Weather Records:")
weather_sample = conn.execute("SELECT * FROM raw_data.hourly_weather LIMIT 5").df()
display(weather_sample)

In [None]:
# Queries for this cell, defined up front so the execution steps below read linearly.

# Whole-table summary: timestamp range, record count and three averages.
# NOTE(review): the alias avg_temp_celsius assumes temp is stored in Celsius -
# confirm against the ingestion source.
weather_stats_sql = """
    SELECT 
        MIN(timestamp) as min_date,
        MAX(timestamp) as max_date,
        COUNT(*) as total_records,
        AVG(temp) as avg_temp_celsius,
        AVG(humidity) as avg_humidity,
        AVG(wind_speed) as avg_wind_speed
    FROM raw_data.hourly_weather
"""

# Per-day record count and temperature summary, limited to the 10 earliest dates.
weather_daily_sql = """
    SELECT 
        DATE(timestamp) as date,
        COUNT(*) as hourly_records,
        AVG(temp) as avg_temp,
        MIN(temp) as min_temp,
        MAX(temp) as max_temp
    FROM raw_data.hourly_weather
    GROUP BY date
    ORDER BY date
    LIMIT 10
"""

# Step 1: overall statistics
weather_stats = conn.execute(weather_stats_sql).df()
print("Weather Statistics:")
display(weather_stats)

# Step 2: daily breakdown
weather_daily = conn.execute(weather_daily_sql).df()
print("\nWeather Daily Breakdown (first 10 days):")
display(weather_daily)

## 7. DLT Metadata Tables

Check DLT's metadata tables to see load information.

In [None]:
# Check DLT loads: show the 10 most recently inserted rows of raw_data._dlt_loads
try:
    dlt_loads = conn.execute("""
        SELECT * FROM raw_data._dlt_loads
        ORDER BY inserted_at DESC
        LIMIT 10
    """).df()
except duckdb.CatalogException:
    # Raised when the table (or its schema) does not exist
    print("DLT metadata table not found - data may not have been loaded yet")
except duckdb.Error as exc:
    # Any other database failure is reported as such instead of being
    # mislabelled as a missing table
    print(f"Could not read DLT load metadata: {exc}")
else:
    # Only reached when the query succeeded
    print("Recent DLT Loads:")
    display(dlt_loads)

## 8. Data Completeness Summary

Expected data for Q4 2023 (Oct-Dec):
- **Yellow Taxi**: ~3M trips (Oct-Dec 2023)
- **FHV**: ~15M trips (Oct-Dec 2023)
- **CitiBike**: ~1.5M trips (Oct-Dec 2023)
- **Weather**: 2,208 hourly records (92 days × 24 hours)

In [None]:
print("\n" + "="*80)
print("DATA INGESTION VALIDATION SUMMARY")
print("="*80)

# row_counts (built in the row-count cell) maps a label to a string: either a
# formatted count such as "1,234" or a failure message such as "Table not found".
# Only a positive count earns a check mark, so a missing or empty table is not
# reported as a success.
for table, count in row_counts.items():
    digits = str(count).replace(",", "")
    if digits.isdigit() and int(digits) > 0:
        print(f"  ✓ {table}: {count} records")
    elif digits.isdigit():
        print(f"  ✗ {table}: {count} records (table is empty)")
    else:
        print(f"  ✗ {table}: {count}")

print("\n" + "="*80)
print("Next Steps:")
print("  1. Review data quality in 02_data_quality_assessment.ipynb")
print("  2. Perform exploratory analysis in 03_exploratory_analysis.ipynb")
print("="*80)

In [None]:
# Close the DuckDB connection opened in the first code cell.
# Cells that use `conn` cannot be re-run after this until the connection
# cell has been executed again.
conn.close()
print("\nConnection closed.")