# Database Table Explorer - Marketing Analytics Pipeline

This notebook prints the schema and row count of every table in the pipeline's SQLite databases, plus up to five sample rows from each non-empty table:
- **staging.db** - Derived tables (cleaned and standardized source data)
- **warehouse.db** - Dimension and fact tables (dimensional model)
- **business.db** - Business analysis tables (insights and analytics)
- **metadata.db** - Pipeline metadata (execution logs and data quality)

In [1]:
import sqlite3
import pandas as pd
import os
from IPython.display import display, HTML

In [2]:
# Locate the pipeline databases.
# The project root is taken to be the parent of the current working directory
# (the notebook is expected to run from the notebooks folder), and every
# database file lives in <project root>/data/<name>.db.
base_path = os.path.dirname(os.getcwd())
databases = {
    db_name: os.path.join(base_path, 'data', f'{db_name}.db')
    for db_name in ('staging', 'warehouse', 'business', 'metadata')
}

# Report each resolved path and whether the file is actually present.
print("Database paths:")
for name, path in databases.items():
    if os.path.exists(path):
        exists = "EXISTS"
    else:
        exists = "NOT FOUND"
    print(f"  {name}: {path} ({exists})")

Database paths:
  staging: /Users/ashwindhanasamy/Documents/cave/HEC/marketing-analytics-pipeline/data/staging.db (EXISTS)
  warehouse: /Users/ashwindhanasamy/Documents/cave/HEC/marketing-analytics-pipeline/data/warehouse.db (EXISTS)
  business: /Users/ashwindhanasamy/Documents/cave/HEC/marketing-analytics-pipeline/data/business.db (EXISTS)
  metadata: /Users/ashwindhanasamy/Documents/cave/HEC/marketing-analytics-pipeline/data/metadata.db (EXISTS)


In [3]:
def connect_to_database(db_name):
    """Open a SQLite connection to one of the configured pipeline databases.

    Args:
        db_name: Key into the module-level ``databases`` mapping.

    Returns:
        A ``sqlite3.Connection`` when ``db_name`` is configured and its file
        exists on disk; ``None`` otherwise. The caller is responsible for
        closing the returned connection.
    """
    db_path = databases.get(db_name)
    # Bail out before connecting: sqlite3.connect would otherwise create an
    # empty database file at a path that does not exist yet.
    if not db_path or not os.path.exists(db_path):
        return None
    return sqlite3.connect(db_path)

def get_tables(conn):
    """Return the names of all tables listed in the database's ``sqlite_master``.

    Args:
        conn: Open ``sqlite3`` connection.

    Returns:
        list: One table name per ``sqlite_master`` row whose type is 'table'
        (views and indexes are excluded by the query's filter).
    """
    query = "SELECT name FROM sqlite_master WHERE type='table'"
    table_names = []
    for record in conn.execute(query):
        table_names.append(record[0])
    return table_names

def get_table_info(conn, table_name):
    """Return the column metadata and row count of a single table.

    The table name is embedded in the SQL as a double-quoted identifier (with
    any embedded double quote doubled), so names containing spaces,
    punctuation or reserved words such as ``order`` or ``group`` work. The
    name is treated as one identifier; it is not split on dots.

    Args:
        conn: Open ``sqlite3`` connection.
        table_name: Name of the table to inspect.

    Returns:
        tuple: ``(columns, row_count)`` where ``columns`` is the list of rows
        returned by ``PRAGMA table_info`` (each row is
        ``(cid, name, type, notnull, dflt_value, pk)``) and ``row_count`` is
        the result of ``SELECT COUNT(*)`` on the table.

    Raises:
        sqlite3.Error: Propagated from SQLite, e.g. when the table does not
            exist and the count query fails.
    """
    # Identifiers cannot be bound as SQL parameters, so quote the name instead.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'

    # Column information
    columns = conn.execute(f"PRAGMA table_info({quoted_name})").fetchall()

    # Row count
    row_count = conn.execute(f"SELECT COUNT(*) FROM {quoted_name}").fetchone()[0]

    return columns, row_count

def get_sample_data(conn, table_name, limit=5):
    """Fetch the first few rows of a table as a DataFrame.

    The table name is embedded as a double-quoted identifier (embedded double
    quotes doubled) so names with spaces or reserved words can be sampled,
    and the row limit is passed as a bound parameter rather than formatted
    into the SQL text.

    Args:
        conn: Open ``sqlite3`` connection.
        table_name: Name of the table to sample.
        limit: Maximum number of rows to return (default 5).

    Returns:
        pandas.DataFrame: The sampled rows. If anything goes wrong, a one-row
        frame with a single ``Error`` column holding the exception message is
        returned instead, so a failing table does not abort the listing.
    """
    try:
        # Identifiers cannot be bound as parameters, so quote the name.
        quoted_name = '"' + str(table_name).replace('"', '""') + '"'
        return pd.read_sql_query(
            f"SELECT * FROM {quoted_name} LIMIT ?", conn, params=(int(limit),)
        )
    except Exception as e:
        # Deliberate best-effort: report the problem in place of the sample.
        return pd.DataFrame({'Error': [str(e)]})

## Staging Database (Derived Tables)
Cleaned and standardized source data

In [4]:
# List every table in the staging database: row count, columns (primary-key
# columns flagged) and a small data sample for non-empty tables.
conn = connect_to_database('staging')
if conn is not None:
    # try/finally releases the connection even if a query or display call
    # raises part-way through the listing.
    try:
        tables = get_tables(conn)
        print(f"Tables in staging database: {len(tables)}")

        for table in tables:
            print(f"\n--- Table: {table} ---")
            columns, row_count = get_table_info(conn, table)
            print(f"Rows: {row_count:,}")

            print("Columns:")
            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            for _cid, col_name, col_type, _notnull, _default, pk_position in columns:
                pk = " (PRIMARY KEY)" if pk_position else ""
                print(f"  {col_name} ({col_type}){pk}")

            if row_count > 0:
                print("\nSample data:")
                sample = get_sample_data(conn, table)
                display(sample)
    finally:
        conn.close()
else:
    print("Staging database not found")

Tables in staging database: 2

--- Table: stg_sales_cleaned ---
Rows: 40,000
Columns:
  date_parsed (DATE)
  customer_id (INTEGER)
  order_id (INTEGER)
  sales_amount (REAL)
  load_timestamp (TIMESTAMP)
  data_quality_flag (TEXT)

Sample data:


Unnamed: 0,date_parsed,customer_id,order_id,sales_amount,load_timestamp,data_quality_flag
0,2021-01-01,990787,1,167.72,2025-09-05 20:13:08,VALID
1,2021-01-01,284871,34,164.37,2025-09-05 20:13:08,VALID
2,2021-01-01,194576,35,125.67,2025-09-05 20:13:08,VALID
3,2021-01-01,992731,36,142.25,2025-09-05 20:13:08,VALID
4,2021-01-01,434276,37,121.01,2025-09-05 20:13:08,VALID



--- Table: stg_sales_raw ---
Rows: 40,000
Columns:
  unnamed_0 (INTEGER)
  date_raw (TEXT)
  customer_id (INTEGER)
  order_id (INTEGER)
  sales (REAL)
  load_timestamp (TIMESTAMP)
  source_file (TEXT)

Sample data:


Unnamed: 0,unnamed_0,date_raw,customer_id,order_id,sales,load_timestamp,source_file
0,0,2021-01-01,990787,1,167.72,2025-09-05 16:13:08.768627,HEC_testing_data_sample_2_.csv
1,33,2021-01-01,284871,34,164.37,2025-09-05 16:13:08.768627,HEC_testing_data_sample_2_.csv
2,34,2021-01-01,194576,35,125.67,2025-09-05 16:13:08.768627,HEC_testing_data_sample_2_.csv
3,35,2021-01-01,992731,36,142.25,2025-09-05 16:13:08.768627,HEC_testing_data_sample_2_.csv
4,36,2021-01-01,434276,37,121.01,2025-09-05 16:13:08.768627,HEC_testing_data_sample_2_.csv


## Warehouse Database (Dimension & Fact Tables)
Dimensional model optimized for analytics

In [5]:
# List every table in the warehouse database, grouped by name prefix into
# dimension (dim_), fact (fact_) and other tables. For each table: row count,
# columns (primary-key columns flagged) and a small sample when non-empty.
conn = connect_to_database('warehouse')
if conn is not None:
    # try/finally releases the connection even if a query or display call
    # raises part-way through the listing.
    try:
        tables = get_tables(conn)
        print(f"Tables in warehouse database: {len(tables)}")

        # Separate dimension and fact tables
        dim_tables = [t for t in tables if t.startswith('dim_')]
        fact_tables = [t for t in tables if t.startswith('fact_')]
        other_tables = [t for t in tables if not t.startswith(('dim_', 'fact_'))]

        # One shared loop replaces three copies of the same per-table report;
        # the group order and the printed headings are unchanged.
        for heading, group in (
            ("DIMENSION", dim_tables),
            ("FACT", fact_tables),
            ("OTHER", other_tables),
        ):
            if not group:
                continue  # no heading is printed for an empty group

            print(f"\n=== {heading} TABLES ===")
            for table in group:
                print(f"\n--- Table: {table} ---")
                columns, row_count = get_table_info(conn, table)
                print(f"Rows: {row_count:,}")

                print("Columns:")
                # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
                for _cid, col_name, col_type, _notnull, _default, pk_position in columns:
                    pk = " (PRIMARY KEY)" if pk_position else ""
                    print(f"  {col_name} ({col_type}){pk}")

                if row_count > 0:
                    print("\nSample data:")
                    sample = get_sample_data(conn, table)
                    display(sample)
    finally:
        conn.close()
else:
    print("Warehouse database not found")

Tables in warehouse database: 4

=== DIMENSION TABLES ===

--- Table: dim_date ---
Rows: 685
Columns:
  date_id (INTEGER) (PRIMARY KEY)
  full_date (DATE)
  year (INTEGER)
  quarter (INTEGER)
  month (INTEGER)
  month_name (TEXT)
  month_abbr (TEXT)
  week_of_year (INTEGER)
  day_of_year (INTEGER)
  day_of_month (INTEGER)
  day_of_week (INTEGER)
  day_name (TEXT)
  day_abbr (TEXT)
  is_weekend (INTEGER)
  is_month_start (INTEGER)
  is_month_end (INTEGER)
  is_quarter_start (INTEGER)
  is_quarter_end (INTEGER)
  is_year_start (INTEGER)
  is_year_end (INTEGER)
  date_string (TEXT)
  month_year (TEXT)
  quarter_year (TEXT)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,date_id,full_date,year,quarter,month,month_name,month_abbr,week_of_year,day_of_year,day_of_month,...,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_string,month_year,quarter_year,created_at
0,20210101,2021-01-01,2021,1,1,January,Jan,0,1,1,...,1,0,1,0,1,0,2021-01-01,,Q1,2025-09-05 20:13:08
1,20210102,2021-01-02,2021,1,1,January,Jan,0,2,2,...,0,0,0,0,0,0,2021-01-02,,Q1,2025-09-05 20:13:08
2,20210103,2021-01-03,2021,1,1,January,Jan,0,3,3,...,0,0,0,0,0,0,2021-01-03,,Q1,2025-09-05 20:13:08
3,20210104,2021-01-04,2021,1,1,January,Jan,1,4,4,...,0,0,0,0,0,0,2021-01-04,,Q1,2025-09-05 20:13:08
4,20210105,2021-01-05,2021,1,1,January,Jan,1,5,5,...,0,0,0,0,0,0,2021-01-05,,Q1,2025-09-05 20:13:08



--- Table: dim_customer ---
Rows: 33,477
Columns:
  customer_id (INTEGER) (PRIMARY KEY)
  first_order_date (DATE)
  last_order_date (DATE)
  total_transactions (INTEGER)
  total_spent (REAL)
  avg_order_value (REAL)
  total_orders (INTEGER)
  first_order_cohort_month (TEXT)
  first_order_cohort_quarter (TEXT)
  first_order_cohort_year (INTEGER)
  days_since_first_order (INTEGER)
  customer_vintage_group (TEXT)
  days_since_last_order (INTEGER)
  customer_segment (TEXT)
  customer_status (TEXT)
  created_at (TIMESTAMP)
  updated_at (TIMESTAMP)

Sample data:


Unnamed: 0,customer_id,first_order_date,last_order_date,total_transactions,total_spent,avg_order_value,total_orders,first_order_cohort_month,first_order_cohort_quarter,first_order_cohort_year,days_since_first_order,customer_vintage_group,days_since_last_order,customer_segment,customer_status,created_at,updated_at
0,100001,2021-02-03,2022-11-01,2,484.83,242.415,2,2021-02,2021-Q1,2021,1675.842465,1039.84246527776,365+ days,VIP At Risk,Inactive,2025-09-05 20:13:09,2025-09-05 20:13:09
1,100041,2021-09-24,2021-09-24,1,229.22,229.22,1,2021-09,2021-Q3,2021,1442.842465,1442.84246527776,365+ days,VIP At Risk,Inactive,2025-09-05 20:13:09,2025-09-05 20:13:09
2,100155,2021-06-01,2021-06-01,1,55.86,55.86,1,2021-06,2021-Q2,2021,1557.842465,1557.84246527776,365+ days,One-Time Buyer,Inactive,2025-09-05 20:13:09,2025-09-05 20:13:09
3,100178,2021-05-15,2021-05-15,1,88.17,88.17,1,2021-05,2021-Q2,2021,1574.842465,1574.84246527776,365+ days,One-Time Buyer,Inactive,2025-09-05 20:13:09,2025-09-05 20:13:09
4,100184,2021-05-25,2021-11-26,2,370.02,185.01,2,2021-05,2021-Q2,2021,1564.842465,1379.84246527776,365+ days,VIP At Risk,Inactive,2025-09-05 20:13:09,2025-09-05 20:13:09



--- Table: dim_order ---
Rows: 40,000
Columns:
  order_id (INTEGER) (PRIMARY KEY)
  customer_id (INTEGER)
  order_date (DATE)
  date_id (INTEGER)
  order_amount (REAL)
  order_year (INTEGER)
  order_month (INTEGER)
  order_quarter (INTEGER)
  order_day_of_week (INTEGER)
  order_day_name (TEXT)
  is_weekend_order (INTEGER)
  customer_order_sequence (INTEGER)
  is_first_order (INTEGER)
  days_since_customer_first_order (INTEGER)
  days_since_previous_order (INTEGER)
  order_amount_quartile (TEXT)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,order_id,customer_id,order_date,date_id,order_amount,order_year,order_month,order_quarter,order_day_of_week,order_day_name,is_weekend_order,customer_order_sequence,is_first_order,days_since_customer_first_order,days_since_previous_order,order_amount_quartile,created_at
0,1,990787,2021-01-01,20210101,167.72,2021,1,1,6,,0,1,1,0,,Medium-High,2025-09-05 20:13:09
1,2,988913,2021-01-01,20210101,217.93,2021,1,1,6,,0,1,1,0,,High,2025-09-05 20:13:09
2,3,361999,2021-01-01,20210101,124.45,2021,1,1,6,,0,1,1,0,,Medium-Low,2025-09-05 20:13:09
3,4,283625,2021-01-01,20210101,190.51,2021,1,1,6,,0,1,1,0,,Medium-High,2025-09-05 20:13:09
4,5,253640,2021-01-01,20210101,72.64,2021,1,1,6,,0,1,1,0,,Low,2025-09-05 20:13:09



=== FACT TABLES ===

--- Table: fact_sales ---
Rows: 40,000
Columns:
  customer_id (INTEGER) (PRIMARY KEY)
  order_id (INTEGER) (PRIMARY KEY)
  date_id (INTEGER) (PRIMARY KEY)
  sales_amount (REAL)
  transaction_count (INTEGER)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,customer_id,order_id,date_id,sales_amount,transaction_count,created_at
0,113769,13,20210101,110.12,1,2025-09-05 20:13:09
1,137819,27,20210101,236.52,1,2025-09-05 20:13:09
2,140153,53,20210101,319.42,1,2025-09-05 20:13:09
3,147613,52,20210101,115.02,1,2025-09-05 20:13:09
4,158237,7,20210101,87.72,1,2025-09-05 20:13:09


## Business Database (Business Analysis Tables)
Business insights, analytics, and derived metrics

In [6]:
# List every table in the business database: row count, columns (primary-key
# columns flagged) and a small data sample for non-empty tables.
conn = connect_to_database('business')
if conn is not None:
    # try/finally releases the connection even if a query or display call
    # raises part-way through the listing.
    try:
        tables = get_tables(conn)
        print(f"Tables in business database: {len(tables)}")

        for table in tables:
            print(f"\n--- Table: {table} ---")
            columns, row_count = get_table_info(conn, table)
            print(f"Rows: {row_count:,}")

            print("Columns:")
            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            for _cid, col_name, col_type, _notnull, _default, pk_position in columns:
                pk = " (PRIMARY KEY)" if pk_position else ""
                print(f"  {col_name} ({col_type}){pk}")

            if row_count > 0:
                print("\nSample data:")
                sample = get_sample_data(conn, table)
                display(sample)
    finally:
        conn.close()
else:
    print("Business database not found")

Tables in business database: 9

--- Table: monthly_metrics ---
Rows: 23
Columns:
  period_month (TEXT) (PRIMARY KEY)
  total_sales (REAL)
  avg_order_value (REAL)
  total_transactions (INTEGER)
  total_orders (INTEGER)
  unique_customers (INTEGER)
  purchase_frequency (REAL)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,period_month,total_sales,avg_order_value,total_transactions,total_orders,unique_customers,purchase_frequency,created_at
0,2021-01,279395.61,164.157233,1702,1702,1688,1.01,2025-09-05 20:13:09
1,2021-02,256331.7,165.37529,1550,1550,1542,1.01,2025-09-05 20:13:09
2,2021-03,278361.89,163.549877,1702,1702,1689,1.01,2025-09-05 20:13:09
3,2021-04,281264.62,165.644653,1698,1698,1684,1.01,2025-09-05 20:13:09
4,2021-05,299902.93,171.471086,1749,1749,1729,1.01,2025-09-05 20:13:09



--- Table: cohort_analysis ---
Rows: 276
Columns:
  cohort_month (TEXT) (PRIMARY KEY)
  activity_month (TEXT) (PRIMARY KEY)
  months_since_acquisition (INTEGER)
  cohort_size (INTEGER)
  active_customers (INTEGER)
  retention_rate_percent (REAL)
  total_sales (REAL)
  avg_order_value (REAL)

Sample data:


Unnamed: 0,cohort_month,activity_month,months_since_acquisition,cohort_size,active_customers,retention_rate_percent,total_sales,avg_order_value
0,2021-01,2021-01,0,1688,1688,100.0,279395.61,164.157233
1,2021-01,2021-02,1,1688,28,1.66,3842.04,137.215714
2,2021-01,2021-03,2,1688,36,2.13,5909.74,159.722703
3,2021-01,2021-04,3,1688,22,1.3,3154.96,143.407273
4,2021-01,2021-05,4,1688,32,1.9,5815.24,176.219394



--- Table: customer_ltv_analysis ---
Rows: 33,477
Columns:
  customer_id (INTEGER) (PRIMARY KEY)
  acquisition_cohort (TEXT)
  customer_segment (TEXT)
  total_orders (INTEGER)
  total_spent (REAL)
  avg_order_value (REAL)
  days_active (INTEGER)
  predicted_ltv_score (INTEGER)
  churn_risk_score (REAL)

Sample data:


Unnamed: 0,customer_id,acquisition_cohort,customer_segment,total_orders,total_spent,avg_order_value,days_active,predicted_ltv_score,churn_risk_score
0,100001,2021-02,VIP At Risk,2,484.83,242.415,1675.842465,1,0.9
1,100041,2021-09,VIP At Risk,1,229.22,229.22,1442.842465,1,0.9
2,100155,2021-06,One-Time Buyer,1,55.86,55.86,1557.842465,1,0.9
3,100178,2021-05,One-Time Buyer,1,88.17,88.17,1574.842465,1,0.9
4,100184,2021-05,VIP At Risk,2,370.02,185.01,1564.842465,1,0.9



--- Table: campaign_targets ---
Rows: 28,753
Columns:
  customer_id (INTEGER) (PRIMARY KEY)
  campaign_type (TEXT) (PRIMARY KEY)
  priority_level (INTEGER)
  estimated_value (REAL)
  days_since_last_order (INTEGER)
  recommended_action (TEXT)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,customer_id,campaign_type,priority_level,estimated_value,days_since_last_order,recommended_action,created_at
0,491584,Long-term Win-back,5,1067.66,365+ days,Final 25% discount attempt,2025-09-05 20:13:32
1,726309,Long-term Win-back,5,972.53,365+ days,Final 25% discount attempt,2025-09-05 20:13:32
2,394105,Long-term Win-back,5,947.46,365+ days,Final 25% discount attempt,2025-09-05 20:13:32
3,203058,Long-term Win-back,5,867.26,365+ days,Final 25% discount attempt,2025-09-05 20:13:32
4,346390,Long-term Win-back,5,815.0,365+ days,Final 25% discount attempt,2025-09-05 20:13:32



--- Table: business_insights ---
Rows: 6
Columns:
  insight_id (TEXT) (PRIMARY KEY)
  insight_type (TEXT)
  insight_title (TEXT)
  insight_description (TEXT)
  metric_value (REAL)
  recommendation (TEXT)
  priority_level (INTEGER)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,insight_id,insight_type,insight_title,insight_description,metric_value,recommendation,priority_level,created_at
0,CONV_001,CONVERSION,Customer Conversion Rate,"Out of 33,477 customers, 28,753 (85.89%) are o...",85.89,Implement automated email sequences to convert...,1,2025-09-05 20:13:32
1,COH_001,COHORT,Best Performing Cohort,Cohort 2021-07 has the highest month-1 retenti...,2.02,Analyze and replicate the acquisition strategi...,2,2025-09-05 20:13:32
2,RET_001,RETENTION,Best Retention Cohort Performance,Cohort 2021-01 shows strongest retention: 3m=1...,100.0,Analyze acquisition channels and onboarding fo...,1,2025-09-05 20:13:32
3,RISK_001,CHURN_RISK,High-Value Customers at Risk,"235 high-LTV customers are at risk, representi...",235.0,Immediate intervention with personalized offer...,1,2025-09-05 20:13:32
4,SEG_CAN,SEGMENTATION,Cannot Lose Them Segment Analysis,"16,527 customers (49.4%) in Cannot Lose Them s...",49.4,Focus on cannot lose them with targeted campaigns,1,2025-09-05 20:13:32



--- Table: customer_lifecycle_snapshot ---
Rows: 1
Columns:
  snapshot_date (DATE) (PRIMARY KEY)
  lifecycle_stage (TEXT) (PRIMARY KEY)
  customers (INTEGER)
  share_of_base (REAL)
  avg_days_since_last_order (REAL)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,snapshot_date,lifecycle_stage,customers,share_of_base,avg_days_since_last_order,created_at
0,2022-11-16,Inactive,33477,1.0,365.0,2025-09-05 20:13:32



--- Table: cumulative_retention_analysis ---
Rows: 69
Columns:
  cohort_month (TEXT) (PRIMARY KEY)
  retention_window_months (INTEGER) (PRIMARY KEY)
  cohort_size (INTEGER)
  active_customers (INTEGER)
  cumulative_retention_rate (REAL)
  avg_purchase_frequency (REAL)
  total_revenue (REAL)
  avg_customer_value (REAL)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,cohort_month,retention_window_months,cohort_size,active_customers,cumulative_retention_rate,avg_purchase_frequency,total_revenue,avg_customer_value,created_at
0,2021-01,3,1688,1688,100.0,1.06,292302.35,173.16,2025-09-05 20:13:09
1,2021-02,3,1514,1514,100.0,1.06,266411.17,175.97,2025-09-05 20:13:09
2,2021-03,3,1630,1630,100.0,1.06,284563.9,174.58,2025-09-05 20:13:09
3,2021-04,3,1607,1607,100.0,1.06,280935.68,174.82,2025-09-05 20:13:09
4,2021-05,3,1625,1625,100.0,1.05,290916.64,179.03,2025-09-05 20:13:09



--- Table: customer_segmentation ---
Rows: 33,477
Columns:
  customer_id (INTEGER) (PRIMARY KEY)
  recency_score (INTEGER)
  frequency_score (INTEGER)
  monetary_score (INTEGER)
  rfm_segment (TEXT)
  segment_description (TEXT)
  recommended_strategy (TEXT)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,customer_id,recency_score,frequency_score,monetary_score,rfm_segment,segment_description,recommended_strategy,created_at
0,100001,5,2,5,New Customers,Recent customers with potential,"Onboarding campaigns, product education",2025-09-05 20:13:32
1,100041,1,1,4,Cannot Lose Them,High-value customers at risk of churning,"Immediate intervention, VIP treatment",2025-09-05 20:13:32
2,100155,1,1,1,Lost Customers,Customers who haven't purchased in long time,Win-back campaigns with strong incentives,2025-09-05 20:13:32
3,100178,1,1,1,Lost Customers,Customers who haven't purchased in long time,Win-back campaigns with strong incentives,2025-09-05 20:13:32
4,100184,1,2,5,Cannot Lose Them,High-value customers at risk of churning,"Immediate intervention, VIP treatment",2025-09-05 20:13:32



--- Table: seasonal_trends ---
Rows: 12
Columns:
  period_type (TEXT) (PRIMARY KEY)
  period_value (TEXT) (PRIMARY KEY)
  avg_sales (REAL)
  avg_orders (INTEGER)
  avg_customers (INTEGER)
  seasonal_index (REAL)
  trend_direction (TEXT)
  created_at (TIMESTAMP)

Sample data:


Unnamed: 0,period_type,period_value,avg_sales,avg_orders,avg_customers,seasonal_index,trend_direction,created_at
0,monthly,1,293816.97,1771,1752,1.007,stable,2025-09-05 20:13:32
1,monthly,2,268193.16,1597,1586,0.919,negative,2025-09-05 20:13:32
2,monthly,3,289289.96,1749,1735,0.992,stable,2025-09-05 20:13:32
3,monthly,4,289618.86,1750,1731,0.993,stable,2025-09-05 20:13:32
4,monthly,5,313485.53,1833,1812,1.075,positive,2025-09-05 20:13:32


## Metadata Database (Pipeline Metadata)
Pipeline execution logs and data quality tracking

In [7]:
# List every table in the metadata database: row count, columns (primary-key
# columns flagged) and a small data sample for non-empty tables.
conn = connect_to_database('metadata')
if conn is not None:
    # try/finally releases the connection even if a query or display call
    # raises part-way through the listing.
    try:
        tables = get_tables(conn)
        print(f"Tables in metadata database: {len(tables)}")

        for table in tables:
            print(f"\n--- Table: {table} ---")
            columns, row_count = get_table_info(conn, table)
            print(f"Rows: {row_count:,}")

            print("Columns:")
            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            for _cid, col_name, col_type, _notnull, _default, pk_position in columns:
                pk = " (PRIMARY KEY)" if pk_position else ""
                print(f"  {col_name} ({col_type}){pk}")

            if row_count > 0:
                print("\nSample data:")
                sample = get_sample_data(conn, table)
                display(sample)
    finally:
        conn.close()
else:
    print("Metadata database not found")

Tables in metadata database: 3

--- Table: pipeline_runs ---
Rows: 432
Columns:
  run_id (TEXT) (PRIMARY KEY)
  layer (TEXT)
  table_name (TEXT)
  status (TEXT)
  start_time (TIMESTAMP)
  end_time (TIMESTAMP)
  row_count (INTEGER)
  error_message (TEXT)

Sample data:


Unnamed: 0,run_id,layer,table_name,status,start_time,end_time,row_count,error_message
0,STAGING_stg_sales_raw_20250905_073224_743576,STAGING,stg_sales_raw,STARTED,2025-09-05 07:32:24.743583,,,
1,STAGING_stg_sales_cleaned_20250905_073224_828677,STAGING,stg_sales_cleaned,STARTED,2025-09-05 07:32:24.828680,,,
2,WAREHOUSE_dim_date_20250905_073224_860627,WAREHOUSE,dim_date,STARTED,2025-09-05 07:32:24.860633,,,
3,WAREHOUSE_dim_customer_20250905_073224_974817,WAREHOUSE,dim_customer,STARTED,2025-09-05 07:32:24.974822,,,
4,WAREHOUSE_dim_order_20250905_073225_084694,WAREHOUSE,dim_order,STARTED,2025-09-05 07:32:25.084703,,,



--- Table: data_quality_checks ---
Rows: 688
Columns:
  check_id (TEXT) (PRIMARY KEY)
  run_id (TEXT)
  table_name (TEXT)
  check_type (TEXT)
  check_name (TEXT)
  expected_value (TEXT)
  actual_value (TEXT)
  status (TEXT)
  error_details (TEXT)
  check_time (TIMESTAMP)

Sample data:


Unnamed: 0,check_id,run_id,table_name,check_type,check_name,expected_value,actual_value,status,error_details,check_time
0,STAGING_stg_sales_raw_20250905_073224_743576_m...,STAGING_stg_sales_raw_20250905_073224_743576,stg_sales_raw,ROW_COUNT,min_rows_check,1000,40000,PASSED,,2025-09-05 07:32:24.813365
1,STAGING_stg_sales_raw_20250905_073224_743576_c...,STAGING_stg_sales_raw_20250905_073224_743576,stg_sales_raw,NULL_CHECK,customer_id_null_check,<=0.0%,0.00%,PASSED,,2025-09-05 07:32:24.814761
2,STAGING_stg_sales_raw_20250905_073224_743576_o...,STAGING_stg_sales_raw_20250905_073224_743576,stg_sales_raw,NULL_CHECK,order_id_null_check,<=0.0%,0.00%,PASSED,,2025-09-05 07:32:24.816345
3,STAGING_stg_sales_raw_20250905_073224_743576_s...,STAGING_stg_sales_raw_20250905_073224_743576,stg_sales_raw,NULL_CHECK,sales_null_check,<=0.0%,0.00%,PASSED,,2025-09-05 07:32:24.817772
4,STAGING_stg_sales_raw_20250905_073224_743576_c...,STAGING_stg_sales_raw_20250905_073224_743576,stg_sales_raw,UNIQUENESS,customer_id_unique_check,>=1000,33477,PASSED,,2025-09-05 07:32:24.828170



--- Table: table_lineage ---
Rows: 0
Columns:
  lineage_id (TEXT) (PRIMARY KEY)
  source_table (TEXT)
  target_table (TEXT)
  transformation_type (TEXT)
  created_at (TIMESTAMP)


## Database Summary
Overview of all tables and row counts

In [8]:
# Build one summary record (database, table, row count, column count) per
# table across all configured databases, then show the totals and the table.
summary_data = []

for db_name in databases:
    # connect_to_database returns None for a missing file, so databases that
    # are not present on disk are skipped.
    conn = connect_to_database(db_name)
    if conn is None:
        continue

    # try/finally releases the connection even if a query raises.
    try:
        tables = get_tables(conn)

        for table in tables:
            columns, row_count = get_table_info(conn, table)
            summary_data.append({
                'Database': db_name,
                'Table': table,
                'Rows': row_count,
                'Columns': len(columns)
            })
    finally:
        conn.close()

# Derived from the collected records so the total always matches the summary.
total_rows = sum(entry['Rows'] for entry in summary_data)

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    print(f"Total tables: {len(summary_df)}")
    print(f"Total rows across all tables: {total_rows:,}")
    print("\nDetailed summary:")
    display(summary_df)
else:
    print("No tables found in any database")

Total tables: 18
Total rows across all tables: 291,376

Detailed summary:


Unnamed: 0,Database,Table,Rows,Columns
0,staging,stg_sales_cleaned,40000,6
1,staging,stg_sales_raw,40000,7
2,warehouse,dim_date,685,24
3,warehouse,dim_customer,33477,17
4,warehouse,dim_order,40000,17
5,warehouse,fact_sales,40000,6
6,business,monthly_metrics,23,8
7,business,cohort_analysis,276,8
8,business,customer_ltv_analysis,33477,9
9,business,campaign_targets,28753,7
