# Planilha Maia Query

This notebook queries the `gold.planilha_maia` table to investigate the results of the column mapping and data transformation.

## Table Queried:
- **Huntington Data Lake** (huntington_data_lake.duckdb):
  - gold.planilha_maia (mapped columns from gold.planilha_embryoscope_combined)


In [None]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Set each listed pandas display option to None so no fixed row/column/width
# cap is applied when frames are rendered in this notebook.
for _display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

## Database Connection

In [None]:
def get_huntington_connection():
    """Open a read-only DuckDB connection to the Huntington data lake file.

    The database path is relative to the current working directory: two
    levels up, then ``database/huntington_data_lake.duckdb``.

    Returns:
        The connection object produced by ``duckdb.connect``. Closing it is
        left to the caller.
    """
    # NOTE(review): the relative path assumes the notebook runs from its own
    # explore/ subdirectory -- confirm the working directory if this fails.
    path_parts = ('..', '..', 'database', 'huntington_data_lake.duckdb')
    db_file = os.path.join(*path_parts)
    connection = duckdb.connect(db_file, read_only=True)
    return connection

## Query Functions

In [None]:
def get_table_info(conn):
    """Print row, column and NULL statistics for ``gold.planilha_maia``.

    Prints the total row count, the numbered column names, and for every
    column the number and percentage of NULL values (one query per column).

    Args:
        conn: Open connection whose ``execute(sql)`` result exposes
            ``fetchone()`` and ``df()`` (a DuckDB connection in this
            notebook).

    Returns:
        The ``DESCRIBE gold.planilha_maia`` result as a DataFrame, or
        ``None`` if any query failed. Failures are printed rather than
        raised so the rest of the notebook can keep running.
    """
    print("\n=== TABLE INFORMATION ===")

    try:
        # Row count
        row_count = conn.execute("SELECT COUNT(*) FROM gold.planilha_maia").fetchone()[0]
        print(f"Total rows: {row_count:,}")

        # Column count and names
        col_info = conn.execute("DESCRIBE gold.planilha_maia").df()
        print(f"Total columns: {len(col_info)}")
        print("\nColumn names:")
        for i, col in enumerate(col_info['column_name'], 1):
            print(f"  {i:2d}. {col}")

        # Check for NULL values in each column
        print("\nNULL value counts:")
        for col in col_info['column_name']:
            # Double any embedded double quote so a column name containing
            # '"' still forms a valid quoted SQL identifier.
            quoted_col = '"' + col.replace('"', '""') + '"'
            null_count = conn.execute(
                f'SELECT COUNT(*) FROM gold.planilha_maia WHERE {quoted_col} IS NULL'
            ).fetchone()[0]
            # Guard against division by zero on an empty table.
            null_pct = (null_count / row_count * 100) if row_count > 0 else 0
            print(f"  {col:35s}: {null_count:6,} ({null_pct:5.1f}%)")

        return col_info
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Error getting table info: {e}")
        return None

In [None]:
def query_sample_data(conn, limit=100):
    """Fetch up to ``limit`` rows of ``gold.planilha_maia`` as a DataFrame.

    Args:
        conn: Open connection whose ``execute(sql)`` result exposes ``df()``.
        limit: Maximum number of rows to request; interpolated into the
            ``LIMIT`` clause as-is.

    Returns:
        The DataFrame produced by the query. A header line and the number of
        rows actually returned are printed as a side effect.
    """
    header = f"\n=== SAMPLE DATA (LIMIT {limit}) ==="
    print(header)

    sql = f"""
    SELECT * FROM gold.planilha_maia
    LIMIT {limit}
    """
    result = conn.execute(sql)
    sample = result.df()

    shown = len(sample)
    print(f"Showing first {shown} records")
    return sample

## Execution

In [None]:
# Open the read-only data-lake connection, print the table statistics, then
# fetch and render a 100-row sample.
conn = get_huntington_connection()
try:
    # Prints row/column/NULL statistics; the returned DataFrame is not kept.
    get_table_info(conn)
    df_sample = query_sample_data(conn, limit=100)
    # NOTE(review): `display` is not imported in this notebook -- presumably
    # the IPython/Jupyter built-in; confirm if run as a plain script.
    display(df_sample)
finally:
    # Always release the database handle, even if a query above raised.
    conn.close()