# Data Ploidia Query

This notebook queries the `gold.data_ploidia` table to investigate the results of the column mapping and filtering.

## Table Queried:
- **Huntington Data Lake** (huntington_data_lake.duckdb):
  - gold.data_ploidia (mapped and filtered columns from gold.planilha_embryoscope_combined)


In [101]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Lift every pandas display cap so query results are rendered in full
# (no column/row truncation, auto-detected width, untruncated cell text).
for _display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)


## Database Connection


In [102]:
def get_huntington_connection():
    """Open the huntington_data_lake DuckDB file in read-only mode.

    The database path is relative (two directories up, then
    ``database/huntington_data_lake.duckdb``), so it is resolved against
    the current working directory of the running kernel.

    Returns:
        The connection object returned by ``duckdb.connect``.
    """
    path_parts = ('..', '..', 'database', 'huntington_data_lake.duckdb')
    db_file = os.path.join(*path_parts)
    connection = duckdb.connect(db_file, read_only=True)
    return connection


## Query Functions


In [103]:
def query_data_ploidia_all(conn):
    """Fetch every row of gold.data_ploidia.

    Rows are ordered by "Patient ID" and then "Video ID". A banner and the
    number of records found are printed as a side effect.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        The result of ``.df()`` on the executed query (a pandas DataFrame
        when ``conn`` is a DuckDB connection).
    """
    print("\n=== GOLD.DATA_PLOIDIA (ALL DATA) ===")
    sql = """
    SELECT * FROM gold.data_ploidia
    ORDER BY "Patient ID", "Video ID"
    """

    result = conn.execute(sql).df()
    record_total = len(result)
    print(f"Found {record_total} records in gold.data_ploidia")
    return result

def query_data_ploidia_by_patient(conn, patient_id):
    """Query gold.data_ploidia for a specific patient ID.

    The patient ID is passed as a bound query parameter instead of being
    interpolated into the SQL text, so arbitrary input can neither break
    the statement nor inject SQL.

    Args:
        conn: Open database connection exposing
            ``execute(sql, parameters).df()``.
        patient_id: Value compared against the "Patient ID" column.

    Returns:
        The result of ``.df()`` on the executed query, ordered by
        "Video ID" (a pandas DataFrame for a DuckDB connection).
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (PATIENT ID: {patient_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Patient ID" = ?
    ORDER BY "Video ID"
    """

    # Bind the value through the driver rather than formatting it into SQL.
    df = conn.execute(query, [patient_id]).df()
    print(f"Found {len(df)} records for Patient ID {patient_id}")
    return df

def query_data_ploidia_by_embryo(conn, embryo_id):
    """Query gold.data_ploidia for a specific embryo ID.

    A row matches when either its "Embryo ID" or its "Video ID" equals
    ``embryo_id``. The value is passed as bound query parameters, so IDs
    containing quotes or other SQL metacharacters are handled safely.

    Args:
        conn: Open database connection exposing
            ``execute(sql, parameters).df()``.
        embryo_id: Value compared against "Embryo ID" and "Video ID".

    Returns:
        The result of ``.df()`` on the executed query (a pandas DataFrame
        for a DuckDB connection).
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (EMBRYO ID: {embryo_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Embryo ID" = ? OR "Video ID" = ?
    """

    # One positional placeholder per comparison, both bound to the same value.
    df = conn.execute(query, [embryo_id, embryo_id]).df()
    print(f"Found {len(df)} records for Embryo ID {embryo_id}")
    return df

def query_data_ploidia_by_unidade(conn, unidade):
    """Query gold.data_ploidia for a specific unidade.

    The unidade is passed as a bound query parameter instead of being
    interpolated into the SQL text, so names containing quotes cannot
    break the statement or inject SQL.

    Args:
        conn: Open database connection exposing
            ``execute(sql, parameters).df()``.
        unidade: Value compared against the "Unidade" column.

    Returns:
        The result of ``.df()`` on the executed query, ordered by
        "Patient ID" then "Video ID" (a pandas DataFrame for a DuckDB
        connection).
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (UNIDADE: {unidade}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Unidade" = ?
    ORDER BY "Patient ID", "Video ID"
    """

    # Bind the value through the driver rather than formatting it into SQL.
    df = conn.execute(query, [unidade]).df()
    print(f"Found {len(df)} records for Unidade {unidade}")
    return df

def get_table_info(conn):
    """Print basic statistics about gold.data_ploidia and return its schema.

    Prints the total row count, the numbered list of column names, and the
    NULL count and percentage for every column.

    All NULL counts are computed in a single aggregate query
    (``COUNT(*) - COUNT(col)`` per column) instead of one query per
    column, and column names are escaped as quoted identifiers so a name
    containing a double quote cannot break the statement.

    Args:
        conn: Open database connection exposing ``execute(sql)`` whose
            result supports ``fetchone()`` and ``df()``.

    Returns:
        The result of ``DESCRIBE gold.data_ploidia`` as returned by
        ``.df()`` (must provide a ``column_name`` column).
    """
    print("\n=== TABLE INFORMATION ===")

    # Row count
    row_count = conn.execute("SELECT COUNT(*) FROM gold.data_ploidia").fetchone()[0]
    print(f"Total rows: {row_count:,}")

    # Column count and names
    col_info = conn.execute("DESCRIBE gold.data_ploidia").df()
    column_names = list(col_info['column_name'])
    print(f"Total columns: {len(col_info)}")
    print("\nColumn names:")
    for i, col in enumerate(column_names, 1):
        print(f"  {i:2d}. {col}")

    # Check for NULL values in each column
    print("\nNULL value counts:")
    if column_names:
        # COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the NULL count.
        # Embedded double quotes are doubled to keep the identifier valid.
        null_exprs = ", ".join(
            'COUNT(*) - COUNT("{}")'.format(str(col).replace('"', '""'))
            for col in column_names
        )
        null_counts = conn.execute(
            f"SELECT {null_exprs} FROM gold.data_ploidia"
        ).fetchone()
        for col, null_count in zip(column_names, null_counts):
            null_pct = (null_count / row_count * 100) if row_count > 0 else 0
            print(f"  {col:35s}: {null_count:6,} ({null_pct:5.1f}%)")

    return col_info


## Main Query Function


In [104]:
def query_data_ploidia(patient_id=None, embryo_id=None, unidade=None, show_all=False):
    """
    Query data_ploidia table with various filters.

    Only one filter is applied per call, chosen in this order of
    precedence: ``show_all``, ``patient_id``, ``embryo_id``, ``unidade``.
    When none is supplied, the first 100 rows (ordered by "Patient ID",
    "Video ID") are returned.

    A filter counts as supplied when it is not ``None``, so falsy values
    such as ``patient_id=0`` or an empty string are applied as filters
    instead of silently falling back to the default 100-row query.

    A fresh read-only connection is opened for the call and is always
    closed before returning, even if the query raises.

    Args:
        patient_id (int, optional): Filter by Patient ID
        embryo_id (str, optional): Filter by Embryo ID or Video ID
        unidade (str, optional): Filter by Unidade
        show_all (bool): If True, return all data

    Returns:
        pandas.DataFrame: Query results
    """
    print(f"\n{'='*80}")
    print("QUERYING DATA_PLOIDIA TABLE")
    print(f"Query executed at: {datetime.now()}")
    if patient_id is not None:
        print(f"Filter: Patient ID = {patient_id}")
    if embryo_id is not None:
        print(f"Filter: Embryo ID = {embryo_id}")
    if unidade is not None:
        print(f"Filter: Unidade = {unidade}")
    if show_all:
        print("Filter: All data")
    print(f"{'='*80}")

    # Connect to database
    conn = get_huntington_connection()

    try:
        if show_all:
            df = query_data_ploidia_all(conn)
        elif patient_id is not None:
            df = query_data_ploidia_by_patient(conn, patient_id)
        elif embryo_id is not None:
            df = query_data_ploidia_by_embryo(conn, embryo_id)
        elif unidade is not None:
            df = query_data_ploidia_by_unidade(conn, unidade)
        else:
            # Default: show first 100 rows
            print("\n=== GOLD.DATA_PLOIDIA (FIRST 100 ROWS) ===")
            query = """
            SELECT * FROM gold.data_ploidia
            ORDER BY "Patient ID", "Video ID"
            LIMIT 100
            """
            df = conn.execute(query).df()
            print("Showing first 100 records")

        return df

    finally:
        # Close connection
        conn.close()
        print("\nDatabase connection closed.")


## Table Information


In [105]:
# Get table information
# Open a dedicated read-only connection for the schema/NULL summary; the
# try/finally guarantees it is closed even if get_table_info raises.
conn = get_huntington_connection()
try:
    # col_info keeps the DESCRIBE result returned by get_table_info.
    col_info = get_table_info(conn)
finally:
    conn.close()



=== TABLE INFORMATION ===
Total rows: 10
Total columns: 39

Column names:
   1. Unidade
   2. Video ID
   3. Age
   4. BMI
   5. Birth Year
   6. Diagnosis
   7. Patient Comments
   8. Patient ID
   9. Previus ET
  10. Previus OD ET
  11. Oocyte History
  12. Oocyte Source
  13. Oocytes Aspirated
  14. Slide ID
  15. Well
  16. Embryo ID
  17. t2
  18. t3
  19. t4
  20. t5
  21. t8
  22. tB
  23. tEB
  24. tHB
  25. tM
  26. tPNa
  27. tPNf
  28. tSB
  29. tSC
  30. Frag-2 Cat. - Value
  31. Fragmentation - Value
  32. ICM - Value
  33. MN-2 Type - Value
  34. MN-2 Cells - Value
  35. PN - Value
  36. Pulsing - Value
  37. Re-exp Count - Value
  38. TE - Value
  39. Embryo Description

NULL value counts:
  Unidade                            :      0 (  0.0%)
  Video ID                           :     10 (100.0%)
  Age                                :      0 (  0.0%)
  BMI                                :     10 (100.0%)
  Birth Year                         :      0 (  0.0%)
  Diagnosi

## Example Queries

### 1. Show Default Sample (First 100 Rows)


In [106]:
# Query first 100 rows
# With no filters, query_data_ploidia() takes its default branch (LIMIT 100).
df_all = query_data_ploidia()
# sort_values returns a sorted copy for display; df_all itself is not reordered.
display(df_all.sort_values(by='Well'))



QUERYING DATA_PLOIDIA TABLE
Query executed at: 2025-11-20 15:46:18.819916



=== GOLD.DATA_PLOIDIA (FIRST 100 ROWS) ===
Showing first 100 records

Database connection closed.


Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description
1,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-1,1,AA1,26.3,37.5,38.2,52.9,53.6,104.4,,,,,24.3,98.4,,32.0,,,Mono,0.0,2,,,,
2,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-2,2,AA2,22.0,34.9,38.0,47.0,68.3,98.6,,,,,19.4,92.8,,29.3,,B,Bi,1.0,2,,,B,"Aneuploid -1q, XY"
3,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-3,3,AA3,24.4,35.7,36.1,48.9,67.4,98.2,,,,,22.3,92.9,,30.2,,A,Mono,0.0,2,,,A,"Aneuploid -22, XX"
4,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-4,4,AA4,20.3,31.0,31.8,41.6,48.0,95.2,,,,,18.2,86.4,,26.5,,A,Bi,1.0,2,,,B,"Complex Aneuploid +15,+16, XX"
0,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-5,5,AA5,23.5,33.8,34.3,44.4,48.5,92.9,,,,,21.5,86.0,,28.7,,A,Mono,0.0,2,,,A,Euploid XX
5,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-6,6,AA6,27.8,,,,,,,,,,24.1,,,,,,,,2,,,,
6,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-7,7,AA7,29.5,36.6,40.5,,,,,,,,27.1,,,30.2,,,Mono,0.0,2,,,,
7,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-8,8,AA8,22.7,33.6,34.1,45.2,62.8,92.6,,,,,20.6,83.9,,28.2,,B,Mono,0.0,2,,,B,"Complex Aneuploid -9,+16, XX"
8,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-9,9,AA9,,,,,,,,,,,,,,,,,,,2,,,,
9,Ibirapuera,,38.46,,1985,,,823589,,,Fresco,,12,D2024.02.18_S03435_I3166_P-10,10,AA10,25.3,37.9,38.2,53.1,68.8,131.4,,,,,23.5,120.6,,31.6,,B,Bi,1.0,2,,,B,Euploid XY


### 2. Query by Patient ID


In [107]:
# Example: Query by Patient ID
patient_id = 515863  # Change this to your patient ID

# Uncomment the two lines below to run the lookup and render the result.
# df_patient = query_data_ploidia(patient_id=patient_id)
# display(df_patient)


### 3. Query by Embryo ID / Video ID


In [108]:
# Example: Query by Embryo ID
# The value is matched against both the "Embryo ID" and "Video ID" columns.
embryo_id = "E12345"  # Change this to your embryo ID

# Uncomment the two lines below to run the lookup and render the result.
# df_embryo = query_data_ploidia(embryo_id=embryo_id)
# display(df_embryo)


### 4. Query by Unidade


In [109]:
# Example: Query by Unidade
unidade = "BH"  # Change this to your unidade

# Uncomment the two lines below to run the lookup and render the result.
# df_unidade = query_data_ploidia(unidade=unidade)
# display(df_unidade)


### 5. Data Quality Checks


In [110]:
# # Check data quality - look for columns with all NULL values
# conn = get_huntington_connection()
# try:
#     print("=== COLUMNS WITH ALL NULL VALUES ===")
#     col_info = conn.execute("DESCRIBE gold.data_ploidia").df()
#     row_count = conn.execute("SELECT COUNT(*) FROM gold.data_ploidia").fetchone()[0]
    
#     all_null_cols = []
#     for col in col_info['column_name']:
#         null_count = conn.execute(f'SELECT COUNT(*) FROM gold.data_ploidia WHERE "{col}" IS NULL').fetchone()[0]
#         if null_count == row_count:
#             all_null_cols.append(col)
#             print(f"  {col} - ALL NULL")
    
#     if not all_null_cols:
#         print("  No columns are completely NULL")
#     else:
#         print(f"\nTotal columns with all NULL: {len(all_null_cols)}")
# finally:
#     conn.close()


### 6. Sample Data by Column Groups


In [111]:
# # Display sample data grouped by column types
# conn = get_huntington_connection()
# try:
#     # Basic info columns
#     print("=== BASIC INFO COLUMNS ===")
#     basic_cols = ['Unidade', 'Video ID', 'Age', 'BMI', 'Birth Year', 'Diagnosis', 
#                   'Patient Comments', 'Patient ID', 'Well', 'Embryo ID']
#     query = f"""
#     SELECT {', '.join([f'"{col}"' for col in basic_cols])}
#     FROM gold.data_ploidia
#     LIMIT 10
#     """
#     df_basic = conn.execute(query).df()
#     display(df_basic)
    
#     print("\n=== TIME ANNOTATIONS ===")
#     time_cols = ['t2', 't3', 't4', 't5', 't8', 'tB', 'tEB', 'tHB', 'tM', 'tPNa', 'tPNf', 'tSB', 'tSC']
#     query = f"""
#     SELECT {', '.join([f'"{col}"' for col in time_cols])}
#     FROM gold.data_ploidia
#     LIMIT 10
#     """
#     df_time = conn.execute(query).df()
#     display(df_time)
    
#     print("\n=== ANNOTATION VALUES ===")
#     annot_cols = ['Frag-2 Cat. - Value', 'Fragmentation - Value', 'ICM - Value', 
#                   'MN-2 Type - Value', 'MN-2 Cells - Value', 'PN - Value', 
#                   'Pulsing - Value', 'Re-exp Count - Value', 'TE - Value', 'Embryo Description']
#     query = f"""
#     SELECT {', '.join([f'"{col}"' for col in annot_cols])}
#     FROM gold.data_ploidia
#     LIMIT 10
#     """
#     df_annot = conn.execute(query).df()
#     display(df_annot)
# finally:
#     conn.close()
