# Data Ploidia Query

This notebook queries the `gold.data_ploidia` table to investigate the results of the column mapping and filtering.

## Table Queried:
- **Huntington Data Lake** (huntington_data_lake.duckdb):
  - gold.data_ploidia (mapped and filtered columns from gold.planilha_embryoscope_combined)


In [1]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Lift pandas' display limits (column count, row count, total width, cell text width)
for _display_option in ('max_columns', 'max_rows', 'width', 'max_colwidth'):
    pd.set_option(f'display.{_display_option}', None)


## Database Connection


In [2]:
def get_huntington_connection():
    """Open the huntington_data_lake DuckDB file in read-only mode.

    The file is located relative to the current working directory:
    two directory levels up, then ``database/huntington_data_lake.duckdb``.

    Returns:
        The connection object returned by ``duckdb.connect``. The callers in
        this notebook close it themselves once they are done.
    """
    path_parts = (os.pardir, os.pardir, 'database', 'huntington_data_lake.duckdb')
    db_file = os.path.join(*path_parts)
    connection = duckdb.connect(db_file, read_only=True)
    return connection


## Query Functions


In [3]:
def query_data_ploidia_all(conn):
    """Fetch every row of gold.data_ploidia.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        pandas.DataFrame: All rows, ordered by "Patient ID" then "Video ID".
    """
    print("\n=== GOLD.DATA_PLOIDIA (ALL DATA) ===")
    all_rows_sql = """
    SELECT * FROM gold.data_ploidia
    ORDER BY "Patient ID", "Video ID"
    """

    result = conn.execute(all_rows_sql)
    frame = result.df()
    row_total = len(frame)
    print(f"Found {row_total} records in gold.data_ploidia")
    return frame

def query_data_ploidia_by_patient(conn, patient_id):
    """Query gold.data_ploidia for a specific patient ID.

    The ID is passed to the database as a bound parameter rather than being
    formatted into the SQL text, so the value can never be interpreted as SQL.

    Args:
        conn: Open DuckDB connection.
        patient_id (int): Value compared for equality with the "Patient ID" column.

    Returns:
        pandas.DataFrame: Matching rows ordered by "Video ID".
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (PATIENT ID: {patient_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Patient ID" = ?
    ORDER BY "Video ID"
    """

    # Normalise numpy-style scalars (e.g. an ID read out of a DataFrame) to a
    # plain Python value so binding does not depend on driver support for them.
    bound_id = patient_id.item() if hasattr(patient_id, "item") else patient_id

    df = conn.execute(query, [bound_id]).df()
    print(f"Found {len(df)} records for Patient ID {patient_id}")
    return df

def query_data_ploidia_by_embryo(conn, embryo_id):
    """Query gold.data_ploidia for a specific embryo ID.

    A row matches when either its "Embryo ID" or its "Video ID" equals
    ``embryo_id``. The value is bound as a query parameter, so IDs that
    contain quotes are matched literally instead of breaking the SQL.

    Args:
        conn: Open DuckDB connection.
        embryo_id (str): Value compared with "Embryo ID" and "Video ID".

    Returns:
        pandas.DataFrame: Matching rows (no explicit ordering).
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (EMBRYO ID: {embryo_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Embryo ID" = ? OR "Video ID" = ?
    """

    # One bound value per placeholder, in order of appearance
    df = conn.execute(query, [embryo_id, embryo_id]).df()
    print(f"Found {len(df)} records for Embryo ID {embryo_id}")
    return df

def query_data_ploidia_by_unidade(conn, unidade):
    """Query gold.data_ploidia for a specific unidade.

    The unidade name is bound as a query parameter, so names containing an
    apostrophe are matched literally instead of breaking the SQL.

    Args:
        conn: Open DuckDB connection.
        unidade (str): Value compared for equality with the "Unidade" column.

    Returns:
        pandas.DataFrame: Matching rows ordered by "Patient ID" then "Video ID".
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (UNIDADE: {unidade}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Unidade" = ?
    ORDER BY "Patient ID", "Video ID"
    """

    df = conn.execute(query, [unidade]).df()
    print(f"Found {len(df)} records for Unidade {unidade}")
    return df

def get_table_info(conn):
    """Print and return basic statistics about the gold.data_ploidia table.

    Prints the total row count, a numbered list of column names and, for each
    column, the number and percentage of NULL values.

    Args:
        conn: Open DuckDB connection.

    Returns:
        pandas.DataFrame: The result of ``DESCRIBE gold.data_ploidia``.
    """
    print("\n=== TABLE INFORMATION ===")

    # Row count
    row_count = conn.execute("SELECT COUNT(*) FROM gold.data_ploidia").fetchone()[0]
    print(f"Total rows: {row_count:,}")

    # Column count and names
    col_info = conn.execute("DESCRIBE gold.data_ploidia").df()
    column_names = list(col_info['column_name'])
    print(f"Total columns: {len(col_info)}")
    print("\nColumn names:")
    for i, col in enumerate(column_names, 1):
        print(f"  {i:2d}. {col}")

    # NULL counts for all columns in a single table scan instead of one query
    # per column. COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the NULL
    # count. Embedded double quotes are doubled so any column name stays a
    # valid quoted identifier.
    print("\nNULL value counts:")
    if column_names:
        null_exprs = ", ".join(
            'COUNT(*) - COUNT("{}")'.format(col.replace('"', '""'))
            for col in column_names
        )
        null_counts = conn.execute(
            f"SELECT {null_exprs} FROM gold.data_ploidia"
        ).fetchone()
        for col, null_count in zip(column_names, null_counts):
            # Guard against division by zero on an empty table
            null_pct = (null_count / row_count * 100) if row_count > 0 else 0
            print(f"  {col:35s}: {null_count:6,} ({null_pct:5.1f}%)")

    return col_info


## Main Query Function


In [4]:
def query_data_ploidia(patient_id=None, embryo_id=None, unidade=None, show_all=False, limit=None):
    """
    Query data_ploidia table with various filters.

    Only ONE filter is applied per call. When several are given, the first
    match in this order wins: ``show_all``, ``patient_id``, ``embryo_id``,
    ``unidade``. With no filter at all, the table is returned ordered by
    "Patient ID" DESC, "Age" DESC, "Well" ASC.

    A connection is opened for the call and always closed before returning,
    including when the query raises.

    Args:
        patient_id (int, optional): Filter by Patient ID. Any value other than
            None counts as a filter, so an ID of 0 is honoured.
        embryo_id (str, optional): Filter by Embryo ID or Video ID
        unidade (str, optional): Filter by Unidade
        show_all (bool): If True, return all data
        limit (int, optional): Maximum number of rows for the unfiltered
            default query. None (the default) returns every row, which is
            what this function has always done despite its old
            "first 100 rows" log message. Ignored when a filter is applied.

    Returns:
        pandas.DataFrame: Query results

    Raises:
        ValueError: If ``limit`` is not None and not a non-negative integer.
    """
    # Validate before opening a connection. bool is rejected explicitly
    # because it is a subclass of int.
    if limit is not None:
        if isinstance(limit, bool) or not isinstance(limit, int) or limit < 0:
            raise ValueError("limit must be a non-negative integer or None")

    has_patient_filter = patient_id is not None

    print(f"\n{'='*80}")
    print("QUERYING DATA_PLOIDIA TABLE")
    print(f"Query executed at: {datetime.now()}")
    if has_patient_filter:
        print(f"Filter: Patient ID = {patient_id}")
    if embryo_id:
        print(f"Filter: Embryo ID = {embryo_id}")
    if unidade:
        print(f"Filter: Unidade = {unidade}")
    if show_all:
        print("Filter: All data")
    print(f"{'='*80}")

    # Connect to database
    conn = get_huntington_connection()

    try:
        if show_all:
            df = query_data_ploidia_all(conn)
        elif has_patient_filter:
            df = query_data_ploidia_by_patient(conn, patient_id)
        elif embryo_id:
            df = query_data_ploidia_by_embryo(conn, embryo_id)
        elif unidade:
            df = query_data_ploidia_by_unidade(conn, unidade)
        else:
            # Default: unfiltered table, optionally capped at `limit` rows
            query = """
            SELECT * FROM gold.data_ploidia
            ORDER BY "Patient ID" DESC, "Age" DESC, "Well" ASC
            """
            if limit is None:
                print("\n=== GOLD.DATA_PLOIDIA (ALL ROWS, NO FILTER) ===")
            else:
                print(f"\n=== GOLD.DATA_PLOIDIA (FIRST {limit} ROWS) ===")
                # Safe to format in: limit was validated as a plain int above
                query += f"LIMIT {limit}\n"
            df = conn.execute(query).df()
            if limit is None:
                print(f"Found {len(df)} records in gold.data_ploidia")
            else:
                print(f"Showing first {len(df)} records")

        return df

    finally:
        # Close connection
        conn.close()
        print("\nDatabase connection closed.")


## Table Information


In [5]:
# Print the table summary; leaving the `with` block closes the connection,
# whether or not get_table_info raised
with get_huntington_connection() as conn:
    col_info = get_table_info(conn)



=== TABLE INFORMATION ===
Total rows: 10,877
Total columns: 39

Column names:
   1. Unidade
   2. Video ID
   3. Age
   4. BMI
   5. Birth Year
   6. Diagnosis
   7. Patient Comments
   8. Patient ID
   9. Previus ET
  10. Previus OD ET
  11. Oocyte History
  12. Oocyte Source
  13. Oocytes Aspirated
  14. Slide ID
  15. Well
  16. Embryo ID
  17. t2
  18. t3
  19. t4
  20. t5
  21. t8
  22. tB
  23. tEB
  24. tHB
  25. tM
  26. tPNa
  27. tPNf
  28. tSB
  29. tSC
  30. Frag-2 Cat. - Value
  31. Fragmentation - Value
  32. ICM - Value
  33. MN-2 Type - Value
  34. MN-2 Cells - Value
  35. PN - Value
  36. Pulsing - Value
  37. Re-exp Count - Value
  38. TE - Value
  39. Embryo Description

NULL value counts:
  Unidade                            :      0 (  0.0%)
  Video ID                           :      0 (  0.0%)
  Age                                :     82 (  0.8%)
  BMI                                :  6,340 ( 58.3%)
  Birth Year                         :     82 (  0.8%)
  Diag

## Example Queries

### 1. Default Query (All Rows Fetched, First 10 Displayed)


In [6]:
# Run the default (unfiltered) query. By default it has no LIMIT, so df_all
# holds the whole table; only the first 10 rows are displayed here.
df_all = query_data_ploidia()
display(df_all.head(10))



QUERYING DATA_PLOIDIA TABLE
Query executed at: 2026-01-22 10:50:20.634970

=== GOLD.DATA_PLOIDIA (FIRST 100 ROWS) ===
Showing first 100 records

Database connection closed.


Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description
0,Vila Mariana,904254_D2025.11.23_S03920_I3253_P-1,42.15,22.32,1983,Fator feminino anatômico,,904254,0.0,,Fresco,Homólogo,13,D2025.11.23_S03920_I3253_P-1,1,AA1,29.4,40.8,41.0,41.2,55.6,,,,,,27.1,,,0-10%,,,Multi,2,2,,,,"ANEUPLOIDE -10, +14, -18"
1,Vila Mariana,895789_D2025.08.29_S03735_I3253_P-2,43.07,26.35,1982,,,895789,0.0,,Fresco,Homólogo,11,D2025.08.29_S03735_I3253_P-2,2,AA2,22.9,33.6,41.5,44.0,60.5,,,,,,21.0,,,0-10%,,,Multi,2,2,,,,"ANEUPLOIDE +15, -18, -20, -22 XY"
2,Vila Mariana,895789_D2025.08.29_S03735_I3253_P-4,43.07,26.35,1982,,,895789,0.0,,Fresco,Homólogo,11,D2025.08.29_S03735_I3253_P-4,4,AA4,24.0,35.6,35.7,49.2,50.8,,,,,,22.0,,,0-10%,,,Multi,2,2,,,,SEM ANALISE
3,Vila Mariana,895789_D2025.08.29_S03735_I3253_P-5,43.07,26.35,1982,,,895789,0.0,,Fresco,Homólogo,11,D2025.08.29_S03735_I3253_P-5,5,AA5,24.1,37.1,38.5,53.6,58.3,,,,,,22.2,102.0,,0-10%,,,Mono,2,2,,,,SEM ANALISE
4,Vila Mariana,895789_D2025.08.29_S03735_I3253_P-6,43.07,26.35,1982,,,895789,0.0,,Fresco,Homólogo,11,D2025.08.29_S03735_I3253_P-6,6,AA6,22.6,34.0,35.6,51.8,60.5,105.2,,,,,20.7,93.8,,0-10%,,,Mono,2,2,,,,"ANEUPLOIDE +2, +5, +16 XY"
5,Vila Mariana,895368_D2025.08.23_S03724_I3253_P-1,40.83,,1984,,,895368,,,Fresco,,4,D2025.08.23_S03724_I3253_P-1,1,AA1,29.5,40.5,41.1,57.0,59.9,103.7,113.0,120.9,,,27.2,93.9,,10-20%,,B,Bi,1,2,Yes,,C,ANEUPLOIDE -19 XY
6,Vila Mariana,895133_D2025.08.22_S03720_I3253_P-1,43.46,22.14,1982,Insuficiência ovariana,,895133,0.0,,Descongelado OR,Heterólogo,8,D2025.08.22_S03720_I3253_P-1,1,AA1,25.4,37.4,37.8,48.7,67.5,112.0,116.0,118.9,,,22.1,105.9,,0-10%,,A,Mono,0,2,No,,A,EUPLOIDE XY
7,Vila Mariana,895133_D2025.08.22_S03720_I3253_P-2,43.46,22.14,1982,Insuficiência ovariana,,895133,0.0,,Descongelado OR,Heterólogo,8,D2025.08.22_S03720_I3253_P-2,2,AA2,29.3,42.2,43.2,55.2,59.3,102.1,108.6,,,,26.7,95.8,,0-10%,,A,Mono,0,2,No,,A,"ANEUPLOIDE -7, XY"
8,Vila Mariana,895133_D2025.08.22_S03720_I3253_P-3,43.46,22.14,1982,Insuficiência ovariana,,895133,0.0,,Descongelado OR,Heterólogo,8,D2025.08.22_S03720_I3253_P-3,3,AA3,31.2,43.8,45.1,46.6,88.2,146.7,152.5,163.7,,,29.1,135.5,,0-10%,,B,Bi,1,2,Yes,,B,ANORMALIDADE MULTIPLAS XXX
9,Vila Mariana,895133_D2025.08.22_S03720_I3253_P-4,43.46,22.14,1982,Insuficiência ovariana,,895133,0.0,,Descongelado OR,Heterólogo,8,D2025.08.22_S03720_I3253_P-4,4,AA4,26.5,38.0,38.4,51.7,52.9,95.6,104.4,115.5,,,23.3,88.6,,0-10%,,A,Mono,0,2,No,,A,EUPLOIDE XY


In [7]:
# Inspect one patient by filtering the frame already loaded into df_all
patient_id = 107805
display(df_all.loc[df_all['Patient ID'].eq(patient_id)])


Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description
10853,Ibirapuera,107805_D2025.01.19_S03985_I3166_P-1,38.41,22.86,1986,Fator masculino,,107805,0.0,,Fresco,Homólogo,10,D2025.01.19_S03985_I3166_P-1,1,AB1,21.8,32.3,32.5,33.4,52.3,99.0,,,,,19.0,90.3,,10-20%,,A,Mono,0,2,,,B,"Aneuploid +11q, -15, XY"
10854,Ibirapuera,107805_D2025.01.19_S03985_I3166_P-4,38.41,22.86,1986,Fator masculino,,107805,0.0,,Fresco,Homólogo,10,D2025.01.19_S03985_I3166_P-4,4,AB4,24.6,35.5,35.7,50.3,53.4,126.9,,,,,21.6,109.8,,10-20%,,A,Mono,0,2,,,A,"Euploid balanced, XY"
10855,Ibirapuera,107805_D2025.01.19_S03985_I3166_P-6,38.41,22.86,1986,Fator masculino,,107805,0.0,,Fresco,Homólogo,10,D2025.01.19_S03985_I3166_P-6,6,AB6,21.8,32.1,32.5,42.9,64.4,104.7,,,,,19.0,96.9,,0-10%,,A,Mono,0,2,,,B,"Aneuploid +11q, -12, -22, XX"
10856,Ibirapuera,107805_D2025.01.19_S03985_I3166_P-7,38.41,22.86,1986,Fator masculino,,107805,0.0,,Fresco,Homólogo,10,D2025.01.19_S03985_I3166_P-7,7,AB7,23.5,33.2,34.9,47.3,53.2,107.9,,,,,20.6,94.7,,10-20%,,A,Mono,0,2,,,A,"Aneuploid -13, -16,+17p, XY"
10857,Ibirapuera,107805_D2025.01.19_S03985_I3166_P-9,38.41,22.86,1986,Fator masculino,,107805,0.0,,Fresco,Homólogo,10,D2025.01.19_S03985_I3166_P-9,9,AB9,25.3,36.4,51.4,51.6,69.3,102.9,,,,,22.9,92.2,,20-50%,,B,Multi,2,2,,,B,"Aneuploid -1, XX"
10858,Ibirapuera,107805_D2024.07.27_S03829_I3027_P-2,37.93,,1986,,,107805,,,Fresco,,22,D2024.07.27_S03829_I3027_P-2,2,AA2,26.4,38.0,39.3,51.8,74.9,108.3,,,,,24.4,96.8,,10-20%,,B,Bi,1,2,,,B,Complex aneuploid /unbalanced monossomy 21 +16
10859,Ibirapuera,107805_D2024.07.27_S03829_I3027_P-5,37.93,,1986,,,107805,,,Fresco,,22,D2024.07.27_S03829_I3027_P-5,5,AA5,28.5,38.9,39.7,52.0,52.8,104.4,,,,,25.3,94.6,,0-10%,,A,Multi,2,2,,,B,Complex aneuploid /unbalanced monosomy 22
10860,Ibirapuera,107805_D2024.07.27_S03829_I3027_P-6,37.93,,1986,,,107805,,,Fresco,,22,D2024.07.27_S03829_I3027_P-6,6,AA6,26.1,37.6,37.6,50.1,54.5,144.3,,,,,23.3,129.8,,0-10%,,C,Mono,0,2,,,C,"Complex aneuploid /unbalanced monosomy 14,16 and 22"
10861,Ibirapuera,107805_D2024.07.27_S03829_I3027_P-10,37.93,,1986,,,107805,,,Fresco,,22,D2024.07.27_S03829_I3027_P-10,10,AA10,25.5,37.6,40.1,41.5,84.7,123.9,,,,,23.1,113.5,,0-10%,,C,Mono,0,2,,,C,"Complex aneuploid /unbalanced monosomy 14,22, +21"
10862,Ibirapuera,107805_D2024.07.27_S03829_I3027_P-11,37.93,,1986,,,107805,,,Fresco,,22,D2024.07.27_S03829_I3027_P-11,11,AA11,22.6,34.3,35.6,48.3,54.3,102.7,,,,,20.2,93.1,,20-50%,,A,Mono,0,2,,,B,Euploid/ balanced
