# Data Ploidia Query

This notebook queries the `gold.data_ploidia` table to investigate the results of the column mapping and filtering.

## Table Queried:
- **Huntington Data Lake** (huntington_data_lake.duckdb):
  - gold.data_ploidia (mapped and filtered columns from gold.planilha_embryoscope_combined)


In [21]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Remove every pandas display cap so frames render without truncation.
# NOTE: with display.max_rows unset, displaying a large frame prints all rows.
for option_name in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)


## Database Connection


In [22]:
def get_huntington_connection():
    """Open a read-only DuckDB connection to the Huntington data lake file.

    The database path is relative to the current working directory
    (two levels up, then ``database/huntington_data_lake.duckdb``).

    Returns:
        The connection object returned by ``duckdb.connect``; the caller is
        responsible for closing it.
    """
    path_parts = ('..', '..', 'database', 'huntington_data_lake.duckdb')
    return duckdb.connect(os.path.join(*path_parts), read_only=True)


## Query Functions


In [23]:
def query_data_ploidia_all(conn):
    """Fetch every row of gold.data_ploidia, ordered by patient and video.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        The result of ``.df()`` on the executed query (a pandas DataFrame
        when ``conn`` is a DuckDB connection).
    """
    print("\n=== GOLD.DATA_PLOIDIA (ALL DATA) ===")
    all_rows = conn.execute("""
    SELECT * FROM gold.data_ploidia
    ORDER BY "Patient ID", "Video ID"
    """).df()
    print(f"Found {len(all_rows)} records in gold.data_ploidia")
    return all_rows

def query_data_ploidia_by_patient(conn, patient_id):
    """Query gold.data_ploidia for a specific patient ID.

    Args:
        conn: Open database connection exposing ``execute(sql, params).df()``.
        patient_id: Value matched against the "Patient ID" column. It should
            have the column's type (the notebook's entry point documents it
            as an int).

    Returns:
        The matching rows ordered by "Video ID", as returned by ``.df()``.
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (PATIENT ID: {patient_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Patient ID" = ?
    ORDER BY "Video ID"
    """

    # Bind the value as a prepared-statement parameter rather than formatting
    # it into the SQL text, so the input can never be interpreted as SQL.
    df = conn.execute(query, [patient_id]).df()
    print(f"Found {len(df)} records for Patient ID {patient_id}")
    return df

def query_data_ploidia_by_embryo(conn, embryo_id):
    """Query gold.data_ploidia for a specific embryo ID.

    A row matches when either its "Embryo ID" or its "Video ID" equals
    ``embryo_id``.

    Args:
        conn: Open database connection exposing ``execute(sql, params).df()``.
        embryo_id (str): Identifier compared against both columns.

    Returns:
        The matching rows, as returned by ``.df()``.
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (EMBRYO ID: {embryo_id}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Embryo ID" = ? OR "Video ID" = ?
    """

    # Bound parameters keep quotes inside the identifier from breaking (or
    # rewriting) the statement; the value is supplied once per placeholder.
    df = conn.execute(query, [embryo_id, embryo_id]).df()
    print(f"Found {len(df)} records for Embryo ID {embryo_id}")
    return df

def query_data_ploidia_by_unidade(conn, unidade):
    """Query gold.data_ploidia for a specific unidade.

    Args:
        conn: Open database connection exposing ``execute(sql, params).df()``.
        unidade (str): Value matched exactly against the "Unidade" column.

    Returns:
        The matching rows ordered by "Patient ID" then "Video ID", as
        returned by ``.df()``.
    """
    print(f"\n=== GOLD.DATA_PLOIDIA (UNIDADE: {unidade}) ===")
    query = """
    SELECT * FROM gold.data_ploidia
    WHERE "Unidade" = ?
    ORDER BY "Patient ID", "Video ID"
    """

    # Bind the value as a parameter so names containing quotes work and the
    # input can never be interpreted as SQL.
    df = conn.execute(query, [unidade]).df()
    print(f"Found {len(df)} records for Unidade {unidade}")
    return df

def get_table_info(conn):
    """Print basic statistics about gold.data_ploidia and return its schema.

    Prints the total row count, the numbered column names, and the NULL
    count and percentage for every column.

    Args:
        conn: Open database connection exposing ``execute(sql)`` whose result
            supports ``fetchone()`` and ``df()``.

    Returns:
        The ``DESCRIBE gold.data_ploidia`` result as returned by ``.df()``
        (one row per column, including a ``column_name`` field).
    """
    print("\n=== TABLE INFORMATION ===")

    # Row count
    row_count = conn.execute("SELECT COUNT(*) FROM gold.data_ploidia").fetchone()[0]
    print(f"Total rows: {row_count:,}")

    # Column count and names
    col_info = conn.execute("DESCRIBE gold.data_ploidia").df()
    column_names = [str(name) for name in col_info['column_name']]
    print(f"Total columns: {len(column_names)}")
    print("\nColumn names:")
    for i, col in enumerate(column_names, 1):
        print(f"  {i:2d}. {col}")

    # NULL counts for all columns in a single table scan instead of one query
    # per column. COUNT("col") skips NULLs, so COUNT(*) - COUNT("col") is the
    # number of NULLs. Embedded double quotes are doubled so any column name
    # forms a valid quoted identifier.
    print("\nNULL value counts:")
    if column_names:
        null_exprs = ", ".join(
            'COUNT(*) - COUNT("{}")'.format(col.replace('"', '""'))
            for col in column_names
        )
        null_counts = conn.execute(
            f"SELECT {null_exprs} FROM gold.data_ploidia"
        ).fetchone()
        for col, null_count in zip(column_names, null_counts):
            # Guard against division by zero on an empty table
            null_pct = (null_count / row_count * 100) if row_count > 0 else 0
            print(f"  {col:35s}: {null_count:6,} ({null_pct:5.1f}%)")

    return col_info


## Main Query Function


In [24]:
def query_data_ploidia(patient_id=None, embryo_id=None, unidade=None, show_all=False, limit=None):
    """
    Query data_ploidia table with various filters.

    Exactly one query runs per call. When several arguments are supplied the
    first match in this order wins: ``show_all``, ``patient_id``,
    ``embryo_id``, ``unidade``. With no filter at all, the whole table is
    returned ordered by "Patient ID" DESC, "Age" DESC, "Well" ASC, optionally
    capped by ``limit``.

    Args:
        patient_id (int, optional): Filter by Patient ID
        embryo_id (str, optional): Filter by Embryo ID or Video ID
        unidade (str, optional): Filter by Unidade
        show_all (bool): If True, return all data
        limit (int, optional): Maximum number of rows for the unfiltered
            default query. ``None`` (the default) returns every row. Ignored
            when a filter or ``show_all`` is used.

    Returns:
        pandas.DataFrame: Query results

    Raises:
        ValueError: If ``limit`` is negative or cannot be converted to int.
    """
    if limit is not None:
        # Coerce to int before it is placed in the SQL text so that only a
        # plain number can ever reach the statement.
        limit = int(limit)
        if limit < 0:
            raise ValueError(f"limit must be non-negative, got {limit}")

    print(f"\n{'='*80}")
    print("QUERYING DATA_PLOIDIA TABLE")
    print(f"Query executed at: {datetime.now()}")
    # Report only the filter that is actually applied below.
    if show_all:
        print("Filter: All data")
    elif patient_id is not None:
        print(f"Filter: Patient ID = {patient_id}")
    elif embryo_id:
        print(f"Filter: Embryo ID = {embryo_id}")
    elif unidade:
        print(f"Filter: Unidade = {unidade}")
    print(f"{'='*80}")

    # Connect to database
    conn = get_huntington_connection()

    try:
        if show_all:
            df = query_data_ploidia_all(conn)
        elif patient_id is not None:
            # `is not None` so that a patient ID of 0 is still treated as a filter
            df = query_data_ploidia_by_patient(conn, patient_id)
        elif embryo_id:
            df = query_data_ploidia_by_embryo(conn, embryo_id)
        elif unidade:
            df = query_data_ploidia_by_unidade(conn, unidade)
        else:
            # Default: unfiltered query. It returns every row unless `limit`
            # is given, so the banner states what is really returned.
            if limit is None:
                print("\n=== GOLD.DATA_PLOIDIA (ALL ROWS) ===")
            else:
                print(f"\n=== GOLD.DATA_PLOIDIA (FIRST {limit} ROWS) ===")
            query = """
            SELECT * FROM gold.data_ploidia
            ORDER BY "Patient ID" DESC, "Age" DESC, "Well" ASC
            """
            if limit is not None:
                query += f"LIMIT {limit}\n"
            df = conn.execute(query).df()
            print(f"Returned {len(df)} records")

        return df

    finally:
        # Close connection
        conn.close()
        print("\nDatabase connection closed.")


## Table Information


In [25]:
# Print the table statistics; the connection is closed when the block exits
with get_huntington_connection() as conn:
    col_info = get_table_info(conn)



=== TABLE INFORMATION ===
Total rows: 83,638
Total columns: 49

Column names:
   1. Unidade
   2. Video ID
   3. Age
   4. BMI
   5. Birth Year
   6. Diagnosis
   7. Patient Comments
   8. Patient ID
   9. Previus ET
  10. Previus OD ET
  11. Oocyte History
  12. Oocyte Source
  13. Oocytes Aspirated
  14. Slide ID
  15. Well
  16. Embryo ID
  17. t2
  18. t3
  19. t4
  20. t5
  21. t8
  22. tB
  23. tEB
  24. tHB
  25. tM
  26. tPNa
  27. tPNf
  28. tSB
  29. tSC
  30. Frag-2 Cat. - Value
  31. Fragmentation - Value
  32. ICM - Value
  33. MN-2 Type - Value
  34. MN-2 Cells - Value
  35. PN - Value
  36. Pulsing - Value
  37. Re-exp Count - Value
  38. TE - Value
  39. Embryo Description
  40. Embryo Description Clinisys
  41. Embryo Description Clinisys Detalhes
  42. outcome_type
  43. merged_numero_de_nascidos
  44. fet_gravidez_clinica
  45. trat2_resultado_tratamento
  46. trat1_resultado_tratamento
  47. fet_tipo_resultado
  48. api_response_code
  49. api_error_message

NULL v

## Example Queries

### 1. Load All Data (preview of the first 10 rows)


In [26]:
# Run the default (unfiltered) query. Note: the default query has no LIMIT, so
# df_all holds the whole table; only the first 10 rows are previewed here.
df_all = query_data_ploidia()
display(df_all.head(10))



QUERYING DATA_PLOIDIA TABLE
Query executed at: 2026-02-06 09:05:11.493617

=== GOLD.DATA_PLOIDIA (FIRST 100 ROWS) ===
Showing first 100 records

Database connection closed.


Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description,Embryo Description Clinisys,Embryo Description Clinisys Detalhes,outcome_type,merged_numero_de_nascidos,fet_gravidez_clinica,trat2_resultado_tratamento,trat1_resultado_tratamento,fet_tipo_resultado,api_response_code,api_error_message
0,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-1,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-1,1,AA1,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
1,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-2,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-2,2,AA2,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
2,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-4,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-4,4,AA4,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
3,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-5,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-5,5,AA5,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
4,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-7,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-7,7,AA7,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
5,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-8,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-8,8,AA8,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
6,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-9,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-9,9,AA9,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
7,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-10,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-10,10,AA10,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
8,Vila Mariana,906658_D2025.12.14_S03968_I3253_P-11,39.73,22.39,1986,Inexplicado,,906658,0,0,Descongelado OR,Heterólogo,11,D2025.12.14_S03968_I3253_P-11,11,AA11,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK
9,Vila Mariana,905932_D2025.12.13_S03967_I3253_P-2,41.73,24.97,1984,,,905932,0,0,Fresco,Homólogo,3,D2025.12.13_S03967_I3253_P-2,2,AA2,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,200,OK


In [27]:
# Show every row that belongs to a single patient
patient_id = 889071
display(df_all.loc[df_all['Patient ID'].eq(patient_id)])


Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description,Embryo Description Clinisys,Embryo Description Clinisys Detalhes,outcome_type,merged_numero_de_nascidos,fet_gravidez_clinica,trat2_resultado_tratamento,trat1_resultado_tratamento,fet_tipo_resultado,api_response_code,api_error_message
2079,Brasilia,889071_D2025.07.14_S02420_I4120_P-1,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-1,1,AA1,,,,,,,,,,8.4,,,,,,,,,2,,,,,,,,,1,,No transfer,POSITIVO,200,OK
2080,Brasilia,889071_D2025.07.14_S02420_I4120_P-2,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-2,2,AA2,,,,,,,,,,8.9,,,,,,,,,2,,,,,Não analisado,,,,1,,No transfer,POSITIVO,200,OK
2081,Brasilia,889071_D2025.07.14_S02420_I4120_P-3,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-3,3,AA3,,,,,,,,,,6.6,,,,,,,,,2,,,,,,,,,1,,No transfer,POSITIVO,200,OK
2082,Brasilia,889071_D2025.07.14_S02420_I4120_P-4,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-4,4,AA4,,,,,,,,,,14.2,24.2,,,,,,,,2,,,,,,,,,1,,No transfer,POSITIVO,200,OK
2083,Brasilia,889071_D2025.07.14_S02420_I4120_P-5,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-5,5,AA5,,,,,,,,,,,,,,,,,,,0,,,,,,,,,1,,No transfer,POSITIVO,200,OK
2084,Brasilia,889071_D2025.07.14_S02420_I4120_P-6,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-6,6,AA6,,,,,,,,,,8.1,,,,,,,,,2,,,,,Não analisado,,,,1,,No transfer,POSITIVO,200,OK
2085,Brasilia,889071_D2025.07.14_S02420_I4120_P-7,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-7,7,AA7,,,,,,,,,,8.1,,,,,,,,,2,,,,,Não analisado,,,,1,,No transfer,POSITIVO,200,OK
2086,Brasilia,889071_D2025.07.14_S02420_I4120_P-8,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-8,8,AA8,,,,,,,,,,9.1,24.9,,,,,,,,2,,,,,Euploide,"XX, Sem alterações",,,1,,No transfer,POSITIVO,200,OK
2087,Brasilia,889071_D2025.07.14_S02420_I4120_P-9,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-9,9,AA9,,,,,,,,,,7.8,,,,,,,,,2,,,,,,,,,1,,No transfer,POSITIVO,200,OK
2088,Brasilia,889071_D2025.07.14_S02420_I4120_P-10,32.22,23.74,1993,Outros,,889071,0,0,Fresco,Homólogo,20,D2025.07.14_S02420_I4120_P-10,10,AA10,,,,,,,,,,,,,,,,,,,0,,,,,,,,,1,,No transfer,POSITIVO,200,OK


### Belo Horizonte (BH) Analysis

In [None]:
# Rows per unit; dropna=False keeps missing "Unidade" values as their own bucket
df_all['Unidade'].value_counts(dropna=False)

Unidade
Ibirapuera        29820
Vila Mariana      23649
Belo Horizonte    18386
Brasilia          11783
Name: count, dtype: int64

: 

In [29]:
# Restrict the analysis to the Belo Horizonte unit
df_bh = df_all.loc[df_all['Unidade'].eq('Belo Horizonte')]

df_bh.shape

(18386, 49)

In [30]:
# Distribution of api_response_code within the Belo Horizonte rows (missing values counted too)
df_bh['api_response_code'].value_counts(dropna=False)

api_response_code
204    12095
200     6290
0          1
Name: count, dtype: int64

In [31]:
# Belo Horizonte rows with no Clinisys embryo description
df_bh.loc[df_bh['Embryo Description Clinisys'].isna()].shape

(16374, 49)

In [32]:
# Belo Horizonte rows that do have a Clinisys embryo description
df_bh.loc[df_bh['Embryo Description Clinisys'].notna()].shape

(2012, 49)

In [33]:
# Keep only the Belo Horizonte rows whose api_response_code is 200
df_bh_200 = df_bh.loc[df_bh['api_response_code'].eq(200)]

df_bh_200.shape

(6290, 49)

In [34]:
# Code-200 Belo Horizonte rows that carry an "Embryo Description" value
df_bh_200.loc[df_bh_200['Embryo Description'].notna()].shape

(0, 49)

In [35]:
# Preview the first 10 code-200 Belo Horizonte rows that have a Clinisys description
df_bh_200.loc[df_bh_200['Embryo Description Clinisys'].notna()].head(10)

Unnamed: 0,Unidade,Video ID,Age,BMI,Birth Year,Diagnosis,Patient Comments,Patient ID,Previus ET,Previus OD ET,Oocyte History,Oocyte Source,Oocytes Aspirated,Slide ID,Well,Embryo ID,t2,t3,t4,t5,t8,tB,tEB,tHB,tM,tPNa,tPNf,tSB,tSC,Frag-2 Cat. - Value,Fragmentation - Value,ICM - Value,MN-2 Type - Value,MN-2 Cells - Value,PN - Value,Pulsing - Value,Re-exp Count - Value,TE - Value,Embryo Description,Embryo Description Clinisys,Embryo Description Clinisys Detalhes,outcome_type,merged_numero_de_nascidos,fet_gravidez_clinica,trat2_resultado_tratamento,trat1_resultado_tratamento,fet_tipo_resultado,api_response_code,api_error_message
123,Belo Horizonte,904096_D2025.11.28_S03261_I3254_P-1,36.84,22.86,1989,,,904096,0,0,Fresco,Homólogo,3,D2025.11.28_S03261_I3254_P-1,1,AA1,23.1,33.9,34.5,47.5,54.8,99.7,108.1,,88.6,8.1,21.2,90.2,78.6,,28.8,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
178,Belo Horizonte,902693_D2025.12.13_S03285_I3254_P-1,40.56,23.44,1985,Insuficiência ovariana,,902693,0,0,Fresco,Homólogo,3,D2025.12.13_S03285_I3254_P-1,1,AA1,26.0,35.5,36.5,46.5,64.8,101.7,109.5,,86.4,5.6,23.8,91.2,65.9,,31.1,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
179,Belo Horizonte,902693_D2025.12.13_S03285_I3254_P-2,40.56,23.44,1985,Insuficiência ovariana,,902693,0,0,Fresco,Homólogo,3,D2025.12.13_S03285_I3254_P-2,2,AA2,24.6,35.7,36.7,49.5,66.1,111.2,115.9,,94.2,6.0,22.0,99.3,73.6,,29.8,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
282,Belo Horizonte,901091_D2025.12.03_S03265_I3254_P-4,43.57,21.97,1982,Outros,,901091,0,0,Fresco,Homólogo,10,D2025.12.03_S03265_I3254_P-4,4,AA4,25.3,36.3,37.4,47.6,58.5,102.6,106.8,,87.1,5.0,23.8,95.4,78.4,,31.7,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
283,Belo Horizonte,901091_D2025.12.03_S03265_I3254_P-5,43.57,21.97,1982,Outros,,901091,0,0,Fresco,Homólogo,10,D2025.12.03_S03265_I3254_P-5,5,AA5,29.4,40.0,42.9,52.2,57.6,102.9,108.4,,92.0,6.2,27.2,96.4,75.9,,34.6,A,,,2,,,A,,Aneuploide,,,,,,No transfer,,200,OK
284,Belo Horizonte,901091_D2025.12.03_S03265_I3254_P-6,43.57,21.97,1982,Outros,,901091,0,0,Fresco,Homólogo,10,D2025.12.03_S03265_I3254_P-6,6,AA6,24.6,34.8,35.6,47.2,52.8,107.9,117.1,,87.1,6.2,22.6,93.5,66.5,,29.3,A,,,2,,,A,,Aneuploide,,,,,,No transfer,,200,OK
311,Belo Horizonte,900599_D2025.11.25_S03252_I3254_P-4,38.08,24.98,1987,Insuficiência ovariana,,900599,0,0,Fresco,Homólogo,5,D2025.11.25_S03252_I3254_P-4,4,AA4,25.4,36.1,36.5,48.2,52.5,108.2,121.4,,85.6,6.1,23.7,89.1,74.5,,32.9,B,,,2,,,C,,Aneuploide,,,,,,No transfer,,200,OK
503,Belo Horizonte,898466_D2025.11.17_S03236_I3254_P-1,35.88,20.8,1990,Outros,,898466,0,0,Fresco,Homólogo,6,D2025.11.17_S03236_I3254_P-1,1,AC1,23.3,35.3,36.0,48.0,52.6,111.7,112.6,,98.9,4.6,21.3,106.1,71.4,,28.9,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
504,Belo Horizonte,898466_D2025.11.17_S03236_I3254_P-2,35.88,20.8,1990,Outros,,898466,0,0,Fresco,Homólogo,6,D2025.11.17_S03236_I3254_P-2,2,AC2,30.5,42.2,43.1,53.3,62.0,124.9,130.0,,112.3,7.7,28.6,118.3,94.2,,36.9,A,,,2,,,A,,Euploide,,,,,,No transfer,,200,OK
506,Belo Horizonte,898466_D2025.11.17_S03236_I3254_P-4,35.88,20.8,1990,Outros,,898466,0,0,Fresco,Homólogo,6,D2025.11.17_S03236_I3254_P-4,4,AC4,34.4,48.0,48.6,66.1,80.1,120.7,131.2,,104.8,7.9,33.2,107.2,88.7,,41.2,A,,,2,,,B,,Aneuploide,,,,,,No transfer,,200,OK
