# Embryo Image Availability Report - Data Explorer

This notebook queries and explores the image availability data created by the report scripts.

## Tables:
- **gold.embryo_image_availability_raw** - Raw table with all original columns + API check status

In [139]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Set pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

## Database Connection

In [140]:
# Connect to huntington_data_lake database
db_path = os.path.join('..', '..', '..', 'database', 'huntington_data_lake.duckdb')
conn = duckdb.connect(db_path, read_only=True)

print(f"Connected to: {db_path}")

Connected to: ..\..\..\database\huntington_data_lake.duckdb


In [141]:
# Get row count
row_count = conn.execute("""
    SELECT COUNT(*) as total_rows
    FROM gold.embryo_image_availability_raw
""").df()

print(f"Total rows: {row_count['total_rows'][0]:,}")

Total rows: 115,361


## Summary Statistics

In [142]:
# Overall summary
overall_summary = conn.execute("""
    SELECT *
    FROM gold.embryo_image_availability_raw
""").df()


print(f"Total embryos: {len(overall_summary)}")
print(f"Patient count: {overall_summary['prontuario'].nunique()}")
overall_summary

Total embryos: 115361
Patient count: 10681


Unnamed: 0,prontuario,patient_PatientID,patient_PatientIDx,patient_unit_huntington,treatment_TreatmentName,embryo_EmbryoID,embryo_EmbryoDate,image_available,image_runs_count,api_response_status,api_response_code,error_message,checked_at
0,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,D2026.01.15_S02549_I4120_P-15,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,790"
1,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,D2026.01.15_S02549_I4120_P-4,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:28,223"
2,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,D2026.01.15_S02549_I4120_P-16,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,901"
3,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,D2026.01.15_S02549_I4120_P-3,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:28,090"
4,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,D2026.01.15_S02549_I4120_P-2,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,995"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115356,153709,55562,NEXTGEN_43052.4993192940,Ibirapuera,2022-742,D2022.06.21_S02370_I3166_P-6,2022-06-21,True,930.0,success,200,OK,"2026-01-27 17:27:29,963"
115357,153709,55562,NEXTGEN_43052.4993192940,Ibirapuera,2022-742,D2022.06.21_S02370_I3166_P-5,2022-06-21,True,930.0,success,200,OK,"2026-01-27 17:27:29,882"
115358,153709,55562,NEXTGEN_43052.4993192940,Ibirapuera,2022-742,D2022.06.21_S02370_I3166_P-4,2022-06-21,True,930.0,success,200,OK,"2026-01-27 17:27:29,786"
115359,153709,55562,NEXTGEN_43052.4993192940,Ibirapuera,2022-742,D2022.06.21_S02370_I3166_P-3,2022-06-21,True,930.0,success,200,OK,"2026-01-27 17:27:29,657"


In [143]:
overall_summary.api_response_code.value_counts(dropna=False)

api_response_code
200    89460
204    22599
500     3292
0         10
Name: count, dtype: int64

In [144]:
# Summary by server
server_summary = conn.execute("""
    SELECT 
        patient_unit_huntington as server,
        COUNT(*) as total_embryos,
        COUNT(DISTINCT prontuario) as unique_patients,
        SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
        SUM(CASE WHEN NOT image_available THEN 1 ELSE 0 END) as without_images,
        ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images,
        ROUND(AVG(image_runs_count), 2) as avg_image_runs
    FROM gold.embryo_image_availability_raw
    GROUP BY patient_unit_huntington
    ORDER BY patient_unit_huntington
""").df()

print("\n" + "=" * 80)
print("SUMMARY BY SERVER")
print("=" * 80)
server_summary


SUMMARY BY SERVER


Unnamed: 0,server,total_embryos,unique_patients,with_images,without_images,pct_with_images,avg_image_runs
0,Belo Horizonte,18864,1842,6303.0,12561.0,33.41,271.81
1,Brasilia,14613,1533,8926.0,5687.0,61.08,497.46
2,Ibirapuera,52107,5094,44970.0,7137.0,86.3,690.2
3,Vila Mariana,29777,2706,29261.0,516.0,98.27,758.7


In [145]:
# API response status breakdown
status_summary = conn.execute("""
    SELECT 
        api_response_status,
        api_response_code, 
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM gold.embryo_image_availability_raw
    GROUP BY api_response_status, api_response_code
    ORDER BY count DESC
""").df()

print("\n" + "=" * 80)
print("API RESPONSE STATUS BREAKDOWN")
print("=" * 80)
status_summary


API RESPONSE STATUS BREAKDOWN


Unnamed: 0,api_response_status,api_response_code,count,percentage
0,success,200,89460,77.55
1,silent_response,204,22599,19.59
2,error,500,3292,2.85
3,not_checked,0,10,0.01


## Sample Data

In [146]:
# Sample of embryos WITH images
with_images_sample = conn.execute("""
    SELECT 
        prontuario,
        patient_unit_huntington,
        embryo_EmbryoID,
        embryo_EmbryoDate,
        image_runs_count,
        api_response_status,
        api_response_code, 
        checked_at
    FROM gold.embryo_image_availability_raw
    WHERE image_available = true
    ORDER BY image_runs_count DESC
    LIMIT 10
""").df()

print("\n" + "=" * 80)
print("SAMPLE: EMBRYOS WITH IMAGES (Top 10 by image runs count)")
print("=" * 80)
with_images_sample


SAMPLE: EMBRYOS WITH IMAGES (Top 10 by image runs count)


Unnamed: 0,prontuario,patient_unit_huntington,embryo_EmbryoID,embryo_EmbryoDate,image_runs_count,api_response_status,api_response_code,checked_at
0,748654,Brasilia,D2024.01.05_S01996_I4120_P-1,2024-01-05,1060.0,success,200,"2026-01-27 19:13:16,257"
1,748654,Brasilia,D2024.01.05_S01996_I4120_P-2,2024-01-05,1060.0,success,200,"2026-01-27 19:13:16,355"
2,743110,Brasilia,D2024.01.11_S01997_I4120_P-3,2024-01-11,1059.0,success,200,"2026-01-27 19:12:38,542"
3,743110,Brasilia,D2024.01.11_S01997_I4120_P-2,2024-01-11,1059.0,success,200,"2026-01-27 19:12:38,429"
4,743110,Brasilia,D2024.01.11_S01997_I4120_P-1,2024-01-11,1059.0,success,200,"2026-01-27 19:12:38,342"
5,743110,Brasilia,D2024.01.11_S01997_I4120_P-4,2024-01-11,1059.0,success,200,"2026-01-27 19:12:38,668"
6,800751,Brasilia,D2023.07.08_S01871_I4120_P-1,2023-07-08,1054.0,success,200,"2026-01-27 19:16:37,844"
7,800751,Brasilia,D2023.07.08_S01871_I4120_P-2,2023-07-08,1054.0,success,200,"2026-01-27 19:16:37,953"
8,800751,Brasilia,D2023.07.08_S01871_I4120_P-3,2023-07-08,1054.0,success,200,"2026-01-27 19:16:38,055"
9,800751,Brasilia,D2023.07.08_S01871_I4120_P-4,2023-07-08,1054.0,success,200,"2026-01-27 19:16:38,159"


In [147]:
# Sample of embryos WITHOUT images
without_images_sample = conn.execute("""
    SELECT 
        prontuario,
        patient_unit_huntington,
        embryo_EmbryoID,
        embryo_EmbryoDate,
        api_response_status,
        api_response_code
        error_message,
        checked_at
    FROM gold.embryo_image_availability_raw
    WHERE image_available = false
    LIMIT 10
""").df()

print("\n" + "=" * 80)
print("SAMPLE: EMBRYOS WITHOUT IMAGES (First 10)")
print("=" * 80)
without_images_sample


SAMPLE: EMBRYOS WITHOUT IMAGES (First 10)


Unnamed: 0,prontuario,patient_unit_huntington,embryo_EmbryoID,embryo_EmbryoDate,api_response_status,error_message,checked_at
0,901117,Brasilia,D2025.12.06_S02539_I4120_P-6,2025-12-06,silent_response,204,"2026-01-27 19:22:13,048"
1,833149,Brasilia,D2025.11.27_S02527_I4120_P-8,2025-11-27,silent_response,204,"2026-01-27 19:19:02,851"
2,833149,Brasilia,D2025.11.27_S02527_I4120_P-2,2025-11-27,silent_response,204,"2026-01-27 19:19:02,161"
3,899827,Brasilia,D2025.11.21_S02524_I4120_P-5,2025-11-21,silent_response,204,"2026-01-27 19:22:10,773"
4,891183,Brasilia,D2025.11.19_S02523_I4120_P-1,2025-11-19,silent_response,204,"2026-01-27 19:22:00,679"
5,880346,Brasilia,D2025.10.27_S02504_I4120_P-10,2025-10-27,silent_response,204,"2026-01-27 19:21:37,652"
6,894977,Brasilia,D2025.10.24_S02502_I4120_P-3,2025-10-24,silent_response,204,"2026-01-27 19:22:05,957"
7,879286,Brasilia,D2025.08.22_S02451_I4120_P-15,2025-08-22,silent_response,204,"2026-01-27 19:21:31,574"
8,886269,Brasilia,D2025.07.30_S02436_I4120_P-1,2025-07-30,silent_response,204,"2026-01-27 19:21:49,499"
9,862064,Brasilia,D2025.02.06_S02312_I4120_P-7,2025-02-06,silent_response,204,"2026-01-27 19:20:31,362"


## Query by Prontuario

In [148]:
# Query specific prontuario
prontuario = 895384  # Change this to query a different patient

patient_data = conn.execute(f"""
    SELECT *
    FROM gold.embryo_image_availability_raw
    WHERE prontuario = {prontuario}
    ORDER BY embryo_EmbryoDate DESC
""").df()

print(f"\n{'=' * 80}")
print(f"DATA FOR PRONTUARIO: {prontuario}")
print(f"{'=' * 80}")
print(f"Total embryos: {len(patient_data)}")
print(f"With images: {patient_data['image_available'].sum()}")
print(f"Without images: {(~patient_data['image_available']).sum()}")
print("\n")
patient_data


DATA FOR PRONTUARIO: 895384
Total embryos: 12
With images: 0
Without images: 12




Unnamed: 0,prontuario,patient_PatientID,patient_PatientIDx,patient_unit_huntington,treatment_TreatmentName,embryo_EmbryoID,embryo_EmbryoDate,image_available,image_runs_count,api_response_status,api_response_code,error_message,checked_at
0,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-11,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:00,398"
1,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-10,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:06:59,134"
2,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-5,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:06,670"
3,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-9,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:11,797"
4,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-8,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:10,555"
5,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-7,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:09,230"
6,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-3,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:04,123"
7,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-4,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:05,394"
8,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-2,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:07:02,907"
9,895384,895384,PC1P7BHG_45996.4013179398,Ibirapuera,2025-2760,D2025.12.05_S04662_I3166_P-1,2025-12-05,False,0.0,silent_response,204,No images found (Empty response),"2026-01-28 05:06:57,932"


## Distribution Analysis

In [149]:
# Image runs distribution
runs_distribution = conn.execute("""
    SELECT 
        image_runs_count,
        COUNT(*) as embryo_count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM gold.embryo_image_availability_raw
    WHERE image_available = true
    GROUP BY image_runs_count
    ORDER BY image_runs_count
""").df()

print("\n" + "=" * 80)
print("IMAGE RUNS DISTRIBUTION (for embryos with images)")
print("=" * 80)
runs_distribution.head(20)


IMAGE RUNS DISTRIBUTION (for embryos with images)


Unnamed: 0,image_runs_count,embryo_count,percentage
0,1.0,135,0.15
1,2.0,113,0.13
2,3.0,27,0.03
3,4.0,28,0.03
4,5.0,47,0.05
5,6.0,38,0.04
6,7.0,16,0.02
7,8.0,15,0.02
8,9.0,11,0.01
9,10.0,39,0.04


In [150]:
# Temporal distribution
temporal_dist = conn.execute("""
    SELECT 
        DATE_TRUNC('month', embryo_EmbryoDate) as month,
        COUNT(*) as total_embryos,
        SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
        ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images
    FROM gold.embryo_image_availability_raw
    GROUP BY DATE_TRUNC('month', embryo_EmbryoDate)
    ORDER BY month DESC
    LIMIT 12
""").df()

print("\n" + "=" * 80)
print("TEMPORAL DISTRIBUTION (Last 12 months)")
print("=" * 80)
temporal_dist


TEMPORAL DISTRIBUTION (Last 12 months)


Unnamed: 0,month,total_embryos,with_images,pct_with_images
0,2026-01-01,151,149.0,98.68
1,2025-12-01,842,469.0,55.7
2,2025-11-01,1522,815.0,53.55
3,2025-10-01,1303,744.0,57.1
4,2025-09-01,1394,791.0,56.74
5,2025-08-01,1212,673.0,55.53
6,2025-07-01,1326,702.0,52.94
7,2025-06-01,1690,998.0,59.05
8,2025-05-01,1741,1001.0,57.5
9,2025-04-01,1971,1227.0,62.25


## Error Analysis

In [151]:
# Errors by server
errors_by_server = conn.execute("""
    SELECT 
        patient_unit_huntington as server,
        api_response_code,
        COUNT(*) as count
    FROM gold.embryo_image_availability_raw
    WHERE api_response_status != 'success'
    GROUP BY patient_unit_huntington, api_response_code
    ORDER BY patient_unit_huntington, count DESC
""").df()

print("\n" + "=" * 80)
print("ERRORS BY SERVER")
print("=" * 80)
if len(errors_by_server) > 0:
    display(errors_by_server)
else:
    print("No errors found!")


ERRORS BY SERVER


Unnamed: 0,server,api_response_code,count
0,Belo Horizonte,204,12560
1,Belo Horizonte,0,1
2,Brasilia,204,5684
3,Brasilia,0,3
4,Ibirapuera,204,3842
5,Ibirapuera,500,3292
6,Ibirapuera,0,3
7,Vila Mariana,204,513
8,Vila Mariana,0,3


In [152]:
# Sample error messages
error_messages = conn.execute("""
    SELECT 
        patient_unit_huntington,
        api_response_status,
        error_message,
        COUNT(*) as count
    FROM gold.embryo_image_availability_raw
    WHERE error_message IS NOT NULL
    GROUP BY patient_unit_huntington, api_response_status, error_message
    ORDER BY count DESC
    LIMIT 10
""").df()

print("\n" + "=" * 80)
print("ERROR MESSAGES (Top 10)")
print("=" * 80)
if len(error_messages) > 0:
    display(error_messages)
else:
    print("No error messages found!")


ERROR MESSAGES (Top 10)


Unnamed: 0,patient_unit_huntington,api_response_status,error_message,count
0,Ibirapuera,success,OK,44970
1,Vila Mariana,success,OK,29261
2,Belo Horizonte,silent_response,No images found (Empty response),12560
3,Brasilia,success,OK,8926
4,Belo Horizonte,success,OK,6303
5,Brasilia,silent_response,No images found (Empty response),5684
6,Ibirapuera,silent_response,No images found (Empty response),3842
7,Ibirapuera,error,Unexpected error during data access,3292
8,Vila Mariana,silent_response,No images found (Empty response),513
9,Ibirapuera,not_checked,Not Checked,3


## Close Connection

In [153]:
conn.close()
print("Database connection closed.")

Database connection closed.
