# Embryo Image Availability Explorer v2

This notebook provides a structured way to explore the embryo image availability data recovered from logs.

**Pattern**: All queries are executed at the beginning and the connection is closed, allowing for offline analysis of the fetched DataFrames.

In [40]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Notebook-wide pandas display settings: max_columns, width and max_colwidth
# are set to None, while max_rows is capped at 100.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
    ('display.max_colwidth', None),
):
    pd.set_option(_option_name, _option_value)

## 1. Define Query Functions

In [41]:
def get_connection():
    """Open the huntington_data_lake DuckDB file in read-only mode.

    The database file is addressed with a relative path (three directories
    up, then ``database/huntington_data_lake.duckdb``). The caller is
    responsible for closing the returned connection.
    """
    path_parts = ('..', '..', '..', 'database', 'huntington_data_lake.duckdb')
    return duckdb.connect(os.path.join(*path_parts), read_only=True)

def fetch_overall_summary(conn):
    """Count embryos per API response code / message pair.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        DataFrame with one row per (api_response_code, error_message) pair:
        the row count as ``total_embryos`` and that count's share of all rows
        as ``percentage`` (rounded to 2 decimals), largest group first.
    """
    # GROUP BY / ORDER BY use positional references to the SELECT list.
    summary_sql = """
        SELECT 
            api_response_code,
            error_message as status_description,
            COUNT(*) as total_embryos,
            ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
        FROM gold.embryo_image_availability_raw
        GROUP BY 1, 2
        ORDER BY 3 DESC
    """
    result = conn.execute(summary_sql)
    return result.df()

def fetch_server_summary(conn):
    """Aggregate image availability per ``patient_unit_huntington`` value.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        DataFrame with one row per unit (exposed as ``server``), ordered by
        unit name, holding: total rows, distinct ``prontuario`` count, rows
        with / without ``image_available``, the percentage with images and
        the mean ``image_runs_count`` (both rounded to 2 decimals).
    """
    per_server_sql = """
        SELECT 
            patient_unit_huntington as server,
            COUNT(*) as total_embryos,
            COUNT(DISTINCT prontuario) as unique_patients,
            SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
            SUM(CASE WHEN NOT image_available THEN 1 ELSE 0 END) as without_images,
            ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images,
            ROUND(AVG(image_runs_count), 2) as avg_image_runs
        FROM gold.embryo_image_availability_raw
        GROUP BY patient_unit_huntington
        ORDER BY patient_unit_huntington
    """
    result = conn.execute(per_server_sql)
    return result.df()

def fetch_temporal_distribution(conn):
    """Monthly image-availability counts, newest month first.

    Rows are bucketed by the month of ``embryo_EmbryoDate``. The ``LIMIT 12``
    in the query is commented out, so every month present in the table is
    returned; callers that want only recent months must truncate the result.

    Args:
        conn: Open database connection exposing ``execute(sql).df()``.

    Returns:
        DataFrame with ``month``, ``total_embryos``, ``with_images`` and
        ``pct_with_images`` (rounded to 2 decimals).
    """
    monthly_sql = """
        SELECT 
            DATE_TRUNC('month', embryo_EmbryoDate) as month,
            COUNT(*) as total_embryos,
            SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
            ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images
        FROM gold.embryo_image_availability_raw
        GROUP BY DATE_TRUNC('month', embryo_EmbryoDate)
        ORDER BY month DESC
        --LIMIT 12
    """
    result = conn.execute(monthly_sql)
    return result.df()

def fetch_patient_data(conn, prontuario=None):
    """Fetch embryo rows, optionally restricted to a single patient.

    Args:
        conn: Open database connection exposing ``execute(...).df()``.
        prontuario: Patient record number to filter on. When ``None`` (the
            default) every row of the table is returned, which is what a
            call with only ``conn`` has always done.

    Returns:
        DataFrame with all columns of ``gold.embryo_image_availability_raw``,
        ordered by ``checked_at`` descending.
    """
    query = "SELECT * FROM gold.embryo_image_availability_raw"
    if prontuario is None:
        # No filter requested: keep the parameter-less call.
        return conn.execute(query + " ORDER BY checked_at DESC").df()

    # Bind the value as a query parameter rather than formatting it into the
    # SQL text, so the filter value is never interpreted as SQL.
    query += " WHERE prontuario = ? ORDER BY checked_at DESC"
    return conn.execute(query, [prontuario]).df()

## 2. Execute Data Extraction
This cell fetches all required data and closes the database connection.

In [42]:
# Open the connection first; everything after this point runs inside
# try/finally so the handle is closed even if one of the queries fails.
conn = get_connection()
try:
    fetch_started = datetime.now()
    print(f"Fetching data at {fetch_started}...")

    # Aggregated frames used by the summary sections.
    df_overall = fetch_overall_summary(conn)
    df_servers = fetch_server_summary(conn)
    df_temporal = fetch_temporal_distribution(conn)

    # Row-level frame; the patient example section filters it in memory.
    df_patient = fetch_patient_data(conn)

    print("Done! All data cached in memory.")
finally:
    conn.close()
    print("Database connection closed.")

Fetching data at 2026-01-28 15:16:30.090475...
Done! All data cached in memory.
Database connection closed.


## 3. Patient Data Example
Showing the embryos of one example prontuario, filtered in memory from the full table fetched in the extraction cell.

In [43]:
# Example patient record number; the rows are selected from the frame that
# was loaded during extraction, so no database access happens here.
prontuario = 515863
df_demo_patient = df_patient[df_patient["prontuario"] == prontuario]

print("\n" + "=" * 80)
print(f"PATIENT DATA EXAMPLE (Found {len(df_demo_patient)} embryos)")
print("=" * 80)
display(df_demo_patient)


PATIENT DATA EXAMPLE (Found 8 embryos)


Unnamed: 0,prontuario,patient_PatientID,patient_PatientIDx,patient_unit_huntington,treatment_TreatmentName,embryo_EmbryoID,embryo_EmbryoDate,image_available,image_runs_count,api_response_status,api_response_code,error_message,checked_at
84029,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-8,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:56,389"
84031,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-7,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:56,272"
84032,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-6,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:56,154"
84033,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-5,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:56,044"
84034,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-4,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:55,891"
84035,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-3,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:55,775"
84036,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-2,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:55,672"
84037,515863,515863,PC10T4L7_44816.4821299306,Vila Mariana,2022 - 1516,D2022.09.12_S01815_I3253_P-1,2022-09-12,True,619.0,success,200,OK,"2026-01-27 17:40:55,581"


## 4. Overall Availability Summary

In [44]:
# Render the per-response-code breakdown fetched during extraction.
display(df_overall)

Unnamed: 0,api_response_code,status_description,total_embryos,percentage
0,200,OK,89460,77.55
1,204,No images found (Empty response),22599,19.59
2,500,Unexpected error during data access,3292,2.85
3,0,Not Checked,10,0.01


## 5. Summary by Server

In [45]:
# Banner (blank line, rule, title, rule) followed by the per-server table.
print("\n" + "=" * 80, "SUMMARY BY SERVER", "=" * 80, sep="\n")
display(df_servers)


SUMMARY BY SERVER


Unnamed: 0,server,total_embryos,unique_patients,with_images,without_images,pct_with_images,avg_image_runs
0,Belo Horizonte,18864,1842,6303.0,12561.0,33.41,271.81
1,Brasilia,14613,1533,8926.0,5687.0,61.08,497.46
2,Ibirapuera,52107,5094,44970.0,7137.0,86.3,690.2
3,Vila Mariana,29777,2706,29261.0,516.0,98.27,758.7


## 6. Temporal Distribution (Last 12 Months)

In [47]:
# Banner, then the first 12 rows of the monthly frame (it is sorted newest
# month first, so these are the most recent months present in the data).
print("\n" + "=" * 80, "TEMPORAL DISTRIBUTION", "=" * 80, sep="\n")
display(df_temporal.head(12))


TEMPORAL DISTRIBUTION


Unnamed: 0,month,total_embryos,with_images,pct_with_images
0,2026-01-01,151,149.0,98.68
1,2025-12-01,842,469.0,55.7
2,2025-11-01,1522,815.0,53.55
3,2025-10-01,1303,744.0,57.1
4,2025-09-01,1394,791.0,56.74
5,2025-08-01,1212,673.0,55.53
6,2025-07-01,1326,702.0,52.94
7,2025-06-01,1690,998.0,59.05
8,2025-05-01,1741,1001.0,57.5
9,2025-04-01,1971,1227.0,62.25
