# Embryo Image Availability Report

This notebook provides an interactive analysis of embryo image availability across Huntington units, using data from the Medallion pipeline (Silver/Gold layers).

## 1. Setup and Data Loading

In [2]:
import duckdb
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from pathlib import Path
from datetime import datetime

# Set project root and database path
# When running from embryoscope/report/explore/
DB_PATH = '../../../database/huntington_data_lake.duckdb'

def load_data():
    conn = duckdb.connect(DB_PATH, read_only=True)
    
    # Load latest silver data (primary reference)
    df = conn.execute("""
        SELECT 
            *,
            CAST(embryo_EmbryoDate AS DATE) as date_partition,
            STRFTIME(embryo_EmbryoDate, '%Y-%m') as month_partition
        FROM silver.embryo_image_availability_latest
    """).df()
    
    # Load status changes from gold (for audit logic)
    changes_df = conn.execute("SELECT * FROM gold.embryo_image_status_changes").df()
    
    conn.close()
    return df, changes_df

df, changes_df = load_data()
print(f"Loaded {len(df):,} unique embryos.")
print(f"Loaded {len(changes_df):,} status changes.")

df.head()

Loaded 115,361 unique embryos.
Loaded 0 status changes.


Unnamed: 0,embryo_EmbryoID,prontuario,patient_PatientID,patient_PatientIDx,patient_unit_huntington,treatment_TreatmentName,embryo_EmbryoDate,image_available,image_runs_count,api_response_status,api_response_code,error_message,checked_at,last_updated,date_partition,month_partition
0,D2026.01.15_S02549_I4120_P-15,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,790",2026-01-30 15:56:09.523,2026-01-15,2026-01
1,D2026.01.15_S02549_I4120_P-4,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:28,223",2026-01-30 15:56:09.523,2026-01-15,2026-01
2,D2026.01.15_S02549_I4120_P-16,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,901",2026-01-30 15:56:09.523,2026-01-15,2026-01
3,D2026.01.15_S02549_I4120_P-3,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:28,090",2026-01-30 15:56:09.523,2026-01-15,2026-01
4,D2026.01.15_S02549_I4120_P-2,739327,739327,S4JP1389_46037.5604639005,Brasilia,29/26 PLACA 1,2026-01-15,True,852.0,success,200,OK,"2026-01-27 19:12:27,995",2026-01-30 15:56:09.523,2026-01-15,2026-01


## 2. High-Level KPIs

Summary of availability and server health.

In [3]:
# Calculate basic counts
total = len(df)
success_count = df[df['image_available'] == True].shape[0]
availability_rate = (success_count / total) * 100 if total > 0 else 0

# Nuanced Status Mapping (Interpretation Layer)
def interpret_status(row):
    code = row['api_response_code']
    has_images = row['image_available']
    
    if code == 200:
        return 'Success (Images Found)' if has_images else 'Success (No Images Found)'
    elif code == 204:
        return 'Success (Empty/No Context)'
    elif code == 500:
        return 'Error (Server 500)'
    elif code == 0:
        return 'Error (Initialization/Network)'
    else:
        return f'Other ({code})'

df['Status_Label'] = df.apply(interpret_status, axis=1)
status_dist = df.groupby('Status_Label').size().reset_index(name='count').sort_values('count', ascending=False)

# Build visualization
fig = make_subplots(
    rows=1, cols=2, 
    specs=[[{'type':'indicator'}, {'type':'xy'}]],
    column_widths=[0.4, 0.6]
)

fig.add_trace(go.Indicator(
    mode = "gauge+number",
    value = availability_rate,
    title = {'text': "Overall Image Availability (%)"},
    gauge = {'axis': {'range': [0, 100]}}, 
    domain = {'x': [0, 1], 'y': [0, 1]}
), row=1, col=1)

fig.add_trace(go.Bar(
    x=status_dist['Status_Label'], 
    y=status_dist['count'],
    text=status_dist['count'],
    textposition='auto', 
    name="Status Distribution",
    marker_color='royalblue'
), row=1, col=2)

fig.update_layout(title_text="Pipeline Health & Overall Availability", height=400, showlegend=False)
fig.show()

## 3. Unit & Temporal Analysis

Exploring performance across locations and time.

In [4]:
# Availability by Unit
unit_stats = df.groupby('patient_unit_huntington').agg(
    total=('embryo_EmbryoID', 'count'),
    available=('image_available', 'sum')
).reset_index()
unit_stats['Availability %'] = (unit_stats['available'] / unit_stats['total'] * 100).round(1)

fig_unit = px.bar(
    unit_stats.sort_values('Availability %', ascending=False), 
    x='patient_unit_huntington', y='Availability %', 
    color='Availability %', text='Availability %',
    title="Availability by Huntington Unit"
)
fig_unit.update_traces(textposition='outside')
fig_unit.show()

# Availability over Time (Monthly per Unit)
monthly_unit_stats = df.groupby(['month_partition', 'patient_unit_huntington']).agg(
    total=('embryo_EmbryoID', 'count'),
    available=('image_available', 'sum')
).reset_index()
monthly_unit_stats['Availability %'] = (monthly_unit_stats['available'] / monthly_unit_stats['total'] * 100).round(1)

fig_month = px.line(
    monthly_unit_stats, x='month_partition', y='Availability %', color='patient_unit_huntington',
    markers=True, title="Monthly Availability Trend by Unit"
)
fig_month.show()

## 4. Heatmap: Unit vs Month

Spotting gaps in historical data.

In [5]:
# Pivot table for heatmap
pivot_df = df.pivot_table(
    index='patient_unit_huntington', 
    columns='month_partition', 
    values='image_available', 
    aggfunc='mean'
) * 100

fig_heat = px.imshow(
    pivot_df, 
    labels=dict(x="Month", y="Unit", color="Availability %"),
    title="Availability Heatmap: Unit vs Month",
    color_continuous_scale="Viridis"
)
fig_heat.show()

## 5. Audit Deep Dive (Status Changes)

Tracking how many embryos were "recovered" by retries.

In [6]:
if not changes_df.empty:
    # Aggregate types of changes
    changes_agg = changes_df.groupby(['old_status', 'new_status']).size().reset_index(name='count')
    
    fig_audit = px.bar(
        changes_agg, 
        x='new_status', y='count', color='old_status',
        title="Status Transitions (Recovery Tracking)",
        labels={'count': 'Embryos', 'new_status': 'Current Status', 'old_status': 'Previous Status'}
    )
    fig_audit.show()
else:
    print("No status changes recorded in the Gold table yet.")

No status changes recorded in the Gold table yet.


## 6. Clinical Analytics: Patient & Treatment Level

Availability metrics focused on clinical groupings.

In [7]:
# Group by Patient and Treatment
clinical_df = df.groupby(['patient_PatientID', 'treatment_TreatmentName']).agg(
    total_embryos=('embryo_EmbryoID', 'count'),
    available_embryos=('image_available', 'sum'),
    # last_check=('checked_at', 'max')
).reset_index()

clinical_df['Availability %'] = (clinical_df['available_embryos'] / clinical_df['total_embryos'] * 100).round(1)

print(f"Displaying top 15 treatments with lowest availability:")
display(clinical_df[clinical_df['total_embryos'] > 0].sort_values('Availability %').head(15))

Displaying top 15 treatments with lowest availability:


Unnamed: 0,patient_PatientID,treatment_TreatmentName,total_embryos,available_embryos,Availability %
15403,580032536,2021-217,4,0,0.0
15402,50034712,2021-519,8,0,0.0
2,888,03/10/2020,5,0,0.0
3,888,12/11/2019*,4,0,0.0
4,888,14/02/2020,6,0,0.0
15401,50034711,2021-691,12,0,0.0
15400,50034694,2021-564,14,0,0.0
24,8356,08/11/2019,8,0,0.0
25,8356,28/02/2020,7,0,0.0
15380,50034576,2021-454,14,0,0.0


## 7. Attention Required

Embryos stuck in `Error` or `No Content` status (Top 10 per Unit).

In [8]:
# Filter embryos with issues (Anything not a success with images)
issues_df = df[~((df['api_response_code'] == 200) & (df['image_available'] == True))].copy()

print(f"Found {len(issues_df):,} embryos requiring attention (errors, empty, or not found).\n")

# Get top 10 treatments per unit with issues
priority_df = issues_df.groupby(['patient_unit_huntington', 'treatment_TreatmentName']).agg(
    embryos_missing=('embryo_EmbryoID', 'count'),
    error_sample=('error_message', 'first'),
    last_code=('api_response_code', 'last')
).reset_index().sort_values(['patient_unit_huntington', 'embryos_missing'], ascending=[True, False])

for unit in priority_df['patient_unit_huntington'].unique():
    print(f"--- {unit} (Top 10 Priority Treatments) ---")
    unit_top = priority_df[priority_df['patient_unit_huntington'] == unit].head(10)
    display(unit_top)
    print("\n")

Found 25,901 embryos requiring attention (errors, empty, or not found).

--- Belo Horizonte (Top 10 Priority Treatments) ---


Unnamed: 0,patient_unit_huntington,treatment_TreatmentName,embryos_missing,error_sample,last_code
861,Belo Horizonte,28/11/2020,74,No images found (Empty response),204
161,Belo Horizonte,06/12/2019,63,No images found (Empty response),204
352,Belo Horizonte,12/11/2021,62,No images found (Empty response),204
611,Belo Horizonte,21/01/2021,62,No images found (Empty response),204
600,Belo Horizonte,20/08/2022,59,No images found (Empty response),204
562,Belo Horizonte,19/06/2021,57,No images found (Empty response),204
155,Belo Horizonte,06/09/2023,53,No images found (Empty response),204
708,Belo Horizonte,23/10/2021,53,No images found (Empty response),204
228,Belo Horizonte,09/06/2021,51,No images found (Empty response),204
437,Belo Horizonte,15/07/2022,51,No images found (Empty response),204




--- Brasilia (Top 10 Priority Treatments) ---


Unnamed: 0,patient_unit_huntington,treatment_TreatmentName,embryos_missing,error_sample,last_code
1122,Brasilia,2020-681,26,No images found (Empty response),204
1346,Brasilia,2021-262,26,No images found (Empty response),204
1398,Brasilia,2021-396,24,No images found (Empty response),204
1290,Brasilia,2021-145,23,No images found (Empty response),204
1311,Brasilia,2021-180,23,No images found (Empty response),204
1414,Brasilia,2021-433,23,No images found (Empty response),204
1390,Brasilia,2021-37,22,No images found (Empty response),204
1496,Brasilia,2021-642,22,No images found (Empty response),204
1103,Brasilia,2020-645,21,No images found (Empty response),204
1460,Brasilia,2021-563,21,No images found (Empty response),204




--- Ibirapuera (Top 10 Priority Treatments) ---


Unnamed: 0,patient_unit_huntington,treatment_TreatmentName,embryos_missing,error_sample,last_code
2598,Ibirapuera,2025-2028,36,Unexpected error during data access,500
2783,Ibirapuera,2025-2674,29,Unexpected error during data access,500
2666,Ibirapuera,2025-2340,27,Unexpected error during data access,500
2788,Ibirapuera,2025-2687,27,Unexpected error during data access,500
3004,Ibirapuera,2025-750,27,No images found (Empty response),500
2295,Ibirapuera,2025 - 883,26,Unexpected error during data access,500
2553,Ibirapuera,2025-1902,26,No images found (Empty response),500
3014,Ibirapuera,2025-792,26,Unexpected error during data access,500
2213,Ibirapuera,2025 - 1355,25,No images found (Empty response),500
2264,Ibirapuera,2025 - 2699,25,Unexpected error during data access,500




--- Vila Mariana (Top 10 Priority Treatments) ---


Unnamed: 0,patient_unit_huntington,treatment_TreatmentName,embryos_missing,error_sample,last_code
3461,Vila Mariana,2025 - 62,14,No images found (Empty response),204
3190,Vila Mariana,2022 - 1335,10,No images found (Empty response),204
3344,Vila Mariana,2024 - 1606,10,No images found (Empty response),204
3486,Vila Mariana,2025-633,10,No images found (Empty response),204
3146,Vila Mariana,2021 - 1288,9,No images found (Empty response),204
3191,Vila Mariana,2022 - 1349,7,No images found (Empty response),204
3194,Vila Mariana,2022 - 1441,5,No images found (Empty response),204
3202,Vila Mariana,2022 - 1763,4,No images found (Empty response),204
3385,Vila Mariana,2024 - 524,3,No images found (Empty response),204
3393,Vila Mariana,2024 - 665,3,No images found (Empty response),204




