# Embryo Image Availability Report - Data Explorer

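This notebook queries and explores the image availability data produced by the report scripts. Run `01_check_image_availability.py` first: it creates the table listed below, and every query cell after the table check depends on it.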

## Tables:
- **gold.embryo_image_availability_raw** - Raw table with all original columns + API check status

In [6]:
import duckdb
import pandas as pd
import os
from datetime import datetime

# Pandas display configuration for this notebook: show every column, cap
# printed rows at 100, let pandas pick the display width, and truncate cell
# text at 50 characters.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
    'display.max_colwidth': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Database Connection

In [7]:
# Open the huntington_data_lake DuckDB file without write access.
# The path is relative, so it resolves against the kernel's current working
# directory (three levels up, then into `database/`).
path_parts = ('..', '..', '..', 'database', 'huntington_data_lake.duckdb')
db_path = os.path.join(*path_parts)
conn = duckdb.connect(db_path, read_only=True)

print(f"Connected to: {db_path}")

Connected to: ..\..\..\database\huntington_data_lake.duckdb


## Table Overview

In [8]:
# Ask the catalog whether the report table is present before querying it.
table_lookup_sql = """
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'gold' 
    AND table_name = 'embryo_image_availability_raw'
"""
table_check = conn.execute(table_lookup_sql).df()

# No matching catalog row means the table has not been created yet.
if table_check.empty:
    print("✗ Table gold.embryo_image_availability_raw does not exist yet")
    print("  Run the script 01_check_image_availability.py first")
else:
    print("✓ Table gold.embryo_image_availability_raw exists")

✗ Table gold.embryo_image_availability_raw does not exist yet
  Run the script 01_check_image_availability.py first


In [9]:
# List the table's columns and their types, in declaration order.
schema_sql = """
    SELECT column_name, data_type 
    FROM information_schema.columns 
    WHERE table_schema = 'gold' 
    AND table_name = 'embryo_image_availability_raw'
    ORDER BY ordinal_position
"""
schema = conn.execute(schema_sql).df()

# An empty frame here means the catalog has no columns for that table name.
print("\nTable Schema:")
schema


Table Schema:


Unnamed: 0,column_name,data_type


In [None]:
# Count every row in the report table.
row_count_sql = """
    SELECT COUNT(*) as total_rows
    FROM gold.embryo_image_availability_raw
"""
row_count = conn.execute(row_count_sql).df()

# The query returns exactly one row; pull the scalar out for formatting.
total_rows = row_count['total_rows'].iloc[0]
print(f"Total rows: {total_rows:,}")

CatalogException: Catalog Error: Table with name embryo_image_availability_raw does not exist!
Did you mean "embryo_images_metadata"?

LINE 3:     FROM gold.embryo_image_availability_raw
                 ^

: 

## Summary Statistics

In [None]:
# One-row roll-up of the whole table: embryo/patient/server counts, how many
# embryos have images (count and percentage), and the first/last check time.
overall_summary_sql = """
    SELECT 
        COUNT(*) as total_embryos,
        COUNT(DISTINCT prontuario) as unique_patients,
        COUNT(DISTINCT patient_unit_huntington) as unique_servers,
        SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as embryos_with_images,
        SUM(CASE WHEN NOT image_available THEN 1 ELSE 0 END) as embryos_without_images,
        ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images,
        MIN(checked_at) as first_check,
        MAX(checked_at) as last_check
    FROM gold.embryo_image_availability_raw
"""
overall_summary = conn.execute(overall_summary_sql).df()

rule = "=" * 80
print(rule, "OVERALL SUMMARY", rule, sep="\n")
overall_summary

In [None]:
# Same availability metrics as the overall summary, broken down per
# patient_unit_huntington value (reported as "server"), plus the mean
# image_runs_count for each group.
server_summary_sql = """
    SELECT 
        patient_unit_huntington as server,
        COUNT(*) as total_embryos,
        COUNT(DISTINCT prontuario) as unique_patients,
        SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
        SUM(CASE WHEN NOT image_available THEN 1 ELSE 0 END) as without_images,
        ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images,
        ROUND(AVG(image_runs_count), 2) as avg_image_runs
    FROM gold.embryo_image_availability_raw
    GROUP BY patient_unit_huntington
    ORDER BY patient_unit_huntington
"""
server_summary = conn.execute(server_summary_sql).df()

rule = "=" * 80
print(f"\n{rule}\nSUMMARY BY SERVER\n{rule}")
server_summary

In [None]:
# Row count per api_response_status, with each status's share of all rows
# (the window SUM over the grouped counts is the grand total).
status_summary_sql = """
    SELECT 
        api_response_status,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM gold.embryo_image_availability_raw
    GROUP BY api_response_status
    ORDER BY count DESC
"""
status_summary = conn.execute(status_summary_sql).df()

rule = "=" * 80
print("", rule, "API RESPONSE STATUS BREAKDOWN", rule, sep="\n")
status_summary

## Sample Data

In [None]:
# Ten rows flagged image_available = true, taking the highest
# image_runs_count values first.
with_images_sql = """
    SELECT 
        prontuario,
        patient_unit_huntington,
        embryo_EmbryoID,
        embryo_EmbryoDate,
        image_runs_count,
        api_response_status,
        checked_at
    FROM gold.embryo_image_availability_raw
    WHERE image_available = true
    ORDER BY image_runs_count DESC
    LIMIT 10
"""
with_images_sample = conn.execute(with_images_sql).df()

rule = "=" * 80
print(f"\n{rule}")
print("SAMPLE: EMBRYOS WITH IMAGES (Top 10 by image runs count)")
print(rule)
with_images_sample

In [None]:
# Ten rows flagged image_available = false, including the stored
# error_message. There is no ORDER BY, so which ten rows come back is up to
# the database.
without_images_sql = """
    SELECT 
        prontuario,
        patient_unit_huntington,
        embryo_EmbryoID,
        embryo_EmbryoDate,
        api_response_status,
        error_message,
        checked_at
    FROM gold.embryo_image_availability_raw
    WHERE image_available = false
    LIMIT 10
"""
without_images_sample = conn.execute(without_images_sql).df()

rule = "=" * 80
print("", rule, "SAMPLE: EMBRYOS WITHOUT IMAGES (First 10)", rule, sep="\n")
without_images_sample

## Query by Prontuario

In [None]:
# Look up every row for a single patient record number (prontuario),
# most recent embryo_EmbryoDate first.
prontuario = 739327  # Change this to query a different patient

# Bind the value as a prepared-statement parameter instead of formatting it
# into the SQL text: DuckDB handles quoting/typing, and a value that is not a
# plain number cannot change the shape of the statement.
patient_data = conn.execute("""
    SELECT *
    FROM gold.embryo_image_availability_raw
    WHERE prontuario = ?
    ORDER BY embryo_EmbryoDate DESC
""", [prontuario]).df()

# Count with explicit comparisons rather than `.sum()` / `~`.
# NOTE(review): if image_available contains NULLs the column may come back
# with object dtype; there `~` does integer bitwise negation on Python bools
# (~True == -2) and `.sum()` can fail on None. `.eq(True)` / `.eq(False)`
# always produce a proper boolean mask, and rows with a NULL flag are simply
# counted in neither bucket.
image_flags = patient_data['image_available']
with_images = int(image_flags.eq(True).sum())
without_images = int(image_flags.eq(False).sum())

print(f"\n{'=' * 80}")
print(f"DATA FOR PRONTUARIO: {prontuario}")
print(f"{'=' * 80}")
print(f"Total embryos: {len(patient_data)}")
print(f"With images: {with_images}")
print(f"Without images: {without_images}")
print("\n")
patient_data

## Distribution Analysis

In [None]:
# For rows with image_available = true: how many embryos fall on each
# image_runs_count value, and what share of those rows that represents.
runs_distribution_sql = """
    SELECT 
        image_runs_count,
        COUNT(*) as embryo_count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM gold.embryo_image_availability_raw
    WHERE image_available = true
    GROUP BY image_runs_count
    ORDER BY image_runs_count
"""
runs_distribution = conn.execute(runs_distribution_sql).df()

rule = "=" * 80
print(f"\n{rule}\nIMAGE RUNS DISTRIBUTION (for embryos with images)\n{rule}")
# Only the first 20 buckets are displayed; the full frame stays in
# `runs_distribution`.
runs_distribution.head(20)

In [None]:
# Availability per calendar month of embryo_EmbryoDate. The LIMIT keeps the
# 12 most recent month buckets that actually occur in the data, which is not
# necessarily the 12 calendar months before today.
temporal_sql = """
    SELECT 
        DATE_TRUNC('month', embryo_EmbryoDate) as month,
        COUNT(*) as total_embryos,
        SUM(CASE WHEN image_available THEN 1 ELSE 0 END) as with_images,
        ROUND(AVG(CASE WHEN image_available THEN 1.0 ELSE 0.0 END) * 100, 2) as pct_with_images
    FROM gold.embryo_image_availability_raw
    GROUP BY DATE_TRUNC('month', embryo_EmbryoDate)
    ORDER BY month DESC
    LIMIT 12
"""
temporal_dist = conn.execute(temporal_sql).df()

rule = "=" * 80
print("", rule, "TEMPORAL DISTRIBUTION (Last 12 months)", rule, sep="\n")
temporal_dist

## Error Analysis

In [None]:
# Count rows whose api_response_status is anything other than 'success',
# grouped by server and status. Rows with a NULL status do not satisfy the
# `!=` comparison and are therefore not included.
errors_by_server_sql = """
    SELECT 
        patient_unit_huntington as server,
        api_response_status,
        COUNT(*) as count
    FROM gold.embryo_image_availability_raw
    WHERE api_response_status != 'success'
    GROUP BY patient_unit_huntington, api_response_status
    ORDER BY patient_unit_huntington, count DESC
"""
errors_by_server = conn.execute(errors_by_server_sql).df()

rule = "=" * 80
print(f"\n{rule}\nERRORS BY SERVER\n{rule}")
if errors_by_server.empty:
    print("No errors found!")
else:
    display(errors_by_server)

In [None]:
# The ten most frequent (server, status, error_message) combinations among
# rows that carry an error message.
error_messages_sql = """
    SELECT 
        patient_unit_huntington,
        api_response_status,
        error_message,
        COUNT(*) as count
    FROM gold.embryo_image_availability_raw
    WHERE error_message IS NOT NULL
    GROUP BY patient_unit_huntington, api_response_status, error_message
    ORDER BY count DESC
    LIMIT 10
"""
error_messages = conn.execute(error_messages_sql).df()

rule = "=" * 80
print("", rule, "ERROR MESSAGES (Top 10)", rule, sep="\n")
if error_messages.empty:
    print("No error messages found!")
else:
    display(error_messages)

## Close Connection

In [None]:
# Close the DuckDB connection held in `conn`. Query cells re-run after this
# point need the connection cell to be executed again first.
conn.close()
print("Database connection closed.")