In [2]:
import duckdb
import pandas as pd

# Connect to the data lake
db_path = '../database/huntington_data_lake.duckdb'

In [3]:
# Category 1 Analysis: PatientIDs that exist in Clinisys but no match found
conn = duckdb.connect(db_path)

# Get unmatched embryos from recent years (2022-2025)
unmatched_recent_query = """
SELECT DISTINCT
    embryo_EmbryoID,
    embryo_FertilizationTime,
    embryo_embryo_number,
    patient_PatientID,
    treatment_TreatmentName,
    YEAR(CAST(embryo_FertilizationTime AS DATE)) as year
FROM gold.embryoscope_embrioes
WHERE embryo_EmbryoID NOT IN (
    SELECT DISTINCT embryo_EmbryoID 
    FROM gold.embryoscope_clinisys_combined 
    WHERE embryo_EmbryoID IS NOT NULL
)
AND YEAR(CAST(embryo_FertilizationTime AS DATE)) >= 2022
AND embryo_FertilizationTime IS NOT NULL
ORDER BY embryo_FertilizationTime DESC
"""

unmatched_recent_df = con.execute(unmatched_recent_query).df()

# Get unique PatientIDs from Clinisys
clinisys_patientids = con.execute("""
    SELECT DISTINCT micro_prontuario
    FROM gold.clinisys_embrioes
    WHERE micro_prontuario IS NOT NULL
""").fetchdf()['micro_prontuario'].tolist()

# Filter for Category 1: PatientID exists in Clinisys but no match found
valid_patientid_embryos = unmatched_recent_df[unmatched_recent_df['patient_PatientID'].notna()]
valid_patientids = valid_patientid_embryos['patient_PatientID'].unique()
found_in_clinisys = [pid for pid in valid_patientids if pid in clinisys_patientids]

embryos_with_found_patientid = valid_patientid_embryos[
    valid_patientid_embryos['patient_PatientID'].isin(found_in_clinisys)
]

# Group by PatientID to see detailed breakdown
category1_summary = embryos_with_found_patientid.groupby('patient_PatientID').agg({
    'embryo_EmbryoID': 'count',
    'year': 'nunique',
    'embryo_embryo_number': ['min', 'max'],
    'embryo_FertilizationTime': ['min', 'max']
}).round(2)

category1_summary.columns = ['embryo_count', 'year_count', 'min_embryo_num', 'max_embryo_num', 'min_date', 'max_date']
category1_summary = category1_summary.sort_values('embryo_count', ascending=False)

print("=" * 80)
print("CATEGORY 1: PatientIDs that exist in Clinisys but no match found")
print("=" * 80)
print(f"Total embryos: {len(embryos_with_found_patientid):,}")
print(f"Unique PatientIDs: {len(found_in_clinisys):,}")
print(f"Reason: Date/embryo number mismatches or missing Clinisys records")
print()

# Display the complete list
print("Complete list of Category 1 PatientIDs:")
print("-" * 80)
print(f"{'PatientID':<12} {'Embryos':<10} {'Years':<8} {'Embryo Range':<15} {'Date Range':<25}")
print("-" * 80)

for patientid, row in category1_summary.iterrows():
    embryo_range = f"{row['min_embryo_num']:.0f}-{row['max_embryo_num']:.0f}"
    date_range = f"{row['min_date'].strftime('%Y-%m-%d')} to {row['max_date'].strftime('%Y-%m-%d')}"
    print(f"{patientid:<12} {row['embryo_count']:<10.0f} {row['year_count']:<8.0f} {embryo_range:<15} {date_range:<25}")

# Show summary statistics
print()
print("Summary Statistics:")
print(f"• Average embryos per PatientID: {category1_summary['embryo_count'].mean():.1f}")
print(f"• Median embryos per PatientID: {category1_summary['embryo_count'].median():.1f}")
print(f"• PatientIDs with >10 embryos: {len(category1_summary[category1_summary['embryo_count'] > 10])}")
print(f"• PatientIDs with >20 embryos: {len(category1_summary[category1_summary['embryo_count'] > 20])}")

# Store Category 1 PatientIDs for tracking
category1_patientids = category1_summary.index.tolist()
print(f"\nCategory 1 PatientIDs stored for tracking: {len(category1_patientids)}")

con.close()


CATEGORY 1: PatientIDs that exist in Clinisys but no match found
Total embryos: 1,575
Unique PatientIDs: 348
Reason: Date/embryo number mismatches or missing Clinisys records

Complete list of Category 1 PatientIDs:
--------------------------------------------------------------------------------
PatientID    Embryos    Years    Embryo Range    Date Range               
--------------------------------------------------------------------------------
753333       36         1        43-78           2022-10-28 to 2022-11-02 
788836       29         1        20-48           2024-05-21 to 2024-05-21 
747405       26         1        6-20            2024-02-28 to 2024-02-28 
830027       24         1        13-36           2024-05-01 to 2024-05-01 
139138       22         1        14-35           2022-04-10 to 2022-04-10 
813209       21         2        1-26            2024-01-31 to 2025-06-24 
824285       17         1        9-25            2024-02-28 to 2024-02-28 
222030       16       

In [7]:
# Category 2 Analysis: PatientIDs that do not exist in Clinisys
con = duckdb.connect(db_path)

# Get unmatched embryos from recent years (2022-2025)
unmatched_recent_query = """
SELECT DISTINCT
    embryo_EmbryoID,
    embryo_FertilizationTime,
    embryo_embryo_number,
    patient_PatientID,
    treatment_TreatmentName,
    YEAR(CAST(embryo_FertilizationTime AS DATE)) as year
FROM gold.embryoscope_embrioes
WHERE embryo_EmbryoID NOT IN (
    SELECT DISTINCT embryo_EmbryoID 
    FROM gold.embryoscope_clinisys_combined 
    WHERE embryo_EmbryoID IS NOT NULL
)
AND YEAR(CAST(embryo_FertilizationTime AS DATE)) >= 2022
AND embryo_FertilizationTime IS NOT NULL
ORDER BY embryo_FertilizationTime DESC
"""

unmatched_recent_df = con.execute(unmatched_recent_query).df()

# Get unique PatientIDs from Clinisys
clinisys_patientids = con.execute("""
    SELECT DISTINCT micro_prontuario
    FROM gold.clinisys_embrioes
    WHERE micro_prontuario IS NOT NULL
""").fetchdf()['micro_prontuario'].tolist()

# Filter for Category 2: PatientID does not exist in Clinisys
valid_patientid_embryos = unmatched_recent_df[unmatched_recent_df['patient_PatientID'].notna()]
valid_patientids = valid_patientid_embryos['patient_PatientID'].unique()
not_found_in_clinisys = [pid for pid in valid_patientids if pid not in clinisys_patientids]

embryos_with_notfound_patientid = valid_patientid_embryos[
    valid_patientid_embryos['patient_PatientID'].isin(not_found_in_clinisys)
]

# Group by PatientID to see detailed breakdown
category2_summary = embryos_with_notfound_patientid.groupby('patient_PatientID').agg({
    'embryo_EmbryoID': 'count',
    'year': 'nunique',
    'embryo_embryo_number': ['min', 'max'],
    'embryo_FertilizationTime': ['min', 'max']
}).round(2)

category2_summary.columns = ['embryo_count', 'year_count', 'min_embryo_num', 'max_embryo_num', 'min_date', 'max_date']
category2_summary = category2_summary.sort_values('embryo_count', ascending=False)

print("=" * 80)
print("CATEGORY 2: PatientIDs that do not exist in Clinisys")
print("=" * 80)
print(f"Total embryos: {len(embryos_with_notfound_patientid):,}")
print(f"Unique PatientIDs: {len(not_found_in_clinisys):,}")
print(f"Reason: Data coverage gap - PatientID not in Clinisys system")
print()

# Display the complete list
print("Complete list of Category 2 PatientIDs:")
print("-" * 80)
print(f"{'PatientID':<12} {'Embryos':<10} {'Years':<8} {'Embryo Range':<15} {'Date Range':<25}")
print("-" * 80)

for patientid, row in category2_summary.iterrows():
    embryo_range = f"{row['min_embryo_num']:.0f}-{row['max_embryo_num']:.0f}"
    date_range = f"{row['min_date'].strftime('%Y-%m-%d')} to {row['max_date'].strftime('%Y-%m-%d')}"
    print(f"{patientid:<12} {row['embryo_count']:<10.0f} {row['year_count']:<8.0f} {embryo_range:<15} {date_range:<25}")

# Data quality analysis
print()
print("Data Quality Analysis:")
suspicious_patientids = []
for patientid in not_found_in_clinisys:
    if patientid == 1 or patientid < 100 or patientid > 999999:
        suspicious_patientids.append(patientid)

if suspicious_patientids:
    print(f"• Suspicious PatientIDs found: {len(suspicious_patientids)}")
    print(f"• Examples: {suspicious_patientids[:10]}")  # Show first 10
    print(f"• These may be data quality issues or placeholder values")
else:
    print("• No suspicious PatientIDs detected")

# Show summary statistics
print()
print("Summary Statistics:")
print(f"• Average embryos per PatientID: {category2_summary['embryo_count'].mean():.1f}")
print(f"• Median embryos per PatientID: {category2_summary['embryo_count'].median():.1f}")
print(f"• PatientIDs with >10 embryos: {len(category2_summary[category2_summary['embryo_count'] > 10])}")
print(f"• PatientIDs with >20 embryos: {len(category2_summary[category2_summary['embryo_count'] > 20])}")

# Store Category 2 PatientIDs for tracking
category2_patientids = category2_summary.index.tolist()
print(f"\nCategory 2 PatientIDs stored for tracking: {len(category2_patientids)}")

con.close()


CATEGORY 2: PatientIDs that do not exist in Clinisys
Total embryos: 5,772
Unique PatientIDs: 655
Reason: Data coverage gap - PatientID not in Clinisys system

Complete list of Category 2 PatientIDs:
--------------------------------------------------------------------------------
PatientID    Embryos    Years    Embryo Range    Date Range               
--------------------------------------------------------------------------------
1            336        4        1-192           2022-05-02 to 2025-06-24 
64645        55         3        1-18            2022-02-04 to 2025-05-21 
50030860     42         1        1-11            2022-02-05 to 2022-08-26 
33520        40         2        1-27            2022-04-15 to 2023-02-01 
754409       37         2        1-10            2022-11-05 to 2023-07-29 
59360        37         2        1-16            2022-12-15 to 2023-10-17 
60579        35         2        1-10            2022-01-18 to 2024-04-13 
78770        34         1        1-21  

In [8]:
# Comprehensive PatientID Tracking Across All Tables

con = duckdb.connect(db_path)

def track_patientid_across_tables(patientid, con):
    """Track a specific PatientID across all relevant tables."""
    
    results = {}
    
    # 1. Check in Embryoscope tables
    try:
        # Gold layer
        embryoscope_gold = con.execute("""
            SELECT 
                'gold.embryoscope_embrioes' as table_name,
                COUNT(*) as record_count,
                COUNT(DISTINCT embryo_EmbryoID) as unique_embryos,
                MIN(embryo_FertilizationTime) as min_date,
                MAX(embryo_FertilizationTime) as max_date,
                MIN(embryo_embryo_number) as min_embryo_num,
                MAX(embryo_embryo_number) as max_embryo_num
            FROM gold.embryoscope_embrioes
            WHERE patient_PatientID = ?
        """, [patientid]).fetchone()
        
        if embryoscope_gold[1] > 0:
            results['embryoscope_gold'] = {
                'table': embryoscope_gold[0],
                'records': embryoscope_gold[1],
                'unique_embryos': embryoscope_gold[2],
                'date_range': f"{embryoscope_gold[3]} to {embryoscope_gold[4]}",
                'embryo_range': f"{embryoscope_gold[5]}-{embryoscope_gold[6]}"
            }
    except Exception as e:
        results['embryoscope_gold'] = {'error': str(e)}
    
    # 2. Check in Clinisys tables
    try:
        clinisys_gold = con.execute("""
            SELECT 
                'gold.clinisys_embrioes' as table_name,
                COUNT(*) as record_count,
                COUNT(DISTINCT micro_Data_DL) as unique_dates,
                MIN(micro_Data_DL) as min_date,
                MAX(micro_Data_DL) as max_date,
                MIN(oocito_embryo_number) as min_embryo_num,
                MAX(oocito_embryo_number) as max_embryo_num
            FROM gold.clinisys_embrioes
            WHERE micro_prontuario = ?
        """, [patientid]).fetchone()
        
        if clinisys_gold[1] > 0:
            results['clinisys_gold'] = {
                'table': clinisys_gold[0],
                'records': clinisys_gold[1],
                'unique_dates': clinisys_gold[2],
                'date_range': f"{clinisys_gold[3]} to {clinisys_gold[4]}",
                'embryo_range': f"{clinisys_gold[5]}-{clinisys_gold[6]}"
            }
    except Exception as e:
        results['clinisys_gold'] = {'error': str(e)}
    
    # 3. Check in combined table
    try:
        combined = con.execute("""
            SELECT 
                'gold.embryoscope_clinisys_combined' as table_name,
                COUNT(*) as record_count,
                COUNT(DISTINCT embryo_EmbryoID) as unique_embryos,
                MIN(embryo_FertilizationTime) as min_date,
                MAX(embryo_FertilizationTime) as max_date
            FROM gold.embryoscope_clinisys_combined
            WHERE micro_prontuario = ?
        """, [patientid]).fetchone()
        
        if combined[1] > 0:
            results['combined'] = {
                'table': combined[0],
                'records': combined[1],
                'unique_embryos': combined[2],
                'date_range': f"{combined[3]} to {combined[4]}"
            }
    except Exception as e:
        results['combined'] = {'error': str(e)}
    
    # 4. Check in silver layer (if exists)
    try:
        embryoscope_silver = con.execute("""
            SELECT 
                'silver.embryo_data' as table_name,
                COUNT(*) as record_count,
                COUNT(DISTINCT embryo_EmbryoID) as unique_embryos,
                MIN(embryo_FertilizationTime) as min_date,
                MAX(embryo_FertilizationTime) as max_date
            FROM silver.embryo_data
            WHERE patient_PatientID = ?
        """, [patientid]).fetchone()
        
        if embryoscope_silver[1] > 0:
            results['embryoscope_silver'] = {
                'table': embryoscope_silver[0],
                'records': embryoscope_silver[1],
                'unique_embryos': embryoscope_silver[2],
                'date_range': f"{embryoscope_silver[3]} to {embryoscope_silver[4]}"
            }
    except Exception as e:
        results['embryoscope_silver'] = {'error': str(e)}
    
    return results

def analyze_patientid_tracking(category1_patientids, category2_patientids):
    """Analyze tracking for all PatientIDs in both categories."""
    
    con = duckdb.connect('../database/huntington_data_lake.duckdb')
    
    print("=" * 100)
    print("COMPREHENSIVE PATIENTID TRACKING ANALYSIS")
    print("=" * 100)
    
    # Analyze Category 1 PatientIDs
    print(f"\n1. CATEGORY 1 PATIENTID TRACKING ({len(category1_patientids)} PatientIDs)")
    print("-" * 80)
    
    category1_tracking = {}
    for i, patientid in enumerate(category1_patientids[:10]):  # Limit to first 10 for display
        print(f"\nAnalyzing PatientID {patientid} ({i+1}/10)...")
        tracking = track_patientid_across_tables(patientid, con)
        category1_tracking[patientid] = tracking
        
        # Display results
        print(f"  Embryoscope Gold: {tracking.get('embryoscope_gold', {}).get('records', 0)} records")
        print(f"  Clinisys Gold: {tracking.get('clinisys_gold', {}).get('records', 0)} records")
        print(f"  Combined: {tracking.get('combined', {}).get('records', 0)} records")
        print(f"  Embryoscope Silver: {tracking.get('embryoscope_silver', {}).get('records', 0)} records")
    
    # Analyze Category 2 PatientIDs
    print(f"\n2. CATEGORY 2 PATIENTID TRACKING ({len(category2_patientids)} PatientIDs)")
    print("-" * 80)
    
    category2_tracking = {}
    for i, patientid in enumerate(category2_patientids[:10]):  # Limit to first 10 for display
        print(f"\nAnalyzing PatientID {patientid} ({i+1}/10)...")
        tracking = track_patientid_across_tables(patientid, con)
        category2_tracking[patientid] = tracking
        
        # Display results
        print(f"  Embryoscope Gold: {tracking.get('embryoscope_gold', {}).get('records', 0)} records")
        print(f"  Clinisys Gold: {tracking.get('clinisys_gold', {}).get('records', 0)} records")
        print(f"  Combined: {tracking.get('combined', {}).get('records', 0)} records")
        print(f"  Embryoscope Silver: {tracking.get('embryoscope_silver', {}).get('records', 0)} records")
    
    # Summary statistics
    print(f"\n3. TRACKING SUMMARY")
    print("-" * 50)
    
    # Category 1 summary
    cat1_embryoscope_count = sum(1 for tracking in category1_tracking.values() 
                                if tracking.get('embryoscope_gold', {}).get('records', 0) > 0)
    cat1_clinisys_count = sum(1 for tracking in category1_tracking.values() 
                             if tracking.get('clinisys_gold', {}).get('records', 0) > 0)
    cat1_combined_count = sum(1 for tracking in category1_tracking.values() 
                             if tracking.get('combined', {}).get('records', 0) > 0)
    
    print(f"Category 1 (sampled {len(category1_tracking)} PatientIDs):")
    print(f"  • Found in Embryoscope: {cat1_embryoscope_count}")
    print(f"  • Found in Clinisys: {cat1_clinisys_count}")
    print(f"  • Found in Combined: {cat1_combined_count}")
    
    # Category 2 summary
    cat2_embryoscope_count = sum(1 for tracking in category2_tracking.values() 
                                if tracking.get('embryoscope_gold', {}).get('records', 0) > 0)
    cat2_clinisys_count = sum(1 for tracking in category2_tracking.values() 
                             if tracking.get('clinisys_gold', {}).get('records', 0) > 0)
    cat2_combined_count = sum(1 for tracking in category2_tracking.values() 
                             if tracking.get('combined', {}).get('records', 0) > 0)
    
    print(f"\nCategory 2 (sampled {len(category2_tracking)} PatientIDs):")
    print(f"  • Found in Embryoscope: {cat2_embryoscope_count}")
    print(f"  • Found in Clinisys: {cat2_clinisys_count}")
    print(f"  • Found in Combined: {cat2_combined_count}")
    
    con.close()
    
    return category1_tracking, category2_tracking

# Run the comprehensive tracking analysis
# Note: Make sure category1_patientids and category2_patientids are available from previous cells
if 'category1_patientids' in locals() and 'category2_patientids' in locals():
    category1_tracking, category2_tracking = analyze_patientid_tracking(category1_patientids, category2_patientids)
else:
    print("Please run the Category 1 and Category 2 analysis cells first to get the PatientID lists.")


COMPREHENSIVE PATIENTID TRACKING ANALYSIS

1. CATEGORY 1 PATIENTID TRACKING (348 PatientIDs)
--------------------------------------------------------------------------------

Analyzing PatientID 753333 (1/10)...
  Embryoscope Gold: 78 records
  Clinisys Gold: 70 records
  Combined: 70 records
  Embryoscope Silver: 0 records

Analyzing PatientID 788836 (2/10)...
  Embryoscope Gold: 48 records
  Clinisys Gold: 19 records
  Combined: 19 records
  Embryoscope Silver: 0 records

Analyzing PatientID 747405 (3/10)...
  Embryoscope Gold: 38 records
  Clinisys Gold: 8 records
  Combined: 13 records
  Embryoscope Silver: 0 records

Analyzing PatientID 830027 (4/10)...
  Embryoscope Gold: 36 records
  Clinisys Gold: 14 records
  Combined: 14 records
  Embryoscope Silver: 0 records

Analyzing PatientID 139138 (5/10)...
  Embryoscope Gold: 41 records
  Clinisys Gold: 20 records
  Combined: 20 records
  Embryoscope Silver: 0 records

Analyzing PatientID 813209 (6/10)...
  Embryoscope Gold: 54 record

In [31]:
## **Cell 4: Individual PatientID Investigation Tool**

def investigate_specific_patientid(patientid):
    """Investigate a specific PatientID in detail across all tables."""
    
    con = duckdb.connect('../database/huntington_data_lake.duckdb')
    
    print(f"=" * 80)
    print(f"DETAILED INVESTIGATION: PatientID {patientid}")
    print(f"=" * 80)
    
    # 1. Embryoscope Gold Layer
    print(f"\n1. EMBRYOSCOPE GOLD LAYER")
    print("-" * 40)
    try:
        embryoscope_data = con.execute("""
            SELECT 
                embryo_EmbryoID,
                embryo_FertilizationTime,
                embryo_embryo_number,
                patient_PatientID,
                treatment_TreatmentName
            FROM gold.embryoscope_embrioes
            WHERE patient_PatientID = ?
            ORDER BY embryo_FertilizationTime, embryo_embryo_number
        """, [patientid]).fetchdf()
        
        if len(embryoscope_data) > 0:
            print(f"Found {len(embryoscope_data)} records")
            print(embryoscope_data.to_string(index=False))
        else:
            print("No records found")
    except Exception as e:
        print(f"Error: {e}")
    
    # 2. Clinisys Gold Layer
    print(f"\n2. CLINISYS GOLD LAYER")
    print("-" * 40)
    try:
        clinisys_data = con.execute("""
            SELECT 
                micro_prontuario,
                micro_Data_DL,
                oocito_embryo_number,
                micro_peso_total_psd
            FROM gold.clinisys_embrioes
            WHERE micro_prontuario = ?
            ORDER BY micro_Data_DL, oocito_embryo_number
        """, [patientid]).fetchdf()
        
        if len(clinisys_data) > 0:
            print(f"Found {len(clinisys_data)} records")
            print(clinisys_data.to_string(index=False))
        else:
            print("No records found")
    except Exception as e:
        print(f"Error: {e}")
    
    # 3. Combined Table
    print(f"\n3. COMBINED TABLE")
    print("-" * 40)
    try:
        combined_data = con.execute("""
            SELECT 
                embryo_EmbryoID,
                micro_prontuario,
                embryo_FertilizationTime,
                micro_Data_DL,
                embryo_embryo_number,
                oocito_embryo_number
            FROM gold.embryoscope_clinisys_combined
            WHERE micro_prontuario = ?
            ORDER BY embryo_FertilizationTime
        """, [patientid]).fetchdf()
        
        if len(combined_data) > 0:
            print(f"Found {len(combined_data)} records")
            print(combined_data.to_string(index=False))
        else:
            print("No records found")
    except Exception as e:
        print(f"Error: {e}")
    
    # 4. Embryoscope Silver Layer (Fixed - join with patients table)
    print(f"\n4. EMBRYOSCOPE SILVER LAYER")
    print("-" * 40)
    try:
        # First, find the PatientIDx that corresponds to this PatientID in the patients table
        patient_mapping = con.execute("""
            SELECT PatientIDx, PatientID
            FROM silver_embryoscope.patients
            WHERE PatientID = ?
        """, [patientid]).fetchdf()
        
        if len(patient_mapping) > 0:
            patientidx = patient_mapping.iloc[0]['PatientIDx']
            print(f"Found PatientIDx: {patientidx} for PatientID: {patientid}")
            
            # Now query embryo_data using PatientIDx
            silver_data = con.execute("""
                SELECT 
                    e.EmbryoID,
                    e.FertilizationTime,
                    e.embryo_number,
                    e.PatientIDx,
                    p.PatientID
                FROM silver_embryoscope.embryo_data e
                JOIN silver_embryoscope.patients p ON e.PatientIDx = p.PatientIDx
                WHERE p.PatientID = ?
                ORDER BY e.FertilizationTime, e.embryo_number
            """, [patientid]).fetchdf()
            
            if len(silver_data) > 0:
                print(f"Found {len(silver_data)} records")
                print(silver_data.to_string(index=False))
            else:
                print("No records found")
        else:
            print(f"PatientID {patientid} not found in silver_embryoscope.patients table")
            
    except Exception as e:
        print(f"Error: {e}")
    
    # 5. Summary
    print(f"\n5. SUMMARY")
    print("-" * 40)
    print(f"PatientID {patientid}:")
    print(f"  • Embryoscope records: {len(embryoscope_data) if 'embryoscope_data' in locals() else 0}")
    print(f"  • Clinisys records: {len(clinisys_data) if 'clinisys_data' in locals() else 0}")
    print(f"  • Combined records: {len(combined_data) if 'combined_data' in locals() else 0}")
    print(f"  • Silver records: {len(silver_data) if 'silver_data' in locals() else 0}")
    
    # 6. Analysis
    print(f"\n6. ANALYSIS")
    print("-" * 40)
    
    embryoscope_count = len(embryoscope_data) if 'embryoscope_data' in locals() else 0
    clinisys_count = len(clinisys_data) if 'clinisys_data' in locals() else 0
    combined_count = len(combined_data) if 'combined_data' in locals() else 0
    
    if embryoscope_count > 0 and clinisys_count == 0:
        print(f"  • Category 2: PatientID exists in Embryoscope but NOT in Clinisys")
        print(f"  • This explains why no matches were found")
    elif embryoscope_count > 0 and clinisys_count > 0 and combined_count == 0:
        print(f"  • Category 1: PatientID exists in both systems but no matches")
        print(f"  • Potential date/embryo number mismatch")
    elif embryoscope_count > 0 and clinisys_count > 0 and combined_count > 0:
        print(f"  • Some matches found: {combined_count} out of {embryoscope_count} embryos")
        print(f"  • Missing matches: {embryoscope_count - combined_count} embryos")
    else:
        print(f"  • No data found in any table")
    
    con.close()

In [32]:

# Example usage:
investigate_specific_patientid(161746)  # Category 1 example
# investigate_specific_patientid(1)       # Category 2 example


DETAILED INVESTIGATION: PatientID 161746

1. EMBRYOSCOPE GOLD LAYER
----------------------------------------
Found 13 records
             embryo_EmbryoID embryo_FertilizationTime  embryo_embryo_number  patient_PatientID treatment_TreatmentName
D2025.07.12_S04193_I3027_P-1      2025-07-12 11:35:00                     1             161746             2025 - 1433
D2025.07.12_S04193_I3027_P-2      2025-07-12 11:35:00                     2             161746             2025 - 1433
D2025.07.12_S04193_I3027_P-3      2025-07-12 11:35:00                     3             161746             2025 - 1433
D2025.07.12_S04193_I3027_P-4      2025-07-12 11:35:00                     4             161746             2025 - 1433
D2025.07.12_S04193_I3027_P-5      2025-07-12 11:35:00                     5             161746             2025 - 1433
D2025.07.12_S04193_I3027_P-6      2025-07-12 11:35:00                     6             161746             2025 - 1433
D2025.07.12_S04194_I3027_P-1      2025-07

In [37]:
## **Cell 5: Silver Layer PatientID Mapping Helper**

def check_silver_patientid_mapping(patientid):
    """Check if a PatientID exists in the silver layer and show its mapping."""
    
    con = duckdb.connect('../database/huntington_data_lake.duckdb')
    
    print(f"Checking PatientID {patientid} in silver layer...")
    
    # Check if PatientID exists in patients table
    patient_info = con.execute("""
        SELECT PatientIDx, PatientID
        FROM silver_embryoscope.patients
        WHERE PatientID = ?
    """, [patientid]).fetchdf()
    
    if len(patient_info) > 0:
        patientidx = patient_info.iloc[0]['PatientIDx']
        # patientname = patient_info.iloc[0]['PatientName']
        
        print(f"✓ Found in silver_embryoscope.patients:")
        print(f"  • PatientID: {patientid}")
        print(f"  • PatientIDx: {patientidx}")
        # print(f"  • PatientName: {patientname}")
        
        # Check how many embryos this patient has in silver layer
        embryo_count = con.execute("""
            SELECT COUNT(*) as embryo_count
            FROM silver_embryoscope.embryo_data
            WHERE PatientIDx = ?
        """, [patientidx]).fetchone()[0]
        
        print(f"  • Embryos in silver layer: {embryo_count}")
        
    else:
        print(f"✗ PatientID {patientid} not found in silver_embryoscope.patients")
    
    con.close()


In [38]:
# Example usage:
check_silver_patientid_mapping(161746)

Checking PatientID 161746 in silver layer...
✓ Found in silver_embryoscope.patients:
  • PatientID: 161746
  • PatientIDx: PC1P7BHG_45850.4034504514
  • Embryos in silver layer: 13


In [15]:

## **Cell 5: Batch PatientID Investigation**
def batch_investigate_patientids(patientid_list, max_samples=5):
    """Investigate multiple PatientIDs in batch."""
    
    print(f"=" * 80)
    print(f"BATCH INVESTIGATION: {len(patientid_list)} PatientIDs")
    print(f"=" * 80)
    
    # Sample a subset for analysis
    sample_patientids = patientid_list[:max_samples]
    
    for i, patientid in enumerate(sample_patientids):
        print(f"\n{'='*60}")
        print(f"PATIENTID {i+1}/{len(sample_patientids)}: {patientid}")
        print(f"{'='*60}")
        
        investigate_specific_patientid(patientid)
        
        if i < len(sample_patientids) - 1:
            print(f"\n" + "="*80)
            input("Press Enter to continue to next PatientID...")


In [None]:

# Example usage:
# batch_investigate_patientids(category1_patientids[:3])  # First 3 Category 1 PatientIDs
# batch_investigate_patientids(category2_patientids[:3])  # First 3 Category 2 PatientIDs

BATCH INVESTIGATION: 3 PatientIDs

PATIENTID 1/3: 753333
DETAILED INVESTIGATION: PatientID 753333

1. EMBRYOSCOPE GOLD LAYER
----------------------------------------
Found 78 records
              embryo_EmbryoID embryo_FertilizationTime  embryo_embryo_number  patient_PatientID treatment_TreatmentName
 D2022.10.28_S02632_I3166_P-1      2022-10-28 10:50:00                     1             753333               1668-2022
 D2022.10.28_S02632_I3166_P-2      2022-10-28 10:50:00                     2             753333               1668-2022
 D2022.10.28_S02632_I3166_P-3      2022-10-28 10:50:00                     3             753333               1668-2022
 D2022.10.28_S02632_I3166_P-4      2022-10-28 10:50:00                     4             753333               1668-2022
 D2022.10.28_S02632_I3166_P-5      2022-10-28 10:50:00                     5             753333               1668-2022
 D2022.10.28_S02632_I3166_P-6      2022-10-28 10:50:00                     6             753333  

In [17]:

## **Cell 6: Quick PatientID Status Check**


def quick_patientid_status(patientid):
    """Quick check of PatientID status across tables."""
    
    con = duckdb.connect(db_path)
    
    # Quick counts
    embryoscope_count = con.execute("""
        SELECT COUNT(*) FROM gold.embryoscope_embrioes WHERE patient_PatientID = ?
    """, [patientid]).fetchone()[0]
    
    clinisys_count = con.execute("""
        SELECT COUNT(*) FROM gold.clinisys_embrioes WHERE micro_prontuario = ?
    """, [patientid]).fetchone()[0]
    
    combined_count = con.execute("""
        SELECT COUNT(*) FROM gold.embryoscope_clinisys_combined WHERE micro_prontuario = ?
    """, [patientid]).fetchone()[0]
    
    print(f"PatientID {patientid} Status:")
    print(f"  • Embryoscope: {embryoscope_count} records")
    print(f"  • Clinisys: {clinisys_count} records")
    print(f"  • Combined: {combined_count} records")
    
    if embryoscope_count > 0 and clinisys_count == 0:
        print(f"  • Category: 2 (Missing from Clinisys)")
    elif embryoscope_count > 0 and clinisys_count > 0 and combined_count == 0:
        print(f"  • Category: 1 (No matches despite both present)")
    elif embryoscope_count > 0 and clinisys_count > 0 and combined_count > 0:
        print(f"  • Status: Partially matched ({combined_count}/{embryoscope_count})")
    else:
        print(f"  • Status: No data found")
    
    con.close()


In [18]:

# Example usage:
quick_patientid_status(161746)
# quick_patientid_status(753333)
# quick_patientid_status(1)

PatientID 161746 Status:
  • Embryoscope: 13 records
  • Clinisys: 9 records
  • Combined: 9 records
  • Status: Partially matched (9/13)
