# Exploration of Combined Embryos Table

This notebook explores the combined `embryoscope_clinisys_combined` table that joins embryoscope and clinisys embryo data.

## Overview
- **Table**: `gold.embryoscope_clinisys_combined` (schema `gold`)
- **Source**: Combined from embryoscope and clinisys gold layers
- **Join Keys**: Date and embryo number
- **Database**: `huntington_data_lake.duckdb`

In [None]:
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session so that library notices
# do not clutter the rendered cell outputs.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib's stock style first, then the seaborn "husl"
# colour palette on top of it (keep this order).
plt.style.use('default')
sns.set_palette("husl")

# Timestamp banner marking the beginning of the run.
print(f"Exploration started at: {datetime.now()}")

## Database Connection

In [None]:
# Connect to the database.
#
# The connection is opened read-only because this notebook only issues
# SELECT / DESCRIBE statements. Read-only mode:
#   * does not take the exclusive write lock, so other processes can keep
#     using the same database file while the notebook is open, and
#   * raises an error when the file is missing instead of silently creating a
#     new, empty database at a mistyped path.
db_path = "../database/huntington_data_lake.duckdb"
conn = duckdb.connect(db_path, read_only=True)

print(f"Connected to database: {db_path}")

## Table Overview

In [None]:
# Check if the combined table exists.
#
# Tables are listed as "<schema>.<table>" from information_schema so that the
# schema-qualified name below can actually be matched; SHOW TABLES yields bare
# table names only, which can never equal a "gold."-prefixed name.
# `tables` stays a list of 1-tuples, so `table[0]` is the qualified name.
tables = conn.execute("""
    SELECT table_schema || '.' || table_name AS qualified_name
    FROM information_schema.tables
    ORDER BY qualified_name
""").fetchall()
print("Available tables:")
for table in tables:
    print(f"  - {table[0]}")

print("\n" + "="*50)

# Check the combined table specifically. The comparison is case-insensitive
# because DuckDB resolves identifiers without regard to case.
combined_table = "gold.embryoscope_clinisys_combined"
if any(table[0].lower() == combined_table.lower() for table in tables):
    print(f"✓ {combined_table} table found")
else:
    print(f"✗ {combined_table} table not found")
    print("Available tables with 'embryo' in name:")
    for table in tables:
        if 'embryo' in table[0].lower():
            print(f"  - {table[0]}")

## Basic Table Information

In [None]:
# Column metadata of the combined table: DESCRIBE yields one row per column.
# `schema` is kept as the raw list of row tuples.
schema = conn.execute(f"DESCRIBE {combined_table}").fetchall()

print(f"Table Schema for {combined_table}:")
print("="*80)
# Only the first three fields of each DESCRIBE row are printed, left-aligned
# in fixed-width columns; any remaining fields are ignored.
for column_name, column_type, third_field, *_unused in schema:
    print(f"{column_name:<30} {column_type:<20} {third_field}")

print(f"\nTotal columns: {len(schema)}")

In [None]:
# Total number of rows in the combined table (single-value result row).
(row_count,) = conn.execute(f"SELECT COUNT(*) FROM {combined_table}").fetchone()
print(f"Total rows: {row_count:,}")

# A handful of rows to eyeball the column contents.
sample = conn.execute(f"SELECT * FROM {combined_table} LIMIT 5").fetchdf()
print("\nSample data:")
display(sample)

## Data Quality Analysis

In [None]:
# Check for null values.
#
# All per-column NULL counts are computed in ONE table scan rather than one
# query per column. COUNT(<col>) ignores NULLs, so COUNT(*) - COUNT(<col>) is
# that column's NULL total.
# Every identifier is double-quoted (embedded quotes doubled) so that column
# names containing spaces, punctuation or reserved words cannot break the SQL.
column_names = [col[0] for col in schema]
if column_names:
    null_exprs = ", ".join(
        'COUNT(*) - COUNT("{}")'.format(name.replace('"', '""'))
        for name in column_names
    )
    null_row = conn.execute(f"SELECT {null_exprs} FROM {combined_table}").fetchone()
else:
    null_row = ()
null_counts = dict(zip(column_names, null_row))

# Convert to DataFrame for better display
null_df = pd.DataFrame(list(null_counts.items()), columns=['Column', 'Null_Count'])
# Guard the denominator: with an empty table the division would produce NaN.
if row_count:
    null_df['Null_Percentage'] = (null_df['Null_Count'] / row_count * 100).round(2)
else:
    null_df['Null_Percentage'] = 0.0
null_df = null_df.sort_values('Null_Count', ascending=False)

print("Columns with null values:")
display(null_df[null_df['Null_Count'] > 0].head(20))

# Plot null percentages for top columns
top_null_cols = null_df[null_df['Null_Count'] > 0].head(15)
if len(top_null_cols) > 0:
    plt.figure(figsize=(12, 8))
    plt.barh(range(len(top_null_cols)), top_null_cols['Null_Percentage'])
    plt.yticks(range(len(top_null_cols)), top_null_cols['Column'])
    plt.xlabel('Null Percentage (%)')
    plt.title('Top 15 Columns with Highest Null Percentages')
    plt.tight_layout()
    plt.show()

## Source System Analysis

In [None]:
# Split the records by whether an embryoscope embryo id is present, and count
# rows plus distinct ids from each system within each group.
source_sql = f"""
    SELECT 
        CASE 
            WHEN embryo_EmbryoID IS NOT NULL THEN 'Embryoscope'
            ELSE 'Clinisys Only'
        END as source_system,
        COUNT(*) as record_count,
        COUNT(DISTINCT oocito_id) as unique_clinisys_oocitos,
        COUNT(DISTINCT embryo_EmbryoID) as unique_embryoscope_embryos
    FROM {combined_table}
    GROUP BY 
        CASE 
            WHEN embryo_EmbryoID IS NOT NULL THEN 'Embryoscope'
            ELSE 'Clinisys Only'
        END
"""
source_analysis = conn.execute(source_sql).fetchdf()

print("Data distribution by source system:")
display(source_analysis)

# Pie chart of the share of records in each group.
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    source_analysis['record_count'],
    labels=source_analysis['source_system'],
    autopct='%1.1f%%',
)
ax.set_title('Record Distribution by Source System')
plt.show()

## Date Range Analysis

In [None]:
# Earliest / latest values and distinct-value counts for the two date columns.
date_sql = f"""
    SELECT 
        MIN(micro_Data_DL) as min_clinisys_date,
        MAX(micro_Data_DL) as max_clinisys_date,
        MIN(embryo_FertilizationTime) as min_embryoscope_date,
        MAX(embryo_FertilizationTime) as max_embryoscope_date,
        COUNT(DISTINCT micro_Data_DL) as unique_clinisys_dates,
        COUNT(DISTINCT embryo_FertilizationTime) as unique_embryoscope_dates
    FROM {combined_table}
"""
date_analysis = conn.execute(date_sql).fetchdf()

print("Date range analysis:")
display(date_analysis)

# Records per calendar month of micro_Data_DL. Note the LIMIT: only the first
# 20 months in ascending order are returned.
monthly_sql = f"""
    SELECT 
        strftime('%Y-%m', micro_Data_DL) as month,
        COUNT(*) as record_count
    FROM {combined_table}
    WHERE micro_Data_DL IS NOT NULL
    GROUP BY strftime('%Y-%m', micro_Data_DL)
    ORDER BY month
    LIMIT 20
"""
monthly_dist = conn.execute(monthly_sql).fetchdf()

# Line chart of the monthly counts; skipped when the query returned no rows.
if not monthly_dist.empty:
    positions = range(len(monthly_dist))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(positions, monthly_dist['record_count'], marker='o')
    ax.set_xticks(positions)
    ax.set_xticklabels(monthly_dist['month'], rotation=45)
    ax.set_xlabel('Month')
    ax.set_ylabel('Record Count')
    ax.set_title('Monthly Record Distribution (Clinisys)')
    fig.tight_layout()
    plt.show()

## Embryo Number Analysis

In [None]:
# Most frequent (clinisys number, embryoscope number) pairs among rows where
# both embryo numbers are present.
pairs_sql = f"""
    SELECT 
        oocito_embryo_number,
        embryo_embryo_number,
        COUNT(*) as match_count
    FROM {combined_table}
    WHERE oocito_embryo_number IS NOT NULL AND embryo_embryo_number IS NOT NULL
    GROUP BY oocito_embryo_number, embryo_embryo_number
    ORDER BY match_count DESC
    LIMIT 10
"""
embryo_analysis = conn.execute(pairs_sql).fetchdf()

print("Top embryo number matches:")
display(embryo_analysis)

# The 20 most frequent clinisys embryo numbers.
frequency_sql = f"""
    SELECT 
        oocito_embryo_number,
        COUNT(*) as count
    FROM {combined_table}
    WHERE oocito_embryo_number IS NOT NULL
    GROUP BY oocito_embryo_number
    ORDER BY count DESC
    LIMIT 20
"""
embryo_dist = conn.execute(frequency_sql).fetchdf()

# Bar chart of those frequencies; skipped when the query returned no rows.
if not embryo_dist.empty:
    positions = range(len(embryo_dist))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(positions, embryo_dist['count'])
    ax.set_xlabel('Embryo Number')
    ax.set_ylabel('Count')
    ax.set_title('Top 20 Embryo Numbers by Frequency')
    ax.set_xticks(positions)
    ax.set_xticklabels(embryo_dist['oocito_embryo_number'], rotation=45)
    fig.tight_layout()
    plt.show()

## Key Metrics Summary

In [None]:
# One-row summary: record totals, distinct id / embryo-number counts, and the
# share of rows that carry an embryoscope embryo id.
summary_sql = f"""
    SELECT 
        COUNT(*) as total_records,
        COUNT(DISTINCT oocito_id) as unique_clinisys_oocitos,
        COUNT(DISTINCT embryo_EmbryoID) as unique_embryoscope_embryos,
        COUNT(DISTINCT oocito_embryo_number) as unique_clinisys_embryos,
        COUNT(DISTINCT embryo_embryo_number) as unique_embryoscope_embryo_numbers,
        COUNT(CASE WHEN embryo_EmbryoID IS NOT NULL THEN 1 END) as matched_records,
        ROUND(COUNT(CASE WHEN embryo_EmbryoID IS NOT NULL THEN 1 END) * 100.0 / COUNT(*), 2) as match_percentage
    FROM {combined_table}
"""
summary = conn.execute(summary_sql).fetchdf()

print("Key Metrics Summary:")
print("="*50)
# Each scalar is read from its own column so integer metrics keep their
# integer dtype (and therefore their formatting).
for metric in summary.columns:
    value = summary[metric].iat[0]
    if 'percentage' not in metric.lower():
        # Plain counts get thousands separators.
        print(f"{metric.replace('_', ' ').title()}: {value:,}")
        continue
    # Percentage metrics are printed as-is with a trailing % sign.
    print(f"{metric.replace('_', ' ').title()}: {value}%")

## Data Quality Issues

In [None]:
# Check for potential data quality issues.
#
# Three counts are stacked with UNION ALL, one row per issue type:
#   1. rows that have a clinisys id but no embryoscope embryo id,
#   2. rows where both embryo numbers are present but differ,
#   3. rows where the two dates are more than one day apart.
#
# The date check compares epoch seconds (86400 s = 1 day). The SQLite function
# JULIANDAY is not used because DuckDB does not provide it. Both values are
# cast to TIMESTAMP so the comparison works whether a column is stored as
# DATE or TIMESTAMP.
issues = conn.execute(f"""
    SELECT 
        'Records with clinisys data but no embryoscope match' as issue_type,
        COUNT(*) as count
    FROM {combined_table}
    WHERE oocito_id IS NOT NULL AND embryo_EmbryoID IS NULL
    
    UNION ALL
    
    SELECT 
        'Records with mismatched embryo numbers' as issue_type,
        COUNT(*) as count
    FROM {combined_table}
    WHERE oocito_embryo_number IS NOT NULL 
        AND embryo_embryo_number IS NOT NULL
        AND oocito_embryo_number != embryo_embryo_number
    
    UNION ALL
    
    SELECT 
        'Records with date mismatches (>1 day difference)' as issue_type,
        COUNT(*) as count
    FROM {combined_table}
    WHERE micro_Data_DL IS NOT NULL 
        AND embryo_FertilizationTime IS NOT NULL
        AND ABS(
            epoch(CAST(micro_Data_DL AS TIMESTAMP))
            - epoch(CAST(embryo_FertilizationTime AS TIMESTAMP))
        ) > 86400
""").fetchdf()

print("Potential Data Quality Issues:")
display(issues)

## Recommendations

In [None]:
# Static recommendation text; the single placeholder is filled with the match
# percentage taken from the summary frame.
recommendations_template = """
## Recommendations for Data Quality Improvement:

1. **Low Match Rate**: Only {:.1f}% of records have matches between systems
   - Investigate why so many clinisys records don't have embryoscope counterparts
   - Check if date ranges overlap between systems
   - Verify embryo number formats are consistent

2. **Data Completeness**: 
   - Focus on columns with high null percentages
   - Consider data validation rules for critical fields

3. **Date Alignment**:
   - Ensure both systems use the same date format and timezone
   - Consider if date matching criteria should be more flexible

4. **Embryo Number Consistency**:
   - Verify embryo number formats and ranges in both systems
   - Check for leading zeros or formatting differences

5. **Monitoring**:
   - Set up regular monitoring of match rates
   - Track data quality metrics over time
"""
match_rate = summary['match_percentage'].iloc[0]
print(recommendations_template.format(match_rate))

In [None]:
# Release the DuckDB connection first, then report the finish time and confirm
# the shutdown (two output lines, emitted by a single print call).
conn.close()
print(
    f"Exploration completed at: {datetime.now()}",
    "Database connection closed.",
    sep="\n",
)