# Simple Data Integration

Basic notebook for data integration tasks:
1. Create datasets from DuckDB
2. Parse TypeScript interface from .ts file  
3. Parse JSON data into Python objects
4. Merge datasets
5. Create rank/dose pairs from query results

In [None]:
# Basic imports shared by every cell below.
# pandas builds and merges the DataFrames, re drives the TypeScript interface
# parser, and BMDExpressDuckDBSink is the project-local wrapper used to query
# the DuckDB file.
# NOTE(review): json is only referenced by the commented-out file-loading
# template, and numpy / Path are not referenced by any cell visible here.
import pandas as pd
import numpy as np
import json
import re
from pathlib import Path
from bmd_duckdb_sink import BMDExpressDuckDBSink

# Printed only when every import above succeeded.
print("Imports ready")

## 1. Create Datasets from DuckDB

In [None]:
# Connect to DuckDB and create datasets.
# The connection stays open for the rest of the notebook; the final cell
# calls sink.disconnect().
sink = BMDExpressDuckDBSink("dehp_analysis.duckdb")
sink.connect()

# Dataset 1: Basic probe data (probe id, probe set, gene symbol, gene name).
# The WHERE clause drops probes without a gene symbol, so the LEFT JOINs
# effectively behave like inner joins here.
# ORDER BY makes the LIMIT deterministic: without it the database is free to
# return any 50 matching rows, so re-running the cell could change the data.
# NOTE(review): execute_query is assumed to return a pandas DataFrame
# (len() and .head() are used on the result) - confirm against the sink API.
probe_dataset = sink.execute_query("""
    SELECT 
        p.id as probe_id,
        p.probeSet,
        rg.geneSymbol,
        rg.geneName
    FROM probes p
    LEFT JOIN referenceGeneAnnotations rga ON p.id = rga.probeId
    LEFT JOIN referenceGenes rg ON rga.referenceGeneId = rg.id
    WHERE rg.geneSymbol IS NOT NULL
    ORDER BY p.id, rg.geneSymbol
    LIMIT 50
""")

print(f"Probe dataset: {len(probe_dataset)} records")
# Bare expression: displayed as the cell output when run in a notebook.
probe_dataset.head()

In [None]:
# Dataset 2: Dose information.
# One row per distinct dose in doseGroups, holding the number of doseGroups
# rows at that dose and the mean of column n.
# NOTE(review): the avg_sample_size alias suggests n is the per-group sample
# size - confirm against the table schema.
dose_dataset = sink.execute_query("""
    SELECT 
        dose,
        COUNT(*) as dose_count,
        AVG(n) as avg_sample_size
    FROM doseGroups 
    GROUP BY dose 
    ORDER BY dose
""")

print(f"Dose dataset: {len(dose_dataset)} records")
# Bare expression: displayed as the cell output when run in a notebook.
dose_dataset

## 2. Parse TypeScript Interface

In [None]:
# Simple TypeScript interface parser
def _extract_braced_body(text, start):
    """Return the text inside the first ``{...}`` pair at or after *start*.

    Braces are balanced, so an inline object type does not end the body
    early. If there is no opening brace the remainder of *text* is returned;
    if the braces never balance, everything after the opening brace is
    returned.
    """
    open_pos = text.find('{', start)
    if open_pos == -1:
        return text[start:]

    depth = 0
    for pos in range(open_pos, len(text)):
        char = text[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[open_pos + 1:pos]
    return text[open_pos + 1:]


def parse_ts_interface(ts_content):
    """Parse the first TypeScript interface found in *ts_content*.

    Only ``name: type`` / ``name?: type`` members inside that interface's
    braces are reported, so other interfaces, function parameters or object
    literals elsewhere in the file are not mistaken for its properties.
    This is still a regex-based parser: members of nested inline object
    types are reported as separate properties.

    Args:
        ts_content: TypeScript source text.

    Returns:
        A dict with ``'interface_name'`` (``"Unknown"`` when no ``interface``
        declaration exists) and ``'properties'``, a list of dicts with keys
        ``'name'``, ``'type'`` and ``'optional'``. When no interface is
        declared, the whole text is scanned for members.
    """
    interface_match = re.search(r'interface\s+(\w+)', ts_content)
    if interface_match:
        interface_name = interface_match.group(1)
        # Restrict the property scan to this interface's own body.
        body = _extract_braced_body(ts_content, interface_match.end())
    else:
        interface_name = "Unknown"
        body = ts_content

    # name, optional '?', then the type up to the terminating ';' or newline
    prop_pattern = r'(\w+)(\??):\s*([^;\n]+)'

    properties = [
        {
            'name': match.group(1),
            # TypeScript also allows ',' as a member separator; drop it so it
            # does not end up inside the reported type.
            'type': match.group(3).strip().rstrip(',').rstrip(),
            'optional': match.group(2) == '?',
        }
        for match in re.finditer(prop_pattern, body)
    ]

    return {
        'interface_name': interface_name,
        'properties': properties
    }

# Example TypeScript content (replace with your actual .ts file content).
# It declares five properties, one of them optional (geneSymbol?), so both
# values of the parser's 'optional' flag show up in the output.
sample_ts = """
interface ProbeResult {
    id: number;
    probeSet: string;
    geneSymbol?: string;
    bmdValue: number;
    rank: number;
}
"""

# Parse the interface into {'interface_name': ..., 'properties': [...]}
ts_schema = parse_ts_interface(sample_ts)
print(f"Parsed interface: {ts_schema['interface_name']}")
print("Properties:")
for prop in ts_schema['properties']:
    # Re-attach the '?' marker so the printout mirrors the TypeScript source
    optional = "?" if prop['optional'] else ""
    print(f"  {prop['name']}{optional}: {prop['type']}")

# TODO: Replace sample_ts with actual file content
# with open('your_file.ts', 'r') as f:
#     ts_content = f.read()
#     ts_schema = parse_ts_interface(ts_content)

## 3. Parse JSON to Python Objects

In [None]:
# Sample JSON data (replace with your actual JSON).
# The records sit under a top-level "results" key, and each record uses the
# same field names as the ProbeResult interface in the sample TypeScript.
sample_json = {
    "results": [
        {"id": 1, "probeSet": "Probe_001", "geneSymbol": "GENE1", "bmdValue": 5.2, "rank": 1},
        {"id": 2, "probeSet": "Probe_002", "geneSymbol": "GENE2", "bmdValue": 8.7, "rank": 2},
        {"id": 3, "probeSet": "Probe_003", "geneSymbol": "GENE3", "bmdValue": 12.1, "rank": 3}
    ]
}

# Convert JSON to Python objects/DataFrame
def json_to_dataframe(json_data, results_key="results"):
    """Convert decoded JSON data to a DataFrame.

    Args:
        json_data: Decoded JSON. Either a dict holding the records under
            ``results_key``, a list of records, or a single record/value.
        results_key: Key under which a dict payload stores its records.

    Returns:
        A DataFrame built from the records found under ``results_key``, from
        the list itself, or from the single value wrapped in a one-element
        list.
    """
    # Only dicts can be indexed by key. Checking the type first keeps a list
    # that merely contains the string "results" (or a str/number payload)
    # from being indexed with the key and raising TypeError.
    if isinstance(json_data, dict) and results_key in json_data:
        records = json_data[results_key]
    elif isinstance(json_data, list):
        records = json_data
    else:
        # A single record (or scalar) becomes a one-row frame.
        records = [json_data]

    return pd.DataFrame(records)

# Create DataFrame from JSON: one row per entry under the "results" key
json_dataset = json_to_dataframe(sample_json)

print(f"JSON dataset: {len(json_dataset)} records")
# Bare expression: displayed as the cell output when run in a notebook.
json_dataset

In [None]:
# TODO: Load from actual JSON file.
# Template kept commented out on purpose; uncomment and point it at a real
# file to replace the sample-based json_dataset.
# with open('your_data.json', 'r') as f:
#     json_data = json.load(f)
#     json_dataset = json_to_dataframe(json_data)

print("JSON parsing ready - replace sample data with your files")

## 4. Merge Datasets

In [None]:
# Combine the DuckDB probe rows with the JSON records.
# Example join: match rows on the gene symbol present in both datasets.

# Show both column sets first so the join key can be checked by eye
print("Probe dataset columns:", probe_dataset.columns.tolist())
print("JSON dataset columns:", json_dataset.columns.tolist())

# Simple merge example - customize the key columns and join type as needed.
# Any other column name shared by both sides gets a '_db' / '_json' suffix.
merged_dataset = probe_dataset.merge(
    json_dataset,
    how='inner',  # or 'left', 'right', 'outer'
    left_on='geneSymbol',  # Adjust column names as needed
    right_on='geneSymbol',
    suffixes=('_db', '_json'),
)

print(f"Merged dataset: {len(merged_dataset)} records")
# Bare expression: displayed as the cell output when run in a notebook.
merged_dataset.head()

In [None]:
# Alternative merge: Add dose information.
# Rather than joining on a key, the single highest dose from dose_dataset is
# written to every row of merged_dataset as a reference column.

# Add max dose as a reference point.
# NOTE: this assigns a new column on merged_dataset in place.
max_dose = dose_dataset['dose'].max()
merged_dataset['max_dose'] = max_dose

print(f"Added dose information. Max dose: {max_dose}")
print(f"Final merged dataset shape: {merged_dataset.shape}")
# Bare expression: displayed as the cell output when run in a notebook.
merged_dataset.columns.tolist()

## 5. Create Rank/Dose Pairs

In [None]:
# Create rank/dose pairs from query results
def create_rank_dose_pairs(data, value_column='bmdValue', dose_column='max_dose',
                           id_columns=('probe_id', 'geneSymbol')):
    """Rank rows by a value column and pair each rank with its dose.

    Args:
        data: DataFrame holding the identifier, value and dose columns. It is
            copied, never modified.
        value_column: Column whose values are ranked.
        dose_column: Column carrying the dose paired with each rank.
        id_columns: Column name, or sequence of column names, identifying
            each row. Defaults to the probe id and gene symbol columns this
            notebook produces.

    Returns:
        A new DataFrame with the identifier columns followed by ``rank``,
        ``percentile_rank``, ``value_column`` and ``dose_column``. Rows keep
        the order of ``data``; they are not sorted by rank.

    Raises:
        KeyError: If a requested column is missing from ``data``.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(id_columns, str):
        id_columns = [id_columns]

    # Work on a copy so the caller's frame never gains the rank columns.
    rank_data = data.copy()

    # Ascending ranks: the lowest value gets rank 1 and ties share the
    # lowest rank of their group.
    rank_data['rank'] = rank_data[value_column].rank(method='min')
    # Fractional rank in (0, 1]; ties receive the average of their ranks.
    rank_data['percentile_rank'] = rank_data[value_column].rank(pct=True)

    selected_columns = [
        *id_columns, 'rank', 'percentile_rank', value_column, dose_column
    ]
    return rank_data[selected_columns].copy()

# Create the rank/dose pairs
rank_dose_pairs = create_rank_dose_pairs(merged_dataset)

print(f"Created {len(rank_dose_pairs)} rank/dose pairs")
print("\nTop 10 ranked items:")
# The pairs come back in merge order, not rank order, so sort by rank before
# taking the first ten; otherwise the heading above would be wrong.
# rank_dose_pairs itself is left unsorted.
rank_dose_pairs.sort_values('rank').head(10)

In [None]:
# Alternative: Custom query for rank/dose pairs.
# Builds rank/dose pairs directly in SQL instead of in pandas.
# NOTE: despite its name, custom_rank_query holds the query *result*.
# custom_rank is RANK() over p.id, reference_dose is the constant 1000.0 and
# synthetic_value is p.id * 1.5, i.e. placeholder values, not measurements.
# NOTE(review): if one probe joins to several genes, those rows share a
# custom_rank, so their relative order (and which of them survive the LIMIT)
# is not fixed by this query.

custom_rank_query = sink.execute_query("""
    SELECT 
        p.id as probe_id,
        rg.geneSymbol,
        RANK() OVER (ORDER BY p.id) as custom_rank,
        1000.0 as reference_dose,
        p.id * 1.5 as synthetic_value
    FROM probes p
    LEFT JOIN referenceGeneAnnotations rga ON p.id = rga.probeId
    LEFT JOIN referenceGenes rg ON rga.referenceGeneId = rg.id
    WHERE rg.geneSymbol IS NOT NULL
    ORDER BY custom_rank
    LIMIT 20
""")

print(f"Custom rank/dose query: {len(custom_rank_query)} results")
# Bare expression: displayed as the cell output when run in a notebook.
custom_rank_query.head()

## Summary

In [None]:
# Recap of everything the cells above produced.
# The counted lines are printed one at a time because each one reads a
# different notebook variable; the static lists are emitted as one
# newline-separated print per section.
print("=== SUMMARY ===")
print(f"1. DuckDB datasets created: probe_dataset ({len(probe_dataset)} rows), dose_dataset ({len(dose_dataset)} rows)")
print(f"2. TypeScript interface parsed: {ts_schema['interface_name']} with {len(ts_schema['properties'])} properties")
print(f"3. JSON data parsed: {len(json_dataset)} records")
print(f"4. Merged dataset: {len(merged_dataset)} records")
print(f"5. Rank/dose pairs: {len(rank_dose_pairs)} pairs")

# Variables left in the notebook namespace for further work
print(
    "\n=== AVAILABLE VARIABLES ===",
    "- probe_dataset: Basic probe data from DuckDB",
    "- dose_dataset: Dose information from DuckDB",
    "- json_dataset: Parsed JSON data",
    "- merged_dataset: Combined data",
    "- rank_dose_pairs: Analysis-ready rank/dose pairs",
    "- sink: DuckDB connection for custom queries",
    sep="\n",
)

# What to adapt before using the notebook on real data
print(
    "\n=== NEXT STEPS ===",
    "1. Replace sample TypeScript content with your .ts file",
    "2. Replace sample JSON with your actual JSON file",
    "3. Customize merge logic for your specific needs",
    "4. Modify rank/dose pair creation as needed",
    sep="\n",
)

In [None]:
# Cleanup: release the DuckDB connection opened in the first query cell.
# Cells that call sink.execute_query need sink.connect() again after this.
sink.disconnect()
print("Database connection closed")