# Example 3: Geospatial Disease Patterns

This notebook demonstrates SPOKE-OKN capabilities:
1. Disease prevalence by US state/county
2. Social determinants of health (SDoH) associations
3. Environmental chemical exposures in high-prevalence states
4. Geographic visualization of health patterns

In [None]:
from mcp_space_life_sciences import IntegratedKGClient
import matplotlib.pyplot as plt
import pandas as pd

# Shared knowledge-graph client; the query cells below call its methods.
client = IntegratedKGClient()

## Step 1: Query Disease Prevalence by State

In [None]:
# Get cardiovascular disease prevalence across US states
cvd_prevalence = client.get_disease_prevalence_by_location(
    disease_names=["cardiovascular disease", "heart disease"],
    location_type="state"
)

# Convert to DataFrame for easy analysis
df_prevalence = pd.DataFrame(cvd_prevalence)
print(df_prevalence.head())

# Fail fast with a readable message if the query returned nothing or the
# result lacks the columns the ranking and printout below rely on.
required_cols = {'state', 'prevalence', 'year'}
missing_cols = required_cols - set(df_prevalence.columns)
if missing_cols:
    raise ValueError(
        f"Prevalence query result is missing expected columns: {sorted(missing_cols)}"
    )

# Two disease names are queried and rows carry a year, so a state may appear
# in more than one row. Keep only each state's highest-prevalence row so the
# ranking lists distinct states instead of repeating one.
# A stable sort keeps the original order among ties, matching nlargest's
# default keep='first' behavior.
df_state_peak = (
    df_prevalence
    .sort_values('prevalence', ascending=False, kind='stable')
    .drop_duplicates(subset='state')
)

# Find states with highest prevalence
top_states = df_state_peak.nlargest(10, 'prevalence')
print("\nTop 10 states by CVD prevalence:")
print(top_states[['state', 'prevalence', 'year']])

## Step 2: Explore Social Determinants of Health

In [None]:
# Get SDoH factors in high-prevalence states.
# Drop missing labels and de-duplicate before taking five: if a state occurs in
# several rows of `top_states`, slicing the raw list would query the same state
# repeatedly and duplicate its SDoH rows.
high_prev_states = (
    top_states['state']
    .dropna()
    .drop_duplicates()
    .head(5)
    .tolist()
)

sdoh_data = []
for state in high_prev_states:
    sdoh = client.get_sdoh_by_location(
        location=state,
        domains=["Economic Stability", "Education", "Healthcare Access"]
    )
    sdoh_data.extend(sdoh)

# One DataFrame built from the accumulated records of all queried states
df_sdoh = pd.DataFrame(sdoh_data)
print("\nSDoH factors in high CVD states:")
print(df_sdoh.head(10))

## Step 3: Find SDoH-Disease Associations

In [None]:
# Find which SDoH factors are associated with cardiovascular disease
sdoh_disease_assoc = client.find_sdoh_disease_associations(
    sdoh_concepts=["poverty", "food insecurity", "education level"],
    diseases=["cardiovascular disease"],
    p_value_threshold=0.05
)

df_assoc = pd.DataFrame(sdoh_disease_assoc)
print("\nStatistically significant SDoH-CVD associations:")
if df_assoc.empty:
    # An empty result yields a DataFrame without an 'enrichment' column, so
    # sorting would raise KeyError; report the empty result instead.
    print("No associations returned at the requested p-value threshold.")
else:
    # Strongest enrichment first
    print(df_assoc.sort_values('enrichment', ascending=False))

## Step 4: Analyze Environmental Exposures

In [None]:
# Collect chemical exposure records (air medium) for each high-CVD state,
# flattened into a single list across states
exposures = [
    record
    for state_name in high_prev_states
    for record in client.get_chemical_exposures_by_location(
        location=state_name,
        media="air",  # air pollution
    )
]

# Tabulate the combined records and preview the first rows
df_exposures = pd.DataFrame(exposures)
print("\nEnvironmental chemical exposures:")
print(df_exposures.head(10))

## Step 5: Visualize Geographic Patterns

In [None]:
# Create choropleth map of disease prevalence
# NOTE(review): the return type of create_geospatial_disease_map is not visible
# here; `fig` is stored but not used again in this cell.
fig = client.create_geospatial_disease_map(
    disease_name="cardiovascular disease",
    metric="prevalence"
)

# plt.title() applies to pyplot's *current* axes, so this presumably relies on
# the client having drawn the map with matplotlib and left it as the current
# figure — TODO confirm.
plt.title("Cardiovascular Disease Prevalence by State")
plt.show()

## Step 6: Connect to Biological Data

In [None]:
# Pull the genes linked to CVD, using PrimeKG as the source
cvd_genes = client.find_disease_genes(
    source="primekg",
    disease_name="cardiovascular disease",
)

# Only the first 50 genes are passed to the GeneLab lookup
genes_to_check = cvd_genes[:50]
space_overlap = client.find_genes_in_genelab(gene_names=genes_to_check)

# Report the overlap size and show up to ten examples
print(f"\nFound {len(space_overlap)} CVD genes also studied in space biology")
print(f"Examples: {space_overlap[:10]}")

## Summary

This notebook demonstrated:
- ✅ Querying disease prevalence by geographic location (SPOKE-OKN)
- ✅ Analyzing social determinants of health
- ✅ Identifying environmental exposures
- ✅ Finding SDoH-disease associations
- ✅ Visualizing geographic health patterns
- ✅ Connecting geospatial data to biological mechanisms