In [4]:
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
# Streamlit page setup: page title plus the "wide" layout option.
st.set_page_config(
    layout="wide",
    page_title="California Asthma & Air Quality Analysis",
)

# Dashboard heading, followed by a one-line description of the data's scope.
st.title("California Asthma Emergency Department Visits & Air Quality Analysis")
st.markdown(
    "*Analysis of county-level asthma ED visits and air quality data (2013-2022)*"
)

# Cached loader for the merged asthma / air-quality dataset. st.cache_data
# memoises the result per argument value, so the CSV is not re-read each time
# Streamlit re-executes the script.
@st.cache_data
def load_data(path='processed_data/merged_data_2013-2022.csv'):
    """Read the merged dataset from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the project's merged
            2013-2022 file, so existing ``load_data()`` calls are unaffected.

    Returns:
        pandas.DataFrame holding the file's contents as parsed by
        ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by pandas).
    """
    return pd.read_csv(path)


In [6]:
# Load the merged dataset into the module-level frame `df`, which the rest of
# the dashboard (metrics, sidebar filters, charts) reads from.
df = load_data()

# County coefficients taken from the regression output. Each value is expressed
# relative to the baseline county (Alameda), whose own entry is therefore 0.
# Insertion order is kept identical to the original listing.
county_effects = {
    'Alameda': 0,  # baseline
    'Amador': 5.4041,
    'Butte': -11.2520,
    'Calaveras': -0.9634,
    'Colusa': -10.1819,
    'Contra Costa': 3.4170,
    'Del Norte': 6.9385,
    'El Dorado': -14.9365,
    'Fresno': 7.7885,
    'Glenn': -10.6532,
    'Humboldt': 8.2659,
    'Imperial': 4.0930,
    'Inyo': 1.7316,
    'Kern': -8.7124,
    'Kings': 5.8205,
    'Lake': 25.5887,
    'Los Angeles': -10.0336,
    'Madera': -0.3764,
    'Marin': -25.4448,
    'Mariposa': -12.1979,
    'Mendocino': 7.5652,
    'Merced': 15.0526,
    'Mono': -7.4101,
    'Monterey': -8.2250,
    'Napa': -13.6074,
    'Nevada': -16.3990,
    'Orange': -23.1699,
    'Placer': -21.0056,
    'Plumas': 0.2246,
    'Riverside': -18.2028,
    'Sacramento': 6.8682,
    'San Benito': 1.3676,
    'San Bernardino': -7.3899,
    'San Diego': -22.3589,
    'San Francisco': -13.2904,
    'San Joaquin': 6.1894,
    'San Luis Obispo': -19.9213,
    'San Mateo': -19.0257,
    'Santa Barbara': -18.7643,
    'Santa Clara': -22.7562,
    'Santa Cruz': -16.6497,
    'Shasta': -4.7428,
    'Siskiyou': -9.1020,
    'Solano': 21.5477,
    'Sonoma': -12.9389,
    'Stanislaus': -0.1550,
    'Sutter': -20.0224,
    'Tehama': -1.2893,
    'Trinity': 0.0039,
    'Tulare': -15.2915,
    'Tuolumne': -3.8711,
    'Ventura': -20.2164,
    'Yolo': -11.5509,
}

# Attach each row's county coefficient as the `county_risk_score` column.
# Counties that are not keys of county_effects end up with NaN.
df = df.assign(county_risk_score=df['county'].map(county_effects))

# Headline numbers, laid out in three equal-width columns.
col1, col2, col3 = st.columns(3)

# County and year counts are taken from the loaded data; the R-squared value
# is a hard-coded string.
col1.metric("Counties Analyzed", len(df['county'].unique()))
col2.metric("Years of Data", len(df['year'].unique()))
col3.metric("Model R-squared", "0.876")

st.markdown("---")

# Key Findings: two side-by-side callouts summarising the analysis.
st.header("Key Findings")
col1, col2 = st.columns(2)

# Left column: info callout about geography and air quality.
col1.info("""
    **Geographic Location is a Significant Factor**
    - County location plays a major role in explaining asthma rate variation, alongside air quality and year.
    - Some counties show significantly higher asthma rates than others, such as Lake County, which has much higher rates than the baseline (Alameda).
    - Air quality has a statistically significant, though modest, effect on asthma rates, with higher AQI values associated with slightly higher asthma rates.
     """)

# Right column: warning callout about the pandemic-era drop in visits.
col2.warning("""
    **COVID-19 Impact (2020-2021)**
    - There was a significant reduction in asthma-related ED visits during the pandemic, especially in 2020 and 2021.
    - This decline is likely due to reduced healthcare seeking behavior and other pandemic-related factors.
    - The pattern of reduced visits is consistent across many counties, though the extent of the reduction may vary.
    """)

st.markdown("---")

# Interactive Visualizations
st.header("Interactive Analysis")

# Sidebar filters. The sorted year list serves as both the option list and the
# default, so every year starts out selected.
st.sidebar.header("🔧 Filters")
year_options = sorted(df['year'].unique())
selected_years = st.sidebar.multiselect(
    "Select Years",
    options=year_options,
    default=year_options,
)

# An empty county selection means "no county restriction".
county_options = sorted(df['county'].unique())
selected_counties = st.sidebar.multiselect(
    "Select Counties (leave empty for all)",
    options=county_options,
    default=[],
)

# Apply the filters: always restrict by year, and by county only when at least
# one county was picked.
year_mask = df['year'].isin(selected_years)
if selected_counties:
    filtered_df = df[year_mask & df['county'].isin(selected_counties)]
else:
    filtered_df = df[year_mask]

# One tab per analysis view.
tab_titles = ["County Risk Map", "Time Trends", "Air Quality Relationship", "Regression Results"]
tab1, tab2, tab3, tab4 = st.tabs(tab_titles)

with tab1:
    st.subheader("County Risk Scores (From Regression Model)")

    # One row per county: its regression coefficient relative to the baseline.
    county_risk_df = pd.DataFrame(list(county_effects.items()), columns=['County', 'Risk_Score'])
    # Bucket the scores for colouring. pd.cut intervals are right-closed by
    # default, so the bands are (-30, -10], (-10, 0], (0, 10] and (10, 35];
    # a score outside (-30, 35] would receive no category.
    county_risk_df['Risk_Category'] = pd.cut(county_risk_df['Risk_Score'],
                                           bins=[-30, -10, 0, 10, 35],
                                           labels=['Low Risk', 'Below Average', 'Average', 'High Risk'])

    # Bar chart of score by county, ordered by score and coloured by band.
    fig_map = px.bar(county_risk_df.sort_values('Risk_Score'),
                     x='Risk_Score', y='County',
                     color='Risk_Category',
                     title='County Risk Scores Relative to Baseline (Alameda County)',
                     labels={'Risk_Score': 'Additional ED Visits per 10,000 People'},
                     color_discrete_map={'Low Risk': 'green', 'Below Average': 'lightgreen',
                                       'Average': 'yellow', 'High Risk': 'red'})
    fig_map.update_layout(height=800)
    st.plotly_chart(fig_map, use_container_width=True)

    # Highlight extreme counties. The lists are derived from county_effects
    # instead of being typed in by hand: the previous hard-coded names and
    # values did not match the coefficients plotted above.
    n_extremes = 3
    lowest = county_risk_df.nsmallest(n_extremes, 'Risk_Score')
    highest = county_risk_df.nlargest(n_extremes, 'Risk_Score')
    # "+.1f" keeps an explicit sign so positive and negative effects read alike.
    lowest_lines = "\n".join(
        f"- {county}: {score:+.1f}"
        for county, score in zip(lowest['County'], lowest['Risk_Score'])
    )
    highest_lines = "\n".join(
        f"- {county}: {score:+.1f}"
        for county, score in zip(highest['County'], highest['Risk_Score'])
    )

    col1, col2 = st.columns(2)
    with col1:
        st.success(f"**Lowest Risk Counties:**\n\n{lowest_lines}")
    with col2:
        st.error(f"**Highest Risk Counties:**\n\n{highest_lines}")

with tab2:
    st.subheader("Asthma Rates Over Time")

    # Mean asthma rate per year over the rows that pass the sidebar filters.
    yearly_avg = filtered_df.groupby('year')['asthma_rate'].mean().reset_index()
    fig_time = px.line(yearly_avg, x='year', y='asthma_rate',
                       title='Average Asthma ED Visit Rate by Year',
                       labels={'asthma_rate': 'ED Visits per 10,000 People', 'year': 'Year'})

    # Annotate the 2020 point only when it actually exists. Previously, when
    # 2020 was filtered out, the label was still drawn at (2020, 0), pointing
    # at nothing and stretching the axes.
    rate_2020 = yearly_avg.loc[yearly_avg['year'] == 2020, 'asthma_rate'].dropna()
    if not rate_2020.empty:
        fig_time.add_annotation(x=2020, y=rate_2020.iloc[0],
                                text="COVID-19 Impact", showarrow=True, arrowhead=2)
    st.plotly_chart(fig_time, use_container_width=True)

    # County comparison over time; shown only when specific counties are selected.
    if selected_counties:
        county_time = filtered_df.groupby(['year', 'county'])['asthma_rate'].mean().reset_index()
        fig_county_time = px.line(county_time, x='year', y='asthma_rate',
                                 color='county', title='Selected Counties: Asthma Rates Over Time',
                                 # Same axis labels as the yearly-average chart above.
                                 labels={'asthma_rate': 'ED Visits per 10,000 People', 'year': 'Year'})
        st.plotly_chart(fig_county_time, use_container_width=True)

with tab3:
    st.subheader("Air Quality vs Asthma Rates")

    # Column of filtered_df to plot on the x-axis.
    aqi_metric = st.selectbox("Select Air Quality Metric",
                             ['median_aqi', 'max_aqi', 'good_days', 'moderate_days', 'unhealthy_days'])

    # Scatter plot coloured by county. NOTE: Plotly Express's "ols" trendline
    # relies on the statsmodels package being installed.
    fig_scatter = px.scatter(filtered_df, x=aqi_metric, y='asthma_rate',
                            color='county',
                            title=f'Asthma Rate vs {aqi_metric.replace("_", " ").title()}',
                            trendline="ols")
    st.plotly_chart(fig_scatter, use_container_width=True)

    # Correlation analysis (Series.corr defaults to Pearson). The result is NaN
    # when it is undefined, e.g. fewer than two rows or a constant column
    # in the current selection; show "N/A" instead of the literal "nan".
    correlation = filtered_df[aqi_metric].corr(filtered_df['asthma_rate'])
    correlation_text = "N/A" if pd.isna(correlation) else f"{correlation:.3f}"
    st.metric(f"Correlation with {aqi_metric}", correlation_text)

with tab4:
    st.subheader("Regression Model Results")

    # Hard-coded regression summary figures as (label, value) pairs, one list
    # per display column.
    fit_stats = [
        ("R-squared", "0.876"),
        ("Adjusted R-squared", "0.859"),
        ("F-statistic", "52.90"),
        ("Observations", "529"),
    ]
    effect_stats = [
        ("Air Quality Coefficient", "0.2017"),
        ("Air Quality P-value", "0.001"),
        ("Model Significance", "p < 0.001"),
        ("Year Effects", "Significant"),
    ]

    col1, col2 = st.columns(2)
    for label, value in fit_stats:
        col1.metric(label, value)
    for label, value in effect_stats:
        col2.metric(label, value)

    # Plain-language reading of the figures above.
    st.markdown("""
    ### Model Interpretation:
    - **R-squared = 0.876**: Model explains 87.6% of variation in asthma rates.
    - **County effects**: Geographic location is a significant predictor of asthma rates.
    - **Year effects**: Significant time trends, with especially large reductions in 2020 and 2021 due to the COVID-19 pandemic.
    - **Air quality**: There is a **modest but statistically significant positive relationship** between air quality (median AQI) and asthma rates.
    - **Coefficient interpretation**: Each unit increase in **median AQI** is associated with **0.2017 additional ED visits per 10,000 people**.
    """)

st.markdown("---")

# Data Summary Section: preview the first ten rows of the filtered data.
st.header("Data Summary")
st.write("Sample of the data used in this analysis:")
preview_rows = filtered_df.head(10)
st.dataframe(preview_rows)

# Download option: the full filtered frame (not just the preview) serialised
# as CSV text without the index column.
csv = filtered_df.to_csv(index=False)
download_options = {
    "label": "Download filtered data as CSV",
    "data": csv,
    "file_name": "asthma_air_quality_data.csv",
    "mime": "text/csv",
}
st.download_button(**download_options)

# Footer
st.markdown("---")
st.markdown("*Dashboard created for California Asthma & Air Quality Analysis Project*")



DeltaGenerator()