# Number 1 Analysis: Exactly how the AQI improved / decreased? Did it improve in specific months, weeks, or days of the week, or hours of a day?

## SKIP THIS SECTION IF: you already have  file: "fresno_pm25_hourly_with_aqi_and_category.csv"

🧩 코드: 30일 단위 요청 + 분할 저장 버전

In [12]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta

# Constants
email = "munkh-erdene.khuderbaatar@sjsu.edu"
api_key = "bluebird66"
param_code = "88101"       # PM2.5
state_code = "06"          # California
county_code = "019"        # Fresno
dataset = "sampleData"     # <-- Important
start_year = 1999
end_year = 2025

# Loop over each month × year
for year in range(start_year, end_year):
    for month in range(1, 13):
        # Split by 30 days for better management of data
        start_date = datetime(year, month, 1)
        end_date = (start_date + pd.offsets.MonthEnd(0)).date()
        bdate = start_date.strftime("%Y%m%d")
        edate = end_date.strftime("%Y%m%d")

        print(f"📡 Fetching {bdate} to {edate}...")

        # API URL
        url = (
            f"https://aqs.epa.gov/data/api/{dataset}/byCounty"
            f"?email={email}&key={api_key}&param={param_code}"
            f"&bdate={bdate}&edate={edate}"
            f"&state={state_code}&county={county_code}"
        )

        try:
            r = requests.get(url, )#timeout=120)
            data = r.json()

            if isinstance(data, dict) and data.get('Header', [{}])[0].get('status') == 'Success':
                df = pd.json_normalize(data.get('Data', []))
                if not df.empty:
                    # Save per month
                    fname = f"fresno_pm25_{year}_{month:02}.csv"
                    df.to_csv(fname, index=False)
                    print(f"✅ Saved: {fname}")
                else:
                    print(f"⚠️ No data for {year}-{month:02}")
            else:
                print(f"⚠️ Failed response for {year}-{month:02}")

        except Exception as e:
            print(f"❌ Error for {year}-{month:02}: {e}")

        time.sleep(5)  # avoid rate limit


📡 Fetching 19990101 to 19990131...
✅ Saved: fresno_pm25_1999_01.csv
📡 Fetching 19990201 to 19990228...
✅ Saved: fresno_pm25_1999_02.csv
📡 Fetching 19990301 to 19990331...
✅ Saved: fresno_pm25_1999_03.csv
📡 Fetching 19990401 to 19990430...
✅ Saved: fresno_pm25_1999_04.csv
📡 Fetching 19990501 to 19990531...
✅ Saved: fresno_pm25_1999_05.csv
📡 Fetching 19990601 to 19990630...
✅ Saved: fresno_pm25_1999_06.csv
📡 Fetching 19990701 to 19990731...
✅ Saved: fresno_pm25_1999_07.csv
📡 Fetching 19990801 to 19990831...
✅ Saved: fresno_pm25_1999_08.csv
📡 Fetching 19990901 to 19990930...
✅ Saved: fresno_pm25_1999_09.csv
📡 Fetching 19991001 to 19991031...
✅ Saved: fresno_pm25_1999_10.csv
📡 Fetching 19991101 to 19991130...
✅ Saved: fresno_pm25_1999_11.csv
📡 Fetching 19991201 to 19991231...
✅ Saved: fresno_pm25_1999_12.csv
📡 Fetching 20000101 to 20000131...
✅ Saved: fresno_pm25_2000_01.csv
📡 Fetching 20000201 to 20000229...
✅ Saved: fresno_pm25_2000_02.csv
📡 Fetching 20000301 to 20000331...
✅ Saved: fres

🧩 병합 코드: 월별 PM2.5 CSV → 하나로 통합

In [13]:
import pandas as pd
import glob

# 경로에 따라 수정 가능
file_list = glob.glob("fresno_pm25_*.csv")  # 모든 월별 파일 검색
print(f"📂 Found {len(file_list)} files.")

# 모든 파일 읽어서 하나의 DataFrame으로 연결
df_list = [pd.read_csv(fname) for fname in file_list]
combined_df = pd.concat(df_list, ignore_index=True)

# 문자열 → 날짜형 변환
combined_df["date_local"] = pd.to_datetime(combined_df["date_local"])
combined_df["time_local"] = pd.to_datetime(combined_df["time_local"], format='%H:%M', errors='coerce').dt.time

# 저장 (선택)
combined_df.to_csv("fresno_pm25_hourly_merged.csv", index=False)
print("✅ Saved merged file as fresno_pm25_hourly_merged.csv")

# 미리 보기
combined_df.head()


📂 Found 312 files.
✅ Saved merged file as fresno_pm25_hourly_merged.csv


Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,date_local,time_local,date_gmt,time_gmt,sample_measurement,units_of_measure,units_of_measure_code,sample_duration,sample_duration_code,sample_frequency,detection_limit,uncertainty,qualifier,method_type,method,method_code,state,county,date_of_last_change,cbsa_code
0,6,19,8,88101,1,36.781333,-119.77319,WGS84,PM2.5 - Local Conditions,2007-11-01,00:00:00,2007-11-01,08:00,41.0,Micrograms/cubic meter (LC),105,24 HOUR,7,EVERY DAY,2.0,,,FRM,R & P Model 2025 PM2.5 Sequential w/WINS - GRAVIMETRIC,118,California,Fresno,,23420
1,6,19,8,88101,1,36.781333,-119.77319,WGS84,PM2.5 - Local Conditions,2007-11-02,00:00:00,2007-11-02,08:00,40.8,Micrograms/cubic meter (LC),105,24 HOUR,7,EVERY DAY,2.0,,,FRM,R & P Model 2025 PM2.5 Sequential w/WINS - GRAVIMETRIC,118,California,Fresno,,23420
2,6,19,8,88101,1,36.781333,-119.77319,WGS84,PM2.5 - Local Conditions,2007-11-03,00:00:00,2007-11-03,08:00,47.0,Micrograms/cubic meter (LC),105,24 HOUR,7,EVERY DAY,2.0,,,FRM,R & P Model 2025 PM2.5 Sequential w/WINS - GRAVIMETRIC,118,California,Fresno,,23420
3,6,19,8,88101,1,36.781333,-119.77319,WGS84,PM2.5 - Local Conditions,2007-11-04,00:00:00,2007-11-04,08:00,40.0,Micrograms/cubic meter (LC),105,24 HOUR,7,EVERY DAY,2.0,,,FRM,R & P Model 2025 PM2.5 Sequential w/WINS - GRAVIMETRIC,118,California,Fresno,,23420
4,6,19,8,88101,1,36.781333,-119.77319,WGS84,PM2.5 - Local Conditions,2007-11-05,00:00:00,2007-11-05,08:00,31.3,Micrograms/cubic meter (LC),105,24 HOUR,7,EVERY DAY,2.0,,,FRM,R & P Model 2025 PM2.5 Sequential w/WINS - GRAVIMETRIC,118,California,Fresno,,23420


✅ Step 1: PM2.5 → AQI 변환 함수 정의

In [14]:
def pm25_to_aqi(pm25):
    """
    Converts PM2.5 concentration (µg/m³) to AQI based on EPA breakpoints.
    Source: https://www.airnow.gov/aqi/aqi-calculation/
    """
    breakpoints = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500)
    ]

    for c_low, c_high, i_low, i_high in breakpoints:
        if c_low <= pm25 <= c_high:
            aqi = ((i_high - i_low) / (c_high - c_low)) * (pm25 - c_low) + i_low
            return round(aqi)
    return None  # out of range (e.g., negative value)


✅ Step 2: 병합된 DataFrame에 AQI 값 추가

In [15]:
# 병합된 데이터 로딩 (필요 시)
df = pd.read_csv("fresno_pm25_hourly_merged.csv")

# sample_measurement → AQI
df['aqi'] = df['sample_measurement'].apply(pm25_to_aqi)

# 저장
df.to_csv("fresno_pm25_hourly_with_aqi.csv", index=False)
print("✅ Added 'aqi' column and saved to fresno_pm25_hourly_with_aqi.csv")

# 미리 보기
df[['date_local', 'time_local', 'sample_measurement', 'aqi']].head()


✅ Added 'aqi' column and saved to fresno_pm25_hourly_with_aqi.csv


Unnamed: 0,date_local,time_local,sample_measurement,aqi
0,2007-11-01,00:00:00,41.0,115.0
1,2007-11-02,00:00:00,40.8,114.0
2,2007-11-03,00:00:00,47.0,129.0
3,2007-11-04,00:00:00,40.0,112.0
4,2007-11-05,00:00:00,31.3,91.0


🧠 이게 왜 유용하냐면?
용도	설명
✅ 히트맵 시각화	AQI 색상 구분 가능
✅ 일반인 이해도 향상	숫자 대신 범주로 보여주면 훨씬 쉽게 전달 가능
✅ 주중 vs 주말, 시간대별 범주 비율 분석	"언제 위험했는지" 설명하기 좋음

In [16]:
def categorize_aqi(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    elif aqi <= 500:
        return "Hazardous"
    else:
        return "Unknown"


In [None]:
df['aqi_category'] = df['aqi'].apply(categorize_aqi)

# 필요한 컬럼만 유지
columns_to_keep = [
    'date_local', 'time_local',
    'latitude', 'longitude',
    'sample_measurement',
    'aqi', 'aqi_category','site_number'
]
df = df[columns_to_keep]

# 저장
df.to_csv("fresno_pm25_hourly_with_aqi_and_category.csv", index=False)
print("✅ Saved with AQI categories as fresno_pm25_hourly_with_aqi_and_category.csv")


✅ Saved with AQI categories as fresno_pm25_hourly_with_aqi_and_category.csv


## #1 Visualization
✅ 1. Streamlit 슬라이더 버전 (streamlit_app.py)

In [19]:
pip install streamlit streamlit-folium pandas folium

Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting streamlit-folium
  Downloading streamlit_folium-0.25.0-py3-none-any.whl.metadata (621 bytes)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit_folium-0.25.0-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.4/328.4 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [20]:
# streamlit_app.py

import streamlit as st
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from streamlit_folium import folium_static
from datetime import datetime

# AQI 색상 매핑 (카테고리 기준)
category_colors = {
    "Good": "#00E400", "Moderate": "#FFFF00",
    "Unhealthy for Sensitive Groups": "#FF7E00",
    "Unhealthy": "#FF0000", "Very Unhealthy": "#8F3F97",
    "Hazardous": "#7E0023", "Unknown": "gray"
}

# --- 데이터 로드 ---
@st.cache_data
def load_data():
    df = pd.read_csv("fresno_pm25_hourly_with_aqi_and_category.csv")
    df["datetime"] = pd.to_datetime(df["date_local"] + " " + df["time_local"], errors="coerce")
    return df.dropna(subset=["datetime", "latitude", "longitude"])

df = load_data()

# --- UI ---
st.set_page_config(layout="wide")
st.title("📍 Fresno PM2.5 AQI - Time Slider Map")
st.caption("Explore AQI by hour from 1999 to 2025")

# 날짜 슬라이더
min_date = df["datetime"].dt.date.min()
max_date = df["datetime"].dt.date.max()
selected_date = st.slider("📅 Date", min_value=min_date, max_value=max_date, value=datetime(2017, 6, 15).date())

# 시간 슬라이더
selected_hour = st.slider("🕒 Hour", 0, 23, 14)

# 필터링
filtered = df[
    (df["datetime"].dt.date == selected_date) &
    (df["datetime"].dt.hour == selected_hour)
]

# 지도 시각화
st.markdown(f"🧭 {len(filtered)} monitors found for {selected_date} @ {selected_hour}:00")
m = folium.Map(location=[filtered["latitude"].mean(), filtered["longitude"].mean()],
               zoom_start=10, tiles="CartoDB dark_matter")
cluster = MarkerCluster().add_to(m)

for _, row in filtered.iterrows():
    color = category_colors.get(row["aqi_category"], "gray")
    popup = f"Site: {row['site_number']}<br>AQI: {row['aqi']}<br>PM2.5: {row['sample_measurement']} µg/m³"
    folium.CircleMarker(
        location=(row["latitude"], row["longitude"]),
        radius=7,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.9,
        popup=popup
    ).add_to(cluster)

folium_static(m)

2025-05-07 04:31:08.789 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
folium_static is deprecated and will be removed in a future release, or
simply replaced with with st_folium which always passes
returned_objects=[] to the component.
Please try using st_folium instead, and
post an issue at https://github.com/randyzwitch/streamlit-folium/issues
if you experience issues with st_folium.

  folium_static(m)


DeltaGenerator()

In [23]:
# streamlit run streamlit_app.py

## ✅ 2. Dash 버전 (dash_app.py)

In [None]:
pip install dash pandas plotly

In [None]:
# dash_app.py

import dash
from dash import dcc, html, Input, Output
import pandas as pd
import plotly.express as px

# Load data
df = pd.read_csv("fresno_pm25_hourly_with_aqi_and_category.csv")
df["datetime"] = pd.to_datetime(df["date_local"] + " " + df["time_local"], errors="coerce")

# App setup
app = dash.Dash(__name__)
app.title = "Fresno AQI Monitor Dashboard"

app.layout = html.Div([
    html.H2("📍 Fresno PM2.5 AQI - Interactive Dashboard"),

    html.Div([
        html.Label("📅 Select Date"),
        dcc.DatePickerSingle(
            id="date-picker",
            min_date_allowed=df["datetime"].min().date(),
            max_date_allowed=df["datetime"].max().date(),
            date=px.utils.to_date_string("2017-06-15")
        ),
        html.Label("🕒 Select Hour"),
        dcc.Slider(id="hour-slider", min=0, max=23, step=1, value=14,
                   marks={i: f"{i}:00" for i in range(0, 24, 2)})
    ], style={"width": "48%", "display": "inline-block"}),

    dcc.Graph(id="map-graph")
])

@app.callback(
    Output("map-graph", "figure"),
    Input("date-picker", "date"),
    Input("hour-slider", "value")
)
def update_map(date_str, hour):
    date = pd.to_datetime(date_str).date()
    filtered = df[(df["datetime"].dt.date == date) & (df["datetime"].dt.hour == hour)]

    fig = px.scatter_mapbox(
        filtered,
        lat="latitude",
        lon="longitude",
        color="aqi_category",
        color_discrete_map={
            "Good": "#00E400", "Moderate": "#FFFF00", "Unhealthy for Sensitive Groups": "#FF7E00",
            "Unhealthy": "#FF0000", "Very Unhealthy": "#8F3F97", "Hazardous": "#7E0023"
        },
        hover_data=["site_number", "aqi", "sample_measurement"],
        zoom=9,
        height=600
    )
    fig.update_layout(mapbox_style="carto-darkmatter")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    return fig

if __name__ == "__main__":
    app.run_server(debug=True)


In [None]:
# python dash_app.py

# Number 4 Analysis: What counties had the most effective AQI improvement over time? Show visualization (AQI linear plot) over time for each monitor

Code to get 1999-2024 PM 2.5 Data for the entire SJV

In [4]:
import requests
import pandas as pd
import time

# EPA API info
email = "munkh-erdene.khuderbaatar@sjsu.edu"
api_key = "bluebird66"
ca_code = "06"
pm25 = "88101"

# Counties in SJV
county_codes = {
    "Fresno": "019",
    "Kern": "029",
    "Kings": "031",
    "Madera": "039",
    "Merced": "047",
    "San Joaquin": "077",
    "Stanislaus": "099",
    "Tulare": "107"
}

# Set Year
start_year = 1999
end_year = 2025

# initialize empty dataframe
all_pm25_df = pd.DataFrame()

# looping: County X Year
for county_name, county_code in county_codes.items():
    for year in range(start_year, end_year):
        print(f"📡 Fetching {county_name} - {year}...")

        url = (
            f"https://aqs.epa.gov/data/api/dailyData/byCounty?"
            f"email={email}&key={api_key}&param={pm25}&"
            f"bdate={year}0101&edate={year}1231&state={ca_code}&county={county_code}"
        )

        try:
            response = requests.get(url)
            data = response.json()
            if data.get('Header', [])[0]['status'] == "Success":
                current_df = pd.json_normalize(data.get('Data', []))
                current_df["county_name"] = county_name
                all_pm25_df = pd.concat([all_pm25_df, current_df], ignore_index=True)
                print(f"✅ Added data for {county_name} {year}")
            else:
                print(f"⚠️ No data for {county_name} {year}")

        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed for {county_name} {year}: {e}")

        # sleep to prevent too fast requests and server fetching errors
        time.sleep(6)

final_df = all_pm25_df

# Save
final_df.to_csv('pm25_sjv_combined.csv', index=False)
print("🎉 All data saved to pm25_sjv_combined.csv")

📡 Fetching Fresno - 1999...
✅ Added data for Fresno 1999
📡 Fetching Fresno - 2000...
✅ Added data for Fresno 2000
📡 Fetching Fresno - 2001...
✅ Added data for Fresno 2001
📡 Fetching Fresno - 2002...
✅ Added data for Fresno 2002
📡 Fetching Fresno - 2003...
✅ Added data for Fresno 2003
📡 Fetching Fresno - 2004...
✅ Added data for Fresno 2004
📡 Fetching Fresno - 2005...
✅ Added data for Fresno 2005
📡 Fetching Fresno - 2006...
✅ Added data for Fresno 2006
📡 Fetching Fresno - 2007...
✅ Added data for Fresno 2007
📡 Fetching Fresno - 2008...
✅ Added data for Fresno 2008
📡 Fetching Fresno - 2009...
✅ Added data for Fresno 2009
📡 Fetching Fresno - 2010...
✅ Added data for Fresno 2010
📡 Fetching Fresno - 2011...
✅ Added data for Fresno 2011
📡 Fetching Fresno - 2012...
✅ Added data for Fresno 2012
📡 Fetching Fresno - 2013...
✅ Added data for Fresno 2013
📡 Fetching Fresno - 2014...
✅ Added data for Fresno 2014
📡 Fetching Fresno - 2015...
✅ Added data for Fresno 2015
📡 Fetching Fresno - 2016...
✅ A

Note:

Madera county data missing: from 1999 to 2009

In [None]:

# columns to keep
# cols_keep = ['county_name', 'date_local', 'aqi', 'site_number', 'latitude', 'longitude']

In [5]:
# 📂 Clean and Smooth PM2.5 AQI Data for SJV Counties

import pandas as pd
import numpy as np

# --------------------------------------------
# 🧩 Function: Clean & smooth AQI per county
# --------------------------------------------
def clean_and_smooth_aqi(df: pd.DataFrame, county_name: str) -> pd.DataFrame:
    """
    Cleans and smooths AQI time series for a given county by:
    1. Aggregating monitor-level daily AQI
    2. Filling missing daily values by interpolation
    3. Smoothing outliers using IQR-based replacement with rolling mean

    Parameters:
        df (pd.DataFrame): Combined AQI dataset with all counties
        county_name (str): The county to extract and process

    Returns:
        pd.DataFrame: Cleaned and smoothed AQI with 'date_local' as index
    """
    # Filter for county and compute daily average AQI across monitors
    df = df[df['county_name'] == county_name]
    df = df.groupby(['date_local', 'site_number'])['aqi'].mean().reset_index()
    df = df.groupby('date_local')['aqi'].mean().reset_index()
    df['date_local'] = pd.to_datetime(df['date_local'])
    df = df.set_index('date_local')

    # Reindex to full daily date range and interpolate missing values
    full_index = pd.date_range(start='1999-01-01', end='2024-12-31', freq='D')
    df = df.reindex(full_index)
    df['aqi'] = df['aqi'].interpolate(method='time')
    df['aqi'] = df['aqi'].ffill().bfill()

    # Smooth outliers using IQR method + rolling mean
    q1 = df['aqi'].quantile(0.25)
    q3 = df['aqi'].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    rolling_mean = df['aqi'].rolling(window=7, center=True, min_periods=1).mean()
    df['aqi_smoothed'] = np.where(
        (df['aqi'] < lower) | (df['aqi'] > upper),
        rolling_mean,
        df['aqi']
    )

    df['county_name'] = county_name
    return df[['aqi_smoothed', 'county_name']]


# --------------------------------------------
# 🔁 Run cleaning on all counties
# --------------------------------------------
# Load combined raw PM2.5 data
all_df = pd.read_csv("pm25_sjv_combined.csv")
all_df['date_local'] = pd.to_datetime(all_df['date_local'])

# Process each county and concatenate results
cleaned_list = []
for county in all_df['county_name'].unique():
    cleaned_df = clean_and_smooth_aqi(all_df, county)
    cleaned_df.index.name = 'date_local'
    cleaned_list.append(cleaned_df)

final_df = pd.concat(cleaned_list).reset_index()

# Save cleaned and smoothed dataset
final_df.to_csv("pm25_sjv_combined_cleaned.csv", index=False)
print("✅ Saved cleaned & smoothed AQI to pm25_sjv_cleaned.csv")

  all_df = pd.read_csv("pm25_sjv_combined.csv")


✅ Saved cleaned & smoothed AQI to pm25_sjv_cleaned.csv
