In [2]:
import os
import pandas as pd
from IPython.display import display

# Directory scanned for raw AirNow ``.csv`` files.
RAW_DIR = "data/raw/airnow"
# Directory that receives processed outputs; created on demand before writing.
PROCESSED_DIR = "data/processed"
# Destination CSV for the combined daily AirNow table.
OUT_PATH = os.path.join(PROCESSED_DIR, "airnow_daily.csv")

def build_airnow_daily_from_local(raw_dir=None, out_path=None):
    """Combine local AirNow CSV files into one row per observation date.

    Every ``.csv`` file directly inside ``raw_dir`` is read and concatenated.
    ``AQI`` is averaged per ``DateObserved`` and ``ParameterName`` and pivoted
    so that each parameter becomes its own column. The first ``ReportingArea``
    and ``StateCode`` seen for each date are attached as ``reporting_area`` and
    ``state_code``. The result is written to ``out_path`` as CSV, previewed
    with ``display`` and returned.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to scan. Defaults to the module-level ``RAW_DIR``.
    out_path : str, optional
        CSV file to write. Defaults to the module-level ``OUT_PATH``. Its
        parent directory is created if needed.

    Returns
    -------
    pandas.DataFrame or None
        The daily table, or ``None`` (after printing a message) when the
        directory is missing, holds no CSV files, none of them could be read,
        or no row has a parseable ``DateObserved``.

    Raises
    ------
    KeyError
        If the combined data lacks any of the columns this function needs.
    """
    # Resolve defaults at call time so the function follows the notebook's
    # current configuration rather than values captured at definition time.
    raw_dir = RAW_DIR if raw_dir is None else raw_dir
    out_path = OUT_PATH if out_path is None else out_path

    # A missing directory is reported like the other "nothing to do" cases
    # instead of surfacing as FileNotFoundError from os.listdir.
    if not os.path.isdir(raw_dir):
        print(f"Raw data directory not found: {raw_dir}")
        return None

    csv_paths = sorted(
        os.path.join(raw_dir, name)
        for name in os.listdir(raw_dir)
        if name.endswith(".csv")
    )
    if not csv_paths:
        print(f"No CSV files found in {raw_dir}")
        return None

    frames = []
    for path in csv_paths:
        try:
            frames.append(pd.read_csv(path))
        except Exception as e:
            # Deliberately best-effort: one unreadable file is reported and
            # skipped so the remaining files can still be processed.
            print(f"Failed to read {path}: {e}")

    if not frames:
        print("No valid CSV files were read.")
        return None

    raw = pd.concat(frames, ignore_index=True)

    # Fail early with the full list of absent columns rather than an opaque
    # KeyError from whichever pandas call happens to touch one first.
    required = ["DateObserved", "ParameterName", "AQI", "ReportingArea", "StateCode"]
    missing = [col for col in required if col not in raw.columns]
    if missing:
        raise KeyError(f"AirNow CSV data is missing required columns: {missing}")

    # Unparseable dates become NaT and are dropped; unparseable AQI values
    # become NaN so they are ignored by the mean instead of breaking it.
    raw["DateObserved"] = pd.to_datetime(raw["DateObserved"], errors="coerce").dt.date
    raw["AQI"] = pd.to_numeric(raw["AQI"], errors="coerce")
    raw = raw.dropna(subset=["DateObserved"])

    if raw.empty:
        print("No rows with a valid DateObserved were found.")
        return None

    pivot = (
        raw.pivot_table(
            index="DateObserved",
            columns="ParameterName",
            values="AQI",
            aggfunc="mean"
        )
        .reset_index()
    )
    pivot.columns.name = None
    pivot = pivot.rename(columns={"DateObserved": "date"})

    # NOTE(review): the pivot is keyed on date only, so if the input mixes
    # several reporting areas their AQI values are averaged together while
    # the metadata below keeps just the first area/state per date.
    meta = raw.groupby("DateObserved").agg(
        reporting_area=("ReportingArea", "first"),
        state_code=("StateCode", "first")
    ).reset_index()
    meta = meta.rename(columns={"DateObserved": "date"})

    # Left merge keeps dates whose AQI values were all missing.
    daily = meta.merge(pivot, on="date", how="left")

    # Create the directory of the file actually being written.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    daily.to_csv(out_path, index=False)
    print(f"Saved: {out_path}, shape={daily.shape}")
    display(daily.head())

    return daily

# Build the daily table from the local raw files; the result is None when no usable CSV data was found.
daily_airnow = build_airnow_daily_from_local()


Saved: data/processed/airnow_daily.csv, shape=(14, 5)


Unnamed: 0,date,reporting_area,state_code,OZONE,PM2.5
0,2023-06-01,Northeast Urban,NJ,80.0,64.0
1,2023-06-02,Northeast Urban,NJ,126.0,71.0
2,2023-06-03,Northeast Urban,NJ,26.0,59.0
3,2023-06-04,Northeast Urban,NJ,31.0,22.0
4,2023-06-05,Northeast Urban,NJ,36.0,55.0


In [3]:
import os
import pandas as pd
from IPython.display import display

# Directory holding processed outputs (input and output of this cell both live here).
PROCESSED_DIR = "data/processed"
# Daily AirNow table read by load_airnow_daily by default.
AIRNOW_PATH = os.path.join(PROCESSED_DIR, "airnow_daily.csv")
# Destination CSV for the cleaned daily table.
AIRNOW_CLEAN_PATH = os.path.join(PROCESSED_DIR, "airnow_daily_clean.csv")

def load_airnow_daily(path=AIRNOW_PATH):
    """Read the daily AirNow CSV and show a quick preview.

    Parameters
    ----------
    path : str
        CSV file to read; defaults to ``AIRNOW_PATH``.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pandas.read_csv``.
    """
    airnow_frame = pd.read_csv(path)

    # Report the size first, then render the leading rows.
    frame_shape = airnow_frame.shape
    print("Raw AirNow daily shape:", frame_shape)
    preview = airnow_frame.head()
    display(preview)

    return airnow_frame

def assess_airnow_quality(df):
    """Print a short data-quality report for a daily AirNow table.

    Prints the column dtypes and the per-column missing-value counts, then
    displays ``describe()`` statistics for the pollutant columns, i.e. the
    columns whose upper-cased name is ``OZONE``, ``PM2.5``, ``PM25`` or
    ``PM_2_5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    None
    """
    print("\n=== Dtypes ===")
    print(df.dtypes)
    print("\n=== Missing values per column ===")
    print(df.isna().sum())
    print("\n=== Summary statistics for AQI columns ===")
    # str() keeps the match from failing on non-string column labels.
    aq_cols = [c for c in df.columns if str(c).upper() in ("OZONE", "PM2.5", "PM25", "PM_2_5")]
    if not aq_cols:
        # describe() raises ValueError on a frame with no columns, so report
        # the absence instead of crashing the quality report.
        print("No AQI columns found.")
        return
    display(df[aq_cols].describe(include="all"))

def clean_airnow_daily(df, aqi_min=0, aqi_max=500):
    """Return a cleaned copy of a daily AirNow table.

    Steps, in order:

    1. Parse ``date`` to datetimes and drop rows whose date cannot be parsed.
    2. Coerce the pollutant columns (upper-cased name ``OZONE``, ``PM2.5``,
       ``PM25`` or ``PM_2_5``) to numeric; unparseable values become NaN.
    3. Replace values outside ``[aqi_min, aqi_max]`` with NaN.
    4. Drop rows in which every pollutant column is NaN.
    5. Sort by ``date`` and reset the index.

    The cleaned shape is printed and the first rows are displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily table with a ``date`` column. It is not modified.
    aqi_min, aqi_max : numeric, optional
        Inclusive bounds of accepted values; default to 0 and 500.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    pollutant_names = ("OZONE", "PM2.5", "PM25", "PM_2_5")

    # assign() builds a new frame, so the caller's data is left untouched;
    # the explicit copy keeps the column assignments below warning-free.
    cleaned = (
        df.assign(date=pd.to_datetime(df["date"], errors="coerce"))
        .dropna(subset=["date"])
        .copy()
    )

    # str() keeps the match from failing on non-string column labels.
    aq_cols = [c for c in cleaned.columns if str(c).upper() in pollutant_names]

    for col in aq_cols:
        values = pd.to_numeric(cleaned[col], errors="coerce")
        # where() blanks out-of-range values with NaN and keeps the column
        # numeric. This avoids assigning pd.NA into a NumPy-backed column,
        # whose handling appears to differ between pandas versions.
        cleaned[col] = values.where(values.between(aqi_min, aqi_max))

    if aq_cols:
        cleaned = cleaned.dropna(subset=aq_cols, how="all")

    cleaned = cleaned.sort_values("date").reset_index(drop=True)
    print("\nCleaned AirNow daily shape:", cleaned.shape)
    display(cleaned.head())
    return cleaned

# Make sure the output directory exists before anything is written to it.
os.makedirs(PROCESSED_DIR, exist_ok=True)
# Load the daily table, report on its quality, then clean it.
air_raw = load_airnow_daily()
assess_airnow_quality(air_raw)
air_clean = clean_airnow_daily(air_raw)
# Persist the cleaned table without the DataFrame index column.
air_clean.to_csv(AIRNOW_CLEAN_PATH, index=False)
print("\nSaved cleaned AirNow daily to:", AIRNOW_CLEAN_PATH)


Raw AirNow daily shape: (14, 5)


Unnamed: 0,date,reporting_area,state_code,OZONE,PM2.5
0,2023-06-01,Northeast Urban,NJ,80.0,64.0
1,2023-06-02,Northeast Urban,NJ,126.0,71.0
2,2023-06-03,Northeast Urban,NJ,26.0,59.0
3,2023-06-04,Northeast Urban,NJ,31.0,22.0
4,2023-06-05,Northeast Urban,NJ,36.0,55.0



=== Dtypes ===
date               object
reporting_area     object
state_code         object
OZONE             float64
PM2.5             float64
dtype: object

=== Missing values per column ===
date              0
reporting_area    0
state_code        0
OZONE             0
PM2.5             0
dtype: int64

=== Summary statistics for AQI columns ===


Unnamed: 0,OZONE,PM2.5
count,14.0,14.0
mean,50.142857,87.428571
std,27.539781,56.58311
min,26.0,22.0
25%,33.75,58.25
50%,40.5,62.0
75%,49.5,87.0
max,126.0,228.0



Cleaned AirNow daily shape: (14, 5)


Unnamed: 0,date,reporting_area,state_code,OZONE,PM2.5
0,2023-06-01,Northeast Urban,NJ,80.0,64.0
1,2023-06-02,Northeast Urban,NJ,126.0,71.0
2,2023-06-03,Northeast Urban,NJ,26.0,59.0
3,2023-06-04,Northeast Urban,NJ,31.0,22.0
4,2023-06-05,Northeast Urban,NJ,36.0,55.0



Saved cleaned AirNow daily to: data/processed/airnow_daily_clean.csv
