In [None]:
library(tidyverse)
library(lubridate)
library(forecast)
library(car)
library(boot)
library(zoo)
library(factoextra)
# Cities retained by the analysis; rows whose `City Name` is not in this
# vector are dropped when each pollutant table is cleaned.
cities <- c(
  "New York",
  "Los Angeles",
  "Chicago",
  "Houston",
  "Phoenix",
  "Philadelphia",
  "San Antonio",
  "San Diego",
  "Dallas",
  "San Jose"
)

# Fix the RNG seed so any random steps run after this point are repeatable.
set.seed(5100)

In [None]:

# Load the three daily 2023 CSV files, one per pollutant. The numeric code in
# each file name identifies the pollutant it is assigned to below:
#   88101 -> PM2.5, 44201 -> ozone (O3), 42602 -> nitrogen dioxide (NO2).
# NOTE(review): these look like EPA AQS parameter codes / daily summary
# files -- confirm against the data source.
# Paths are relative, so the notebook must be run from a directory that has
# `../data/` next to it.
pm25_raw <- read_csv("../data/daily_88101_2023.csv")
o3_raw   <- read_csv("../data/daily_44201_2023.csv")
no2_raw  <- read_csv("../data/daily_42602_2023.csv")

# Reduce one raw pollutant table to the three columns the analysis needs.
#
# Arguments:
#   df             Raw table with `City Name`, `Date Local` and
#                  `Arithmetic Mean` columns.
#   pollutant_name String used as the name of the value column in the result.
#
# Returns a table with columns City, Date and <pollutant_name>, keeping only
# rows whose city appears in the global `cities` vector. Rows are NOT
# aggregated here, so a city/date pair can still occur more than once.
clean_pollutant <- function(df, pollutant_name) {
  # Keep only the rows that belong to one of the target cities.
  city_rows <- filter(df, `City Name` %in% cities)

  # Narrow to the needed columns, giving the key columns their short names.
  trimmed <- select(
    city_rows,
    City = `City Name`,
    Date = `Date Local`,
    `Arithmetic Mean`
  )

  # Parse the local date with lubridate.
  trimmed <- mutate(trimmed, Date = ymd(Date))

  # Name the measurement column after the pollutant it holds.
  rename(trimmed, !!pollutant_name := `Arithmetic Mean`)
}

# Extract City / Date / value for each pollutant. At this stage there is still
# one row per monitoring-site record, so a City/Date pair can repeat.
pm25 <- clean_pollutant(pm25_raw, "PM25")
o3   <- clean_pollutant(o3_raw,   "O3")
no2  <- clean_pollutant(no2_raw,  "NO2")

# Collapse a cleaned pollutant table to one row per City/Date by averaging the
# remaining (value) column across that city's records for the day.
# `everything()` inside `across()` excludes the grouping columns, so only the
# pollutant column is averaged. NA readings are ignored.
daily_city_mean <- function(df) {
  df %>%
    group_by(City, Date) %>%
    summarise(
      across(everything(), ~ mean(.x, na.rm = TRUE)),
      .groups = "drop"
    )
}

# Aggregate BEFORE joining. The indicators have multiple monitoring sites per
# city, so joining the site-level tables on City/Date is a many-to-many join:
# every PM2.5 row is paired with every O3 row and every NO2 row for that day,
# multiplying the row count before it is averaged back down. Averaging each
# pollutant first yields the same City/Date means (each value is repeated
# equally often in the cross product) and the same set of City/Date keys, but
# keeps every join one-to-one.
# Note: `air_df` therefore holds one row per City/Date rather than the
# fanned-out site combinations.
air_df <- daily_city_mean(pm25) %>%
  inner_join(daily_city_mean(o3), by = c("City", "Date")) %>%
  inner_join(daily_city_mean(no2), by = c("City", "Date"))

# Final analysis table: one row per City/Date with PM25, O3 and NO2, ordered
# chronologically within each city.
air_df_clean <- air_df %>%
  arrange(City, Date)

[1mRows: [22m[34m847057[39m [1mColumns: [22m[34m29[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (16): State Code, County Code, Site Num, Datum, Parameter Name, Sample ...
[32mdbl[39m  (11): Parameter Code, POC, Latitude, Longitude, Observation Count, Obse...
[34mdate[39m  (2): Date Local, Date of Last Change

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m388292[39m [1mColumns: [22m[34m29[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (17): State Code, County Code, Site Num, Datum, Parameter Name, Sample ...
[32mdbl[39m  (10): Parameter Code, POC, Latitude, Longitude, Observation Count, Obse...
[34mdate[39m  (2

[90m# A tibble: 6 × 5[39m
  City    Date        PM25     O3   NO2
  [3m[90m<chr>[39m[23m   [3m[90m<date>[39m[23m     [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m
[90m1[39m Chicago 2023-03-01  9.17 0.026[4m4[24m 15.8 
[90m2[39m Chicago 2023-03-02  2.8  0.036[4m5[24m  9.47
[90m3[39m Chicago 2023-03-04 16.1  0.021[4m1[24m 19.6 
[90m4[39m Chicago 2023-03-07  3.15 0.038[4m1[24m  6.37
[90m5[39m Chicago 2023-03-10  3.3  0.033[4m9[24m 10.0 
[90m6[39m Chicago 2023-03-13  4.18 0.032[4m3[24m 11.9 



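The cleaned data contains 3,367 city-day rows rather than the full 3,650 (10 cities × 365 days). The inner joins keep only the days on which all three pollutants were recorded for a city, and some cities have missing days, some monitoring stations fail to record, etc.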