## Los Angeles Plot
```{r}
# city_name <- "Los Angeles"

# la_df <- air_df %>%
#   filter(City == city_name) %>%
#   arrange(Date)

# la_long <- la_df %>%
#   pivot_longer(cols = c(PM25, O3, NO2),
#                names_to = "Pollutant",
#                values_to = "Value")

# Keep only the Los Angeles rows of air_df_clean, then stack the three
# pollutant columns (PM25, O3, NO2) into a Pollutant / Value pair so each
# pollutant can be drawn in its own panel.
la_long <- pivot_longer(
  filter(air_df_clean, City == "Los Angeles"),
  cols = c(PM25, O3, NO2),
  names_to = "Pollutant",
  values_to = "Value"
)

# One line chart per pollutant, stacked in a single column. Each panel gets
# its own y-axis range (scales = "free_y").
ggplot(data = la_long, mapping = aes(x = Date, y = Value)) +
  geom_line(colour = "steelblue") +
  labs(
    title = "Daily Pollution Levels in Los Angeles (2023)",
    x = "Date",
    y = "Concentration"
  ) +
  facet_wrap(~ Pollutant, ncol = 1, scales = "free_y") +
  theme_minimal(base_size = 14)
```

<!-- -------------------------------------------------- -->
## Format Data

### Create Data Frame
```{r}
# Reshape one pollutant into a wide table: one row per city (stored as row
# names) and one column per date.
#
# df        - long table with City, Date and the pollutant column.
# pollutant - name of the pollutant column, as a string (e.g. "PM25").
#
# names_sort = TRUE orders the date columns by date instead of by first
# appearance in `df`, so each row reads as a chronological series. The
# interpolation done later along the rows relies on that ordering.
widen_pollutant <- function(df, pollutant) {
  df %>%
    select(City, Date, all_of(pollutant)) %>%
    pivot_wider(
      names_from = Date,
      values_from = all_of(pollutant),
      names_sort = TRUE
    ) %>%
    column_to_rownames("City")
}

pm25_matrix <- widen_pollutant(air_df_clean, "PM25")
o3_matrix <- widen_pollutant(air_df_clean, "O3")
no2_matrix <- widen_pollutant(air_df_clean, "NO2")

# Make a complete matrix with all three pollutants.
# The three tables share the same date column names, so prefix each block
# with its pollutant to keep the combined column names unique.
ts_matrix <- cbind(
  setNames(pm25_matrix, paste0("PM25_", names(pm25_matrix))),
  setNames(o3_matrix, paste0("O3_", names(o3_matrix))),
  setNames(no2_matrix, paste0("NO2_", names(no2_matrix)))
)
```

### Scale Data
```{r}
# Named vector: number of missing values in each column of ts_matrix.
na_col_sums <- colSums(is.na(ts_matrix))

# Z-score every column: subtract the column mean and divide by the column
# standard deviation.
ts_scaled <- scale(ts_matrix, center = TRUE, scale = TRUE)
```

## Impute Missing Values
```{r}
# k-means cannot handle missing values, so this earlier attempt on the
# un-imputed `ts_scaled` did not work:
# km3 <- kmeans(ts_scaled, centers = 3, nstart = 25)
# km3$cluster
#
# Missing values are filled separately for every city AND every pollutant.
# Imputing along a whole row of the combined matrix would interpolate across
# the PM25 / O3 / NO2 boundaries and average values of different pollutants.

# Fill the gaps in each row of one pollutant's city-by-date table.
#
# wide - table (data frame or matrix) with one row per city and one column
#        per date, columns in chronological order.
#
# Step 1: linear interpolation between neighbouring observed values.
# Step 2: anything still missing (e.g. leading/trailing gaps) is replaced by
#         the mean of that row.
# Returns a numeric matrix with the same dimensions and dimnames as `wide`.
impute_series_rows <- function(wide) {
  values <- as.matrix(wide)
  filled <- apply(values, 1, function(series) {
    series <- na.approx(series, na.rm = FALSE)
    series[is.na(series)] <- mean(series, na.rm = TRUE)
    series
  })
  # apply() returns one column per city (or a bare vector when there is only
  # one date), so reshape explicitly before transposing back to city rows.
  filled <- t(matrix(filled, nrow = ncol(values), ncol = nrow(values)))
  dimnames(filled) <- dimnames(values)
  filled
}

pollutant_tables <- list(
  PM25 = pm25_matrix,
  O3 = o3_matrix,
  NO2 = no2_matrix
)

# Impute each pollutant on its own, then prefix the date columns with the
# pollutant name so the combined matrix has unique column names.
imputed_blocks <- lapply(names(pollutant_tables), function(pollutant) {
  block <- impute_series_rows(pollutant_tables[[pollutant]])
  colnames(block) <- paste(pollutant, colnames(block), sep = "_")
  block
})
ts_matrix_imputed <- do.call(cbind, imputed_blocks)

sum(is.na(ts_matrix_imputed)) # should be 0

# Rescale
ts_scaled <- scale(ts_matrix_imputed)
```

## Clustering
```{r}
# kmeans() picks its starting centers at random (25 restarts here), so fix
# the RNG seed to keep the cluster assignments and their numbering the same
# every time the document is knit.
set.seed(123)
km3 <- kmeans(ts_scaled, centers = 3, nstart = 25)

# Cluster assigned to each row of ts_scaled.
km3$cluster
```

```{r}
# Visualize the km3 cluster assignments over the scaled data, drawing each
# observation as a point.
fviz_cluster(
  km3,
  data = ts_scaled,
  geom = "point",
  repel = TRUE
)
```

## ARIMA 

### Test: Is ARIMA valid?
```{r}
# tseries::adf.test(ts_series)
# ndiffs(ts_series)
# nsdiffs(ts_series)
# checkresiduals(fit)

```

### ARIMA Modeling, Ljung-Box Test
```{r}
# Fit one ARIMA model per city and pollutant, run the residual diagnostics
# and produce a 30-day forecast. Results are stored in three lists keyed by
# "<city>_<pollutant>".

# Cities to model (matched against air_df_clean$City)
cities <- c(
  "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
  "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"
)

# Pollutants (column names in air_df_clean)
pollutants <- c("PM25", "O3", "NO2")

# Storage lists
arima_models <- list()
arima_diagnostics <- list()
arima_forecasts <- list()

# Loop through each city and pollutant
for (city in cities) {
  cat("\n===========================\n")
  cat("CITY:", city, "\n")
  cat("===========================\n\n")

  # Rows for this city in chronological order
  city_df <- air_df_clean %>%
    filter(City == city) %>%
    arrange(Date)

  for (poll in pollutants) {

    cat("---- Pollutant:", poll, "----\n")

    # Extract time series
    ts_data <- city_df[[poll]]

    # A city with no rows (or a series with no observed values) cannot be
    # modelled; skip it with a warning instead of aborting the whole loop.
    if (length(ts_data) == 0 || all(is.na(ts_data))) {
      warning(
        "No ", poll, " observations for ", city, "; skipping.",
        call. = FALSE
      )
      next
    }

    # Convert to a daily ts object with a yearly seasonal period.
    # NOTE(review): if the data cover only about one year, a period of 365
    # leaves fewer than two full cycles, so seasonal terms probably cannot be
    # estimated - confirm whether a weekly period (frequency = 7) is intended.
    ts_series <- ts(ts_data, frequency = 365)

    # Fit ARIMA model (exhaustive, non-approximate search)
    fit <- auto.arima(
      ts_series,
      seasonal = TRUE,
      stepwise = FALSE,
      approximation = FALSE
    )

    # Key shared by the three result lists
    model_key <- paste(city, poll, sep = "_")

    # Save model
    arima_models[[model_key]] <- fit

    # Diagnostics (residual check, no plots)
    diag <- checkresiduals(fit, plot = FALSE)
    arima_diagnostics[[model_key]] <- diag

    # Forecast 30 days ahead
    fcast <- forecast(fit, h = 30)
    arima_forecasts[[model_key]] <- fcast

    # Print important results.
    # fit$arma stores the orders as (p, q, P, Q, m, d, D), which is easy to
    # misread as (p, d, q); print the labelled orders instead.
    model_order <- arimaorder(fit)
    cat(
      "ARIMA Model:",
      paste(names(model_order), model_order, sep = "=", collapse = ", "),
      "\n"
    )
    cat("AIC:", fit$aic, "\n")
    cat("Residual Ljung-Box p-value:", diag$p.value, "\n\n")
  }
}
```

This output shows that many of the models fail the Ljung-Box test (p-value below 0.05), meaning their residuals are still autocorrelated. Failing Ljung-Box is extremely common in meteorological and pollutant time series. This is because the data have strong seasonality, periodic cycles, weather shocks that ARIMA cannot model, long-memory behavior, non-linear patterns, and calendar effects (weekday traffic, holidays). But what about the models whose Ljung-Box p-values are above 0.05?
