In [75]:
library(rnoaa)
library(tidyverse)
library(lubridate)
library(rlang)
library(stringr)

In [76]:
# Load the GHCND station table through rnoaa. Later cells read its `id`,
# `latitude` and `longitude` columns to match cherry sites to stations.
# NOTE(review): the log below shows cached copies of the stations and
# inventory files being reused -- refresh the cache if newer stations matter.
stations <- ghcnd_stations()

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/ghcnd-stations.rds

date created (size, mb): 2022-02-12 20:25:20 (2.159)

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/ghcnd-inventory.rds

date created (size, mb): 2022-02-12 20:26:28 (2.669)



In [None]:
# Inspect which GHCND elements each station of interest reports.
# One call per station id, collected into a list named by station id so each
# set of element names is labeled in the output.
check_ids <- c(
  "USC00186350", # id assigned to washingtondc later in this notebook
  "GME00127786", # id assigned to liestal later in this notebook
  "JA000047759", # id assigned to kyoto later in this notebook
  "CA001108395"  # not referenced elsewhere in this notebook
)
lapply(
  setNames(check_ids, check_ids),
  \(station) names(
    ghcnd_search(
      stationid = station,
      date_min = "1950-01-01",
      date_max = "2022-01-31"
    )
  )
)

In [77]:
# Read the raw bloom records.
# Keep these data-frame names unchanged: they are used as function inputs.

# Prediction sites
washingtondc <- read.csv("raw/washingtondc.csv")
liestal <- read.csv("raw/liestal.csv")

# kyoto: keep the first four characters of `year` and parse them as an integer
kyoto <- read.csv("raw/kyoto.csv")
kyoto <- mutate(kyoto, year = strtoi(substr(year, 1, 4)))

# Multi-site collections
south_korea <- read.csv("raw/south_korea.csv")
swiss <- read.csv("raw/meteoswiss.csv")
japan <- read.csv("raw/japan.csv")

In [78]:
# Approximate geographic matching: round coordinates to 1 decimal place so
# cherry sites can later be joined to weather stations on the rounded values.

# Add `latr` / `longr`, the rounded copies of the latitude and longitude
# columns of `df`.
#   df        data frame holding the coordinate columns
#   lat_col   name of the latitude column (string)
#   long_col  name of the longitude column (string)
#   digits    number of decimal places to keep
# Returns `df` with the two extra columns.
add_rounded_coords <- function(df, lat_col = "lat", long_col = "long",
                               digits = 1) {
  df |>
    mutate(
      latr = round(.data[[lat_col]], digits),
      longr = round(.data[[long_col]], digits)
    )
}

# The station table names its coordinates differently from the site tables.
stations <- add_rounded_coords(stations, "latitude", "longitude")

japan <- add_rounded_coords(japan)
swiss <- add_rounded_coords(swiss)
south_korea <- add_rounded_coords(south_korea)


In [79]:
# Retrieve the approximate station id for every site (match on the rounded
# coordinates) and row-bind the three collections into `all`.

# Only the station id and the join keys are needed downstream.
# NOTE(review): `stations` appears to carry several rows per station (one per
# reported element); de-duplicating the keys first avoids multiplying every
# site row by that count during the merge -- confirm against the station table.
station_keys <- stations |>
  distinct(id, latr, longr)

japan_id <- merge(japan, station_keys, by = c("latr", "longr"))
swiss_id <- merge(swiss, station_keys, by = c("latr", "longr"))
# Stored under its own name (not back into `south_korea`) so that re-running
# this cell does not merge an already-merged table.
south_korea_id <- merge(south_korea, station_keys, by = c("latr", "longr"))

all <- rbind(japan_id, swiss_id, south_korea_id)

In [80]:
# Reduce the combined table to the station id plus the site and bloom columns
all <- select(
  all,
  all_of(c(
    "id", "location", "lat", "long", "alt",
    "year", "bloom_date", "bloom_doy"
  ))
)

In [81]:
# Tag each of the three prediction cities with its station id
kyoto <- mutate(kyoto, id = "JA000047759")
washingtondc <- mutate(washingtondc, id = "USC00186350")
liestal <- mutate(liestal, id = "GME00127786")

# Stack the three cities, then keep the same columns that `all` holds
all_pred <- rbind(kyoto, washingtondc, liestal)
all_pred <- select(
  all_pred,
  id, location, lat, long, alt, year, bloom_date, bloom_doy
)

In [257]:
# Append the prediction cities to the matched sites and drop duplicate rows
all <- distinct(rbind(all, all_pred))

In [258]:
# Climate elements to pull for every station
# NOTE(review): "swnd" does not look like a GHCN-Daily element code (compare
# "snwd", "awnd"); as written it would simply never match -- confirm intent.
para_list <- c("tmax", "tmin", "prcp", "swnd")

# Unique station ids across all locations
id_list <- unique(all[["id"]])

In [259]:
# Left join yearly climate averages onto the cherry data.
#
# For every element in `para_list`, the daily values of every station in
# `id_list` are divided by 10, averaged per (year, id) and joined onto `result`
# as a column named after the element. An element that no station reports is
# skipped and adds no column.
# NOTE(review): the division by 10 presumably converts GHCN-Daily
# tenths-of-unit values to whole units -- confirm against the GHCN-Daily readme.

result <- all

# Read every station once and reuse it for all elements. Calling
# ghcnd_search() inside the element loop re-read each station file once per
# element.
station_data <- lapply(
  setNames(id_list, id_list),
  \(station) ghcnd_search(
    stationid = station,
    date_min = "1950-01-01",
    date_max = "2022-01-31"
  )
)

# Yearly mean of element `para` computed from one station's daily table.
#   daily  daily table for that element, or NULL when the station lacks it
#   para   element name (string); also the name of the value column
# Returns an ungrouped table with columns year, id and `para`, or NULL when
# `daily` is NULL. `.groups = "drop"` also silences the grouped-output message.
yearly_mean <- function(daily, para) {
  if (is.null(daily)) {
    return(NULL)
  }
  daily |>
    mutate(
      year = as.integer(year(date)),
      "{para}" := .data[[para]] / 10
    ) |>
    drop_na() |>
    select(id, all_of(para), year) |>
    group_by(year, id) |>
    summarise("{para}" := mean(.data[[para]], na.rm = TRUE), .groups = "drop")
}

for (para in para_list) {
  per_station <- lapply(station_data, \(found) yearly_mean(found[[para]], para))
  per_station <- Filter(Negate(is.null), per_station)
  if (length(per_station) == 0) {
    next
  }
  # Stations contribute disjoint (year, id) rows, so a single join on the
  # stacked table gives the same column as joining station by station and
  # coalescing the resulting .x / .y pairs.
  result <- left_join(result, bind_rows(per_station), by = c("year", "id"))
}

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAM00047918.dly

date created (size, mb): 2022-02-21 23:37:19 (1.187)

file min/max dates: 1973-01-01 / 2021-10-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JA000047912.dly

date created (size, mb): 2022-02-22 00:04:24 (0.686)

file min/max dates: 1957-01-01 / 1989-12-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAM00047927.dly

date created (size, mb): 2022-02-22 00:04:50 (1.185)

file min/max dates: 1973-01-01 / 2021-10-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAW00042206.dly

date created (size, mb): 2022-02-22 00:04:54 (0.973)

file min/max dates: 1949-05-01 / 1968

In [261]:
# Keep only rows with no missing value in any column
result <- drop_na(result)

In [263]:
# Save the cleaned table, without the row-name column
write.csv(
  result,
  file = "processed/clean.csv",
  row.names = FALSE
)