In [1]:
library(rnoaa)
library(tidyverse)
library(lubridate)
library(rlang)
library(stringr)

"package 'rnoaa' was built under R version 4.1.2"
Registered S3 method overwritten by 'httr':
  method           from  
  print.cache_info hoardr

-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.1 --

[32mv[39m [34mggplot2[39m 3.3.5     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.1.6     [32mv[39m [34mdplyr  [39m 1.0.7
[32mv[39m [34mtidyr  [39m 1.2.0     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 2.0.2     [32mv[39m [34mforcats[39m 0.5.1

"package 'tibble' was built under R version 4.1.2"
"package 'tidyr' was built under R version 4.1.2"
-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubrida

In [2]:
# Station table from rnoaa; its id, latitude and longitude columns are used
# below to match bloom locations to weather stations. The call reports reading
# cached station and inventory files when they are already on disk.
stations <- ghcnd_stations()

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/ghcnd-stations.rds

date created (size, mb): 2022-02-12 20:25:20 (2.159)

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/ghcnd-inventory.rds

date created (size, mb): 2022-02-12 20:26:28 (2.669)



In [4]:
# Read the raw bloom tables. Keep these data frame names unchanged: they are
# used later as inputs to a function.

# Additional data sets: japan, meteoswiss and south_korea.
south_korea <- read.csv("raw/south_korea.csv")
swiss <- read.csv("raw/meteoswiss.csv")
japan <- read.csv("raw/japan.csv")

# The three prediction cities.
washingtondc <- read.csv("raw/washingtondc.csv")
liestal <- read.csv("raw/liestal.csv")
kyoto <- read.csv("raw/kyoto.csv")

# Keep only the first four characters of kyoto's year and parse them as an
# integer.
kyoto[["year"]] <- strtoi(substr(kyoto[["year"]], 1, 4))

In [5]:
# Approximate geographic matching between bloom locations and weather
# stations: coordinates are rounded to 1 decimal place so that both tables
# can later be joined on the rounded values.

# Append `latr` / `longr` columns holding the rounded coordinates.
#   df        data frame (or tibble) with one latitude and one longitude column
#   lat_col   name of the latitude column
#   long_col  name of the longitude column
#   digits    number of decimal places to keep
# Returns `df` with the two extra columns added at the end.
add_rounded_coords <- function(df, lat_col = "lat", long_col = "long",
                               digits = 1) {
  df |>
    mutate(
      latr = round(.data[[lat_col]], digits),
      longr = round(.data[[long_col]], digits)
    )
}

# The station table names its coordinate columns differently from the bloom
# tables, so the column names are passed explicitly.
stations <- add_rounded_coords(stations, "latitude", "longitude")

japan <- add_rounded_coords(japan)
swiss <- add_rounded_coords(swiss)
south_korea <- add_rounded_coords(south_korea)


In [6]:
# Retrieve approximate station ids by joining each bloom table to the station
# table on the rounded coordinates, then row bind the results.
# merge() keeps only rows that have a match on both sides, so locations
# without a station at the same rounded coordinates are dropped.
coord_keys <- c("latr", "longr")

japan_id <- merge(japan, stations, by = coord_keys)
swiss_id <- merge(swiss, stations, by = coord_keys)
# Stored under its own name (like japan_id / swiss_id) instead of overwriting
# `south_korea`, so re-running this cell does not merge already-merged data.
south_korea_id <- merge(south_korea, stations, by = coord_keys)

all <- rbind(japan_id, swiss_id, south_korea_id)

In [7]:
# Reduce the combined location table to the station id plus the bloom columns.
all <- all[c(
  "id", "location", "lat", "long", "alt", "year", "bloom_date", "bloom_doy"
)]

In [8]:
# Attach a fixed station id to each of the three prediction cities.
kyoto[["id"]] <- "JA000047759"
washingtondc[["id"]] <- "USC00186350"
liestal[["id"]] <- "GME00127786"

# Stack the three cities and keep the same columns as the `all` table.
all_pred <- select(
  rbind(kyoto, washingtondc, liestal),
  id, location, lat, long, alt, year, bloom_date, bloom_doy
)

In [9]:
# Append the prediction cities to the other locations and drop exact
# duplicate rows.
all <- distinct(rbind(all, all_pred))

In [10]:
# Station ids present in the combined table (each id once, in order of first
# appearance) and the climate parameters to pull for every station.
id_list <- unique(all[["id"]])

# NOTE(review): "swnd" does not look like a GHCN-Daily element code (SNWD is
# snow depth, AWND is average wind speed) — confirm which element is intended.
para_list <- c("tmax", "tmin", "prcp", "swnd")

In [14]:
# Left join yearly climate means to the cherry data.
#
# Each station's record is fetched once and reused for every parameter; the
# previous version called ghcnd_search() again for every (parameter, station)
# pair. For each parameter the per-station yearly means are stacked into one
# table and joined a single time. Every station contributes rows for its own
# id only, so this yields the same values as joining station by station and
# coalescing the resulting `.x` / `.y` columns.

# Yearly mean of one parameter for one station's daily table.
#   daily  daily table for a single parameter, as returned by ghcnd_search()
#   para   name of the value column in `daily` (e.g. "tmax")
# Returns one row per (year, id) with the mean stored under `para`.
yearly_mean <- function(daily, para) {
  daily |>
    mutate(
      year = as.integer(year(date)),
      # Every parameter is divided by 10, as before.
      # NOTE(review): confirm this scaling is right for each element.
      "{para}" := .data[[para]] / 10
    ) |>
    # Runs before select(), so a row with NA in any column of the daily table
    # is dropped, not only rows with a missing value.
    drop_na() |>
    select(id, all_of(para), year) |>
    group_by(year, id) |>
    summarise("{para}" := mean(.data[[para]], na.rm = TRUE), .groups = "drop")
}

station_data <- lapply(id_list, function(station_id) {
  ghcnd_search(stationid = station_id, date_min = "1950-01-01", date_max = "2022-01-31")
})

result <- all

for (para in para_list) {
  # Stations that do not report this parameter have no such list element.
  daily_tables <- lapply(station_data, function(station) station[[para]])
  daily_tables <- Filter(Negate(is.null), daily_tables)

  # No station reports this parameter: leave `result` without that column.
  if (length(daily_tables) == 0) {
    next
  }

  yearly <- bind_rows(lapply(daily_tables, yearly_mean, para = para))
  result <- left_join(result, yearly, by = c("year", "id"))
}

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAM00047918.dly

date created (size, mb): 2022-02-21 23:37:19 (1.187)

file min/max dates: 1973-01-01 / 2021-10-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JA000047912.dly

date created (size, mb): 2022-02-22 00:04:24 (0.686)

file min/max dates: 1957-01-01 / 1989-12-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAM00047927.dly

date created (size, mb): 2022-02-22 00:04:50 (1.185)

file min/max dates: 1973-01-01 / 2021-10-31

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/JAW00042206.dly

date created (size, mb): 2022-02-22 00:04:54 (0.973)

file min/max dates: 1949-05-01 / 1968

In [12]:
# result <- result|> 
    # drop_na()

In [13]:
# write.csv(result,"processed/clean.csv", row.names = FALSE)

In [None]:
# for (para in para_list){
#     for (id in id_list){
#         df <- ghcnd_search(stationid = id, date_min = "1950-01-01", date_max = "2022-01-31")[[para]]
#         if (is.null(df) == FALSE){
#             tr <- df |> 
#                 mutate(year = as.integer(year(date)),
#                     !!sym(para) := !!sym(para) / 10) |>
#                 drop_na() |>
#                 select(id, !!sym(para), year) |> 
#                 group_by(year, id) |> 
#                 summarise(!!sym(para) := mean(!!sym(para), na.rm = TRUE))
#             result <- left_join(result, tr, by = c("year", "id"))
            
#             para1 <- paste(para, ".x", sep = "")
#             para2 <- paste(para, ".y", sep = "")
#             if (para1 %in% names(result)){
#                 result <- result |> 
#                     mutate(!!sym(para) := coalesce(!!sym(para1), !!sym(para2))) |> 
#                     select(-!!sym(para1), -!!sym(para2))
#             }

#         }

#     }
# }

In [68]:
# Accumulated growing degree days (AGDD) per year for a single station.
#
# Daily GDD is the mean of tmax and tmin minus `base_temp`, floored at 0.
# Monthly sums are shifted forward by `lagnum` rows so that the months in
# `mlist` (Oct, Nov, Dec, Jan) line up with rows of one calendar year (Jan-Apr)
# and can be summed with a plain group-by on `year`.
# NOTE(review): the row shift assumes the monthly table has no missing months;
# a gap would attribute a month to the wrong season — confirm for each station.
station_id <- "GME00127786"
base_temp <- 0
mlist <- c(10, 11, 12, 1)
lagnum <- 3

# Fetch the station record once and take both temperature tables from it.
weather <- ghcnd_search(stationid = station_id, date_min = "1950-01-01", date_max = "2022-01-31")
d <- weather[["tmax"]]
f <- weather[["tmin"]]

df <- merge(d, f, by = c("date", "id")) |>
  select(date, id, tmax, tmin) |>
  mutate(
    year = year(date),
    month = month(date),
    # GHCN-Daily stores temperatures in tenths of a degree C.
    tmax = tmax / 10,
    tmin = tmin / 10
  ) |>
  # pmax() keeps NA as NA, so days with a missing temperature stay missing.
  mutate(gdd = pmax((tmax + tmin) / 2 - base_temp, 0)) |>
  group_by(id, year, month) |>
  summarise(agdd = sum(gdd), .groups = "drop") |>
  mutate(mu = if_else(month %in% mlist, "yes", "no")) |>
  mutate(
    lag_agdd = lag(agdd, lagnum),
    lag_mu = lag(mu, lagnum)
  ) |>
  # The first `lagnum` rows have NA lags and are dropped here as well.
  filter(lag_mu == "yes") |>
  group_by(id, year, lag_mu) |>
  summarise(agdd = sum(lag_agdd), .groups = "drop")

df

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/GME00127786.dly

date created (size, mb): 2022-02-21 03:07:32 (1.559)

file min/max dates: 1953-09-01 / 2021-12-31

using cached file: C:\Users\Maoli\AppData\Local/Cache/R/noaa_ghcnd/GME00127786.dly

date created (size, mb): 2022-02-21 03:07:32 (1.559)

file min/max dates: 1953-09-01 / 2021-12-31

`summarise()` has grouped output by 'id', 'year'. You can override using the `.groups` argument.

`summarise()` has grouped output by 'id', 'year'. You can override using the `.groups` argument.



id,year,lag_mu,agdd
<chr>,<dbl>,<chr>,<dbl>
GME00127786,1954,yes,654.25
GME00127786,1955,yes,750.40
GME00127786,1956,yes,694.25
GME00127786,1957,yes,567.65
GME00127786,1958,yes,618.60
GME00127786,1959,yes,667.90
GME00127786,1960,yes,683.40
GME00127786,1961,yes,684.35
GME00127786,1962,yes,777.15
GME00127786,1963,yes,502.85


In [None]:
# create agdd column and merge to final df