In [4]:
suppressMessages(library(tidyverse))
suppressMessages(library(tsibble))
suppressMessages(library(fable))
suppressMessages(library(tsibble))
suppressMessages(library(fabletools))
suppressMessages(library(feasts))
suppressMessages(library(lubridate))
suppressMessages(library(scales))
suppressMessages(library(fpp3))
suppressMessages(library(gridExtra))
suppressMessages(library(Mcomp))

"package 'Mcomp' was built under R version 4.3.3"


## Practical Issues

### Very Short Time Series

In [7]:
# Flatten one M3 entry `z` into a plain tibble.
#
# The series in `z$x` and `z$xx` are each converted with as_tsibble(),
# labelled "Training" and "Test" respectively in a `Type` column, and stacked.
# The entry's metadata fields (st, type, period, description, sn) are then
# attached to every row, and the result is returned as a tibble.
m3totsibble <- function(z) {
  training <- mutate(as_tsibble(z$x), Type = "Training")
  test <- mutate(as_tsibble(z$xx), Type = "Test")

  labelled <- mutate(
    bind_rows(training, test),
    st = z$st,
    type = z$type,
    period = z$period,
    description = z$description,
    sn = z$sn
  )

  as_tibble(labelled)
}

# Yearly M3 series with at most 20 rows per series (`sn`), counting the
# training and test rows stacked by m3totsibble(), rebuilt as a keyed tsibble.
short <- purrr::map_dfr(subset(Mcomp::M3, "yearly"), m3totsibble) |>
  add_count(sn, name = "n") |>
  filter(n <= 20) |>
  as_tsibble(index = index, key = c(sn, period, st))

In [8]:
# Fit one ARIMA model per short series; the model column is named `arima`.
short_fit <- model(short, arima = ARIMA(value))

In [12]:
# Preview the first ten fitted models.
head(short_fit, 10)

sn,period,st,arima
<chr>,<chr>,<chr>,<model>
N0001,YEARLY,Y1,"<ARIMA(1,2,0)>"
N0002,YEARLY,Y2,"<ARIMA(1,0,0) w/ mean>"
N0003,YEARLY,Y3,"<ARIMA(2,0,0) w/ mean>"
N0004,YEARLY,Y4,"<ARIMA(0,1,0)>"
N0005,YEARLY,Y5,"<ARIMA(1,0,0) w/ mean>"
N0006,YEARLY,Y6,"<ARIMA(0,1,0) w/ drift>"
N0007,YEARLY,Y7,"<ARIMA(0,1,0)>"
N0008,YEARLY,Y8,"<ARIMA(0,1,1)>"
N0009,YEARLY,Y9,"<ARIMA(0,1,0) w/ drift>"
N0010,YEARLY,Y10,"<ARIMA(2,0,0)>"


### Missing Values

In [None]:
# Load the raw BAMLHYH0A0HYM2TRIV series from CSV and preview the first rows.
high_yield_bond_tr <- read.csv("data/BAMLHYH0A0HYM2TRIV.csv")
head(high_yield_bond_tr)

Convert to a tsibble and perform assessment of missing values.

In [None]:
# Rename the value column, coerce it to numeric, parse DATE as a date and
# index the table by DATE as a tsibble.
high_yield_bond_tr <- high_yield_bond_tr %>%
  rename(RETURN_INDEX = BAMLHYH0A0HYM2TRIV) %>%
  mutate(
    RETURN_INDEX = as.numeric(RETURN_INDEX),
    DATE = ymd(as.Date(DATE))
  ) %>%
  as_tsibble(index = DATE)

head(high_yield_bond_tr)

Explicit Missing Values

In [None]:
# Rows that are present but whose RETURN_INDEX is NA.
head(filter(high_yield_bond_tr, is.na(RETURN_INDEX)))

Implicit Missing Values

In [None]:
# Index values that are absent from the tsibble altogether.
head(scan_gaps(high_yield_bond_tr))

In [None]:
# Make the implicit gaps explicit by inserting rows for the missing dates.
high_yield_bond_tr <- fill_gaps(high_yield_bond_tr)

head(high_yield_bond_tr)

### Filling Missing Values

### Finding and Filling Outliers

### Dealing with Weekly Data
- Difficult to work with since seasonal period is large and non-integer.  
- Most years have 52 weeks, but there is the occasional 53 weeks.
- Starts on a different day each year.
- Weeks will start on different days.
- Simplest approach is using STL decomposition then a non-seasonal method.

In [None]:
# STL decomposition of Barrels, with a non-seasonal ETS model fitted to the
# seasonally adjusted component.
my_dcmp_spec <- decomposition_model(
  STL(Barrels),
  ETS(season_adjust ~ season("N"))
)

# Fit the decomposition model, forecast two years ahead and plot the
# forecasts over the observed series.
us_gasoline %>%
  model(stl_ets = my_dcmp_spec) %>%
  forecast(h = "2 years") %>%
  autoplot(us_gasoline) +
  labs(
    y = "Millions of barrels per day",
    title = "Weekly US gasoline production"
  )

Alternatively, harmonic regression can be used.
Finding `K` can be computationally expensive, though.

In [None]:
# Harmonic regression: ARIMA on Barrels with PDQ(0, 0, 0) and Fourier terms
# (K = 6) as regressors.
gas_dhr <- model(
  us_gasoline,
  dhr = ARIMA(Barrels ~ PDQ(0, 0, 0) + fourier(K = 6))
)

# Forecast two years ahead and plot over the observed series.
gas_dhr %>%
  forecast(h = "2 years") %>%
  autoplot(us_gasoline) +
  labs(
    y = "Millions of barrels per day",
    title = "Weekly US gasoline production"
  )

- STL is better if the seasonality is changing over time.  
- Harmonic will be better if you have additional regressors to include.

### Daily and Sub-Daily
- Multiple seasonal periods  
- If time series is short there may be only one seasonal pattern  
- If long enough use STL, harmonic regression, or Prophet.  
- Seasonality with moving holidays will be more challenging; those will need to be handled with dummy variables using ARIMA or Prophet.

### Modeling Counts
- Discussed models assume the data is continuous.  
- Often data is of counts, e.g., number of products sold, number of people, and the result could only be integers.  
- Generally doesn't matter if the counts are somewhat large, e.g., above 100.  
- For smaller counts, assuming the data is continuous could have implications on forecasting performance.  
- Also need to constrain the forecasts to be non-negative, i.e., cannot have negative people.
- A method is `Croston's method`, which is really an approximation of count data rather than explicitly a method for handling count data.

Croston's Method:
- Construct two new series $\alpha$ and $q$, where $q_i$ is the $i$th non-zero count and $\alpha_i$ is the time between $q_{i-1}$ and $q_i$.  
- Separate exponential smoothing forecasts for each new series.  
- Usually applied to timing of demand and demand for items.

In [None]:
# Total Scripts for the ATC2 group "J06".
j06 <- PBS %>%
  filter(ATC2 == "J06") %>%
  summarise(Scripts = sum(Scripts))

autoplot(j06, Scripts) +
  labs(
    y = "Number of scripts",
    title = "Sales for immune sera and immunoglobulins"
  )

In [None]:
# Fit Croston's method to the J06 script counts and show the fitted model.
j06_croston <- model(j06, CROSTON(Scripts))

j06_croston

In [None]:
# Forecast 12 steps ahead from the Croston model.
forecast(j06_croston, h = 12)

### Constraining Forecasts to a Range

In [None]:
# Keep only the rows where an egg price is recorded.
egg_prices <- filter(prices, !is.na(eggs))

# ETS with an additive trend on the raw prices, forecast 50 steps ahead.
# The horizontal line at zero shows where the forecasts fall relative to zero.
egg_prices %>%
  model(ETS(eggs ~ trend("A"))) %>%
  forecast(h = 50) %>%
  autoplot(egg_prices) +
  geom_hline(yintercept = 0) +
  ylim(-100, 400) +
  labs(
    title = "Annual egg prices (Modeling Egg Price)",
    y = "$US (in cents adjusted for inflation) "
  )

Model the log to avoid the issue.

In [None]:
# Keep only the rows where an egg price is recorded.
egg_prices <- filter(prices, !is.na(eggs))

# Same additive-trend ETS, but fitted to log(eggs).
egg_prices %>%
  model(ETS(log(eggs) ~ trend("A"))) %>%
  forecast(h = 50) %>%
  autoplot(egg_prices) +
  geom_hline(yintercept = 0) +
  ylim(-100, 400) +
  labs(
    title = "Annual egg prices (Modeling Log(Egg Price))",
    y = "$US (in cents adjusted for inflation) "
  )

To constrain the forecasts to a range, a custom transformation function is needed.

In [None]:
# Scaled logit: maps values in the open interval (lower, upper) onto the
# real line.
#   x      numeric vector of values between `lower` and `upper`
#   lower  lower bound of the interval (default 0)
#   upper  upper bound of the interval (default 1)
# Returns log((x - lower) / (upper - x)).
scaled_logit <- function(x, lower = 0, upper = 1) {
  log((x - lower) / (upper - x))
}

# Inverse of scaled_logit(): maps real values back into (lower, upper).
#   x      numeric vector on the scaled-logit scale
#   lower  lower bound of the target interval (default 0)
#   upper  upper bound of the target interval (default 1)
# Returns (upper - lower) * logistic(x) + lower.
inv_scaled_logit <- function(x, lower = 0, upper = 1) {
  # plogis(x) is the logistic function exp(x) / (1 + exp(x)). Writing the
  # ratio out by hand gives Inf / Inf = NaN once exp(x) overflows (x above
  # roughly 709); plogis() saturates at 1 instead, so large inputs map to
  # `upper` rather than NaN.
  (upper - lower) * stats::plogis(x) + lower
}
# Register the scaled logit and its inverse as a transformation pair.
my_scaled_logit <- new_transformation(scaled_logit, inv_scaled_logit)

# Additive-trend ETS on the scaled-logit of egg prices with bounds 50 and 400,
# forecast 50 steps ahead.
egg_prices %>%
  model(
    ETS(my_scaled_logit(eggs, lower = 50, upper = 400) ~ trend("A"))
  ) %>%
  forecast(h = 50) %>%
  autoplot(egg_prices) +
  geom_hline(yintercept = 0) +
  ylim(-100, 400) +
  labs(
    title = "Annual egg prices",
    y = "$US (in cents adjusted for inflation) "
  )

### Residual CUSUM

In [None]:
# Simulate 120 monthly standard-normal draws starting January 2000 (seeded
# for reproducibility) and plot their autocorrelation function.
set.seed(1243)
white_noise <- ts(rnorm(120), start = c(2000, 1), frequency = 12)
white_noise |>
  as_tsibble() |>
  ACF() |>
  autoplot()

In [None]:
# Q-Q plot of the time index against the simulated series with a 95%
# confidence band.
# The index is built here with seq_along() rather than read from `trend`,
# because `trend` is only assigned in a later cell and is undefined when the
# notebook is run top to bottom. `xlab` keeps the axis label the original
# call produced.
qqplot(
  seq_along(white_noise),
  white_noise,
  xlab = "trend",
  conf.level = 0.95
)

In [None]:
# Normal Q-Q plot of the simulated series.
qqnorm(y = white_noise)

In [None]:
# Linear time index, one value per observation of `white_noise`.
trend <- seq_len(120)

# Recursive-CUSUM fluctuation process for the regression of the series on
# the time index, then plot it.
white_noise_sc <- strucchange::efp(white_noise ~ trend, type = "Rec-CUSUM")
plot(white_noise_sc)

### Forecast Combinations

### Prediction Intervals for Bagging

### Training and Test Sets