# University Certificate in Artificial Intelligence (Hands on AI, Third Challenge, 2022-2023, UMONS)
# Introduction to time series analysis and forecasting




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 5]


## White noise time series



- Generate a time series with 500 observations from a white noise process with zero mean and unit standard deviation.



In [None]:
# Hint: use np.random.normal (here through a seeded Generator, so that
# "Restart Kernel -> Run All" reproduces the same series and the same plots)

rng = np.random.default_rng(42)
white_noise = rng.normal(loc=0, scale=1, size=500)


- Plot the generated time series.


In [None]:
# Line plot of the simulated series against its observation number.
plt.plot(white_noise)
plt.title("Simulated white noise")
plt.xlabel("Observation")
plt.ylabel("Value")
# plt.show() renders the figure without echoing the list of Line2D objects.
plt.show()


- Compute and plot the ACF for 50 lags. Did you expect to see such results? Why?

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
# Hint: use plot_acf

# plot_acf returns the Figure it draws. Ending the cell with plt.show() rather
# than the bare call keeps the notebook from rendering the same figure twice.
plot_acf(white_noise, lags=50)
plt.show()


* Perform a Ljung-Box test for the first ten lags.

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox
# Hint: use acorr_ljungbox

# Ljung-Box test of the white-noise series, called with lags=10;
# the result is displayed as the cell output.
ljung_box_noise = acorr_ljungbox(white_noise, lags = 10)
ljung_box_noise

## Real-world time series

In [None]:

# Read the data file, turn the "Day" column into the (datetime) index and
# put the frame on a daily frequency.
DF = (
    pd.read_csv("../data/data_train.csv", parse_dates = True)
    .assign(Day=lambda frame: pd.to_datetime(frame['Day'], format='%Y-%m-%d'))
    .set_index("Day")
    .asfreq("D")
)
print(DF)


In [None]:
# Dimensions of the data frame as a (rows, columns) tuple
frame_shape = DF.shape
print(frame_shape)


- Compute the number of missing values per series.


In [None]:
# Hint: use isna()

# Count the missing entries of every column, then list the columns from the
# one with the most missing values to the one with the fewest.
missing_per_series = DF.isna().sum()
print(missing_per_series.sort_values(ascending=False))

- Replace the missing values with the method of your choice.


In [None]:
# Hint: you can use fillna()
# fillna(method=...) is deprecated in recent pandas releases, so the dedicated
# bfill()/ffill() methods are used instead (and the frame is reassigned rather
# than mutated in place). Back-filling copies the next valid observation into
# a gap; the additional forward fill covers gaps at the very end of a series,
# which have no later observation to copy from.
DF = DF.bfill().ffill()
print(DF.isna().sum().sort_values(ascending=False))


- Select one series (among "series-001", "series-002", ..., "series-111") and plot it.


In [None]:
# Name of the series studied in the rest of the notebook
# (any of "series-001", "series-002", ..., "series-111"), followed by its plot.
my_series = "series-003"
selected_series = DF[my_series]
selected_series.plot()

In [None]:
# Calendar variables derived from the date index (useful for seasonal plots):
# day of month, month, year, weekday and ISO week number.
dates = DF.index
DF = DF.assign(
    d=dates.day.to_numpy(),
    m=dates.month.to_numpy(),
    y=dates.year.to_numpy(),
    w=dates.weekday.to_numpy(),
    wy=dates.isocalendar().week.to_numpy(),
)
DF.head()

- Generate a seasonal plot with the day of the week in the x-axis.

In [None]:
# Seasonal plots (Day of the week)
# Hint: You could generate a data frame with the weekly series and plot it

# A week is identified by (ISO week, ISO year). Pairing the ISO week number
# with the *calendar* year would put the last days of December that already
# belong to ISO week 1 into the same column as the first week of that calendar
# year, and pivot_table would silently average the two.
iso_year = DF.index.isocalendar()["year"].to_numpy(dtype="int64")
weekly_frame = DF[[my_series, "wy", "w"]].assign(y=iso_year)
patterns_day_week = weekly_frame.pivot_table(index=['w'], columns=['wy', 'y'])

# One line per week, weekday on the x-axis.
plt.plot(patterns_day_week)
plt.xlabel("Day of the week (0 = Monday)")
plt.ylabel(my_series)
plt.show()




- Plot a boxplot for each day of the week.

In [None]:
# After transposing, each weekday is a column, so boxplot() draws one box per weekday.
patterns_day_week.T.boxplot()

- Generate a seasonal plot with the day of the month in the x-axis.

In [None]:
# Seasonal plots (Day of the month): one column per (month, year) pair,
# indexed by the day of the month.
month_columns = [my_series, "d", "m", "y"]
patterns_day_month = DF[month_columns].pivot_table(index=['d'], columns=['m', 'y'])
plt.plot(patterns_day_month)
plt.show()



In [None]:
# After transposing, each day of the month is a column, so boxplot() draws one box per day.
patterns_day_month.T.boxplot()

- Produce lagged scatterplots for lags 1, 3 and 7. What do you observe? Add the diagonal for a better visualization.

In [None]:
# Lag plot

def lag_plot(series, lag = -1, ls = 'r.'):
    """Scatter a series against a shifted copy of itself, with the diagonal.

    Parameters
    ----------
    series : pandas.Series
        Observations to plot; the shifted copy is ``series.shift(lag)``.
    lag : int, default -1
        Number of periods passed to ``shift``. A positive value puts the
        observation ``lag`` steps earlier on the x-axis; the default of -1
        puts the *next* observation there instead.
    ls : str, default 'r.'
        Matplotlib format string used for the scatter points.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    y_lag = series.shift(lag)
    # Diagonal y = x spanning the observed range: two points are enough, there
    # is no need to draw a segment through every observation.
    lowest, highest = series.min(), series.max()
    plt.plot([lowest, highest], [lowest, highest], 'b-')
    plt.plot(y_lag, series, ls)
    plt.xlabel(f"series shifted by {lag}")
    plt.ylabel("series")
    plt.title(f"Lag plot (lag = {lag})")
    plt.show()

# Lagged scatterplots of the selected series for lags 1, 3 and 7
for lag_value in (1, 3, 7):
    lag_plot(DF[my_series], lag = lag_value)



## Autocorrelation
* Plot the autocorrelation function (ACF) for the first 20 lags, and interpret the results. 



In [None]:
# Plot the selected series once more, as a reference for the ACF below.
DF.loc[:, my_series].plot()


In [None]:
# ACF of the selected series for lags up to 20 (alpha=0.05 for the confidence bands)
acf_options = {"lags": 20, "alpha": 0.05}
plot_acf(DF[my_series], **acf_options)
plt.show()


* Recompute the ACF after applying a seasonal difference. 


In [None]:
# Hint: a seasonal difference is y[t] - y[t-7].
# np.diff(x, 7) does NOT compute it: the second argument of np.diff is the
# difference *order* n, so it would difference the series seven times at lag 1.
SEASONAL_PERIOD = 7  # weekly seasonality of the daily series
seasonal_diff = DF[my_series].diff(SEASONAL_PERIOD).dropna()  # drop the first 7 undefined values
plot_acf(seasonal_diff, lags= 20, alpha=0.05)
plt.show()

- Compute and print the ACF values for the first 20 lags.

In [None]:
# acf is defined in statsmodels.tsa.stattools; the plotting module only
# happens to expose it because it imports it for its own use.
from statsmodels.tsa.stattools import acf
# Hint: use acf
# nlags is set explicitly: when omitted, the number of lags is derived from
# the series length instead of being the 20 lags asked for. The returned array
# starts at lag 0, so it holds 21 values.
print(acf(DF[my_series], nlags=20))

* Perform a Ljung-Box test for the series.

In [None]:
# Ljung-Box test of the selected series, called with period=7;
# the result is displayed as the cell output.
ljung_box_series = acorr_ljungbox(DF[my_series], period = 7)
ljung_box_series


## Transformations

* Apply a Box-Cox transformation with $\lambda = 0.5$, $\lambda = 0.3$ and $\lambda = 0$. Plot the transformed series.

In [None]:
# Box-Cox transformation: (y**lam - 1) / lam for lam != 0 and log(y) for lam == 0.
# The three requested values are 0.5, 0.3 and 0 (0.3, not 1/3).
# NOTE(review): the transformation assumes strictly positive observations;
# zeros give -inf for lam == 0 -- confirm the selected series has none.
observed = DF[my_series]
for lam in (0.5, 0.3, 0):
    transformed = np.log(observed) if lam == 0 else (observed ** lam - 1) / lam
    transformed.plot(title=f"Box-Cox transformation, lambda = {lam}")
    plt.show()


- Find the best value of $\lambda$ in the Box-Cox transformation, and plot the transformed series.

In [None]:
from scipy.stats import boxcox
# Small positive offset added before the transformation
# (scipy's boxcox requires strictly positive data).
eps = 0.0001
x = DF[my_series] + eps
# Hint: use boxcox on the x variable.

# With no lambda given, boxcox returns the transformed values together with
# the lambda it selected.
x, opt_lambda = boxcox(x)
print(opt_lambda)
# boxcox returns a plain array, so the dates are passed explicitly to keep the
# time axis instead of plotting against the observation number.
plt.plot(DF.index, x)
plt.show()

## Time series decomposition 


* Decompose the time series into trend, seasonal and remainder components. Plot the different components. Does it help you to understand the data?

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Hint: use seasonal_decompose

# Decompose the selected series with seasonal_decompose's default settings.
series_to_decompose = DF[my_series]
result = seasonal_decompose(series_to_decompose)


In [None]:
# result.plot() returns the Figure it draws. Ending the cell with plt.show()
# rather than the bare call keeps the notebook from rendering it twice.
result.plot()
plt.show()

* Use the STL decomposition algorithm to decompose the series into trend, seasonal and remainder components.

In [None]:
from statsmodels.tsa.seasonal import STL
# Hint: Use STL with period = 7

# seasonal is the length of the seasonal smoother: 7*30 + 1 = 211, an odd
# number spanning roughly 30 weekly cycles.
stl = STL(DF[my_series], period = 7, robust = True, seasonal = 7*30 + 1)
result = stl.fit()
# plt.show() instead of a bare result.plot() as the last expression, so the
# returned Figure is not rendered a second time.
result.plot()
plt.show()






- Plot the deseasonalized series, i.e. $z_t = y_t - s_t$.

In [None]:
# Deseasonalized data: what remains once the seasonal component is removed,
# i.e. the sum of the remainder and trend components of the decomposition.
deaseasonlized_data = result.resid + result.trend
deaseasonlized_data.plot()




## Forecasting

Split the time series into a training set and a test set, where the test set is composed of the last 21 observations.

In [None]:
series = DF[my_series]

# Hold out the last n_test observations as the test set.
n_test = 21
series_train = series.iloc[:-n_test]
series_test = series.iloc[-n_test:]

# Training part followed by the held-out part, on the same axes.
for part in (series_train, series_test):
    plt.plot(part)

* Compute the in-sample one-step ahead predictions for simple forecasting methods (mean, naive, and seasonal naive).

In [None]:
## Mean forecasts
# The forecast for observation i (i >= 1) is the mean of observations 0..i-1.
# An expanding mean yields all of these prefix means in one pass instead of
# recomputing a mean over a growing slice for every i; its last value would be
# the forecast for the first test observation, so it is dropped.
fit_mean = series_train.expanding().mean().iloc[:-1].tolist()
resid_mean = series_train.values[1:] - fit_mean

In [None]:

## Naive forecasts
# The forecast for observation i (i >= 1) is observation i-1.
# .iloc is used because the series is indexed by dates: integer keys passed to
# series_train[...] are a deprecated positional fallback in recent pandas.
# The result stays a plain list: later cells concatenate slices of it with `+`.
fit_naive = series_train.iloc[:-1].tolist()
resid_naive = series_train.values[1:] - fit_naive

In [None]:

## Seasonal naive forecasts
# For the first week, you can compute a naive forecast (non-seasonal)

# The forecast for observation i (i >= 7) is observation i-7. Positions are
# addressed with .iloc because the series is indexed by dates (integer keys in
# series_train[...] are a deprecated positional fallback in recent pandas).
fit_snaive = [series_train.iloc[i-7] for i in range(7, len(series_train)) ]
# Observations 1..6 have no value one week earlier: reuse their naive forecasts.
fit_snaive = fit_naive[:6] + fit_snaive
resid_snaive = series_train.values[1:] - fit_snaive

* Plot a histogram of residuals for the three methods.

In [None]:
# Gather the residuals of the three methods in one frame (one column per
# method) and overlay their histograms.
resid_columns = {
    'resid_mean': resid_mean,
    'resid_naive': resid_naive,
    'resid_snaive': resid_snaive,
}
resid = pd.DataFrame(resid_columns)
resid.plot.hist(bins = 20, histtype = "step")

* Compute the bias for each method. Which method has a higher bias?


In [None]:
# Bias of each method: the column-wise mean of its residuals
resid.mean(axis=0)

* Compute the mean squared error (MSE).


In [None]:
# Mean squared error of each method: the mean of its squared residuals
squared_resid = resid**2
squared_resid.mean()

* Plot the ACF for the first 20 lags. Which method has a better fit and why?


In [None]:
# Plot the ACF for the first 20 lags. Which method has a better fit and why?
residuals_by_method = (
    (resid_mean, "mean"),
    (resid_naive, "naive"),
    (resid_snaive, "seasonal naive"),
)
for method_residuals, method_name in residuals_by_method:
    plot_acf(method_residuals, lags= 20, alpha=0.05, title = method_name)
# plot_acf returns its Figure; closing the cell with plt.show() keeps the last
# figure from being rendered a second time as the cell's output value.
plt.show()

* For each method, plot the predictions and the true in-sample values.


In [None]:
# The in-sample forecasts start at the second training observation, hence the
# index shifted by one.
fit_index = series_train.index[1:]
fit_mean_df = pd.DataFrame(fit_mean, index = fit_index)
fit_naive_df = pd.DataFrame(fit_naive, index = fit_index)
fit_snaive_df = pd.DataFrame(fit_snaive, index = fit_index)

# One figure per method: training series first, then that method's forecasts.
fits_and_colors = (
    (fit_mean_df, 'orange'),
    (fit_naive_df, 'green'),
    (fit_snaive_df, 'red'),
)
for fitted_frame, line_color in fits_and_colors:
    plt.figure()
    plt.plot(series_train)
    plt.plot(fitted_frame, color=line_color)

* Implement a new forecasting method which computes the forecast for $y_{t}$ by taking the average of $y_{t-1}, y_{t-7}, y_{t-14}$.
For the first two weeks, you can use the seasonal naive forecasts.


In [None]:
# The forecast for observation i (i >= 14) is the average of observations
# i-1, i-7 and i-14. Positions are addressed with .iloc because the series is
# indexed by dates (integer keys in series_train[...] are a deprecated
# positional fallback in recent pandas).
fit_snaive2 = [
    (series_train.iloc[i-1] + series_train.iloc[i-7] + series_train.iloc[i-14]) / 3
    for i in range(14, len(series_train))
]
# Observations 1..13 lack two weeks of history: reuse their seasonal naive forecasts.
fit_snaive2 = fit_snaive[:13] + fit_snaive2
resid_snaive2 = series_train.values[1:] - fit_snaive2



Compare this new method with the seasonal naive method (e.g. histogram of residuals and ACF plot).

In [None]:
# Residuals of the seasonal naive method and of the averaged method, side by
# side. The second column label was misspelled 'resid_snaiv2e', which showed
# up in the histogram legend and in the printed MSE table.
resid = pd.DataFrame({'resid_snaive': resid_snaive, 'resid_snaive2': resid_snaive2})
resid.plot.hist( bins = 20, histtype = "step")

plot_acf(resid_snaive, lags= 20, alpha=0.05, title = "seasonal naive")
plot_acf(resid_snaive2, lags= 20, alpha=0.05, title = "new method")

# Mean squared error of both methods
print( (resid**2).mean() )



* Compute $21$-step ahead out-of-sample forecasts for the different methods.
* Plot the forecasts and the true values.

In [None]:
# Out-of-sample forecasts

period = 7
T = len(series_train)
HORIZON = n_test

## Mean: every step is forecast by the mean of the training series
meanf = series_train.mean()
f_mean = pd.DataFrame([meanf for h in range(0, HORIZON) ], index = series_test.index)

## Naive: every step is forecast by the last training observation
# (.iloc because the series is indexed by dates; series_train[-1] relies on a
# deprecated positional fallback)
last_value = series_train.iloc[-1]
f_naive = pd.DataFrame([last_value for h in range(0, HORIZON) ], index = series_test.index)


## Seasonal naive: every step is forecast by the most recent training
# observation that falls on the same position within the period, i.e. the
# last `period` training values repeated. Step h (0-based) reads position
# T + h - period * (h // period + 1), which always lies in the last period of
# the training set. Replaying the last HORIZON observations instead would
# forecast the first week from values that are three weeks old.
f_snaive = [series_train.iloc[T + h - period * (h // period + 1)] for h in range(0, HORIZON) ]
f_snaive = pd.DataFrame(f_snaive, index = series_test.index)

plt.plot(series_test, label='true')
plt.plot(f_mean, label='mean')
plt.plot(f_naive, label='naive')
plt.plot(f_snaive, label='snaive')
plt.legend(loc='upper right')

