In [2]:
# importeer libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
import seaborn as sns
sns.set_style('whitegrid')
import statsmodels
import scipy
from scipy.stats import norm
import math
import numpy as np
import statsmodels.tsa.seasonal
import statsmodels.graphics.tsaplots
import matplotlib
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="statsmodels.tsa.statespace.sarimax")


In [3]:
df = pd.read_csv('helpdesk.csv')
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
aantallen = df.groupby(['end_date']).size().reset_index(name='aantal')
aantallen[:20]

Unnamed: 0,end_date,aantal
0,2025-04-03,2
1,2025-04-04,2
2,2025-04-05,2
3,2025-04-06,2
4,2025-04-07,3
5,2025-04-08,7
6,2025-04-09,3
7,2025-04-10,1
8,2025-04-11,4
9,2025-04-12,4


In [4]:
def adf_test(timeseries):
    print("Results of Dickey-Fuller Test:")
    dftest = statsmodels.tsa.stattools.adfuller(timeseries, autolag="AIC")
    dfoutput = pd.Series(dftest[0:4], index=["Test Statistic", "p-value", "#Lags Used", "Number of Observations Used",],)
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)

def kpss_test(timeseries):
    print("Results of KPSS Test:")
    kpsstest = statsmodels.tsa.stattools.kpss(timeseries, regression="c",
    nlags="auto")
    kpss_output = pd.Series(kpsstest[0:3], index=["Test Statistic", "p-value", "Lags Used"])
    for key, value in kpsstest[3].items():
        kpss_output["Critical Value (%s)" % key] = value
    print(kpss_output)


# voer ADF-test uit
adf_test(aantallen['aantal'])
kpss_test(aantallen['aantal'])

Results of Dickey-Fuller Test:
Test Statistic                  -2.542557
p-value                          0.105460
#Lags Used                       6.000000
Number of Observations Used    205.000000
Critical Value (1%)             -3.462658
Critical Value (5%)             -2.875744
Critical Value (10%)            -2.574341
dtype: float64
Results of KPSS Test:
Test Statistic           1.856092
p-value                  0.010000
Lags Used                9.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = statsmodels.tsa.stattools.kpss(timeseries, regression="c",


In [5]:
# functie voor plotten van time series
def plot_series(df, series_name, lags=40, diff=0, seasonal_diff=0, seasonal_period=12):
    """
    Plot a time series alongside its ACF and PACF with Bartlett bounds.
    Includes options for regular and seasonal differencing.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing multiple time series as columns.
    series_name : str
        Column name of the series to plot.
    lags : int
        Number of lags for ACF/PACF.
    diff : int, default=0
        Number of regular differences to apply.
    seasonal_diff : int, default=0
        Number of seasonal differences to apply.
    seasonal_period : int, default=12
        Seasonal period (e.g., 12 for monthly data with yearly seasonality).
    """
    series = df[series_name]

    # Apply differencing
    for _ in range(diff):
        series = series.diff()
    for _ in range(seasonal_diff):
        series = series.diff(seasonal_period)
    series = series.dropna()
    
    print(adf_test(series))
    print(kpss_test(series))

    # Compute ACF/PACF (drop lag 0)
    acf_vals = acf(series, nlags=lags, fft=False)[1:]
    pacf_vals = pacf(series, nlags=lags, method="ywm")[1:]
    lags_range = np.arange(1, len(acf_vals)+1)

    # Bartlett bounds
    n = len(series)
    conf = 1.96 / np.sqrt(n)

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Time series
    axes[0].plot(series.index, series.values, color="steelblue")
    axes[0].set_title(f"Series: {series_name} (diff={diff}, seas_diff={seasonal_diff})")

    # ACF
    axes[1].stem(lags_range, acf_vals, basefmt=" ")
    axes[1].hlines([conf, -conf], xmin=0, xmax=lags, colors="red", linestyles="dashed")
    axes[1].axhline(0, color="black", linewidth=0.8)
    axes[1].set_title("ACF-plot")

    # PACF
    axes[2].stem(lags_range, pacf_vals, basefmt=" ")
    axes[2].hlines([conf, -conf], xmin=0, xmax=lags, colors="red", linestyles="dashed")
    axes[2].axhline(0, color="black", linewidth=0.8)
    axes[2].set_title("PACF-plot")

    plt.tight_layout()
    plt.show()

In [6]:
# functie voor maken van time series cross-validation splits
def make_time_series_splits(series, initial_window, horizon=1, step=1, window_type="expanding"):
    """
    Generate time-series cross-validation splits.

    Parameters
    ----------
    series : pd.Series
        Time series (DateTimeIndex or RangeIndex).
    initial_window : int
        Number of observations in the first training window.
    horizon : int
        Forecast horizon (steps ahead).
    step : int
        How many steps to move the origin each iteration.
    window_type : str
        "expanding"  -> training window grows over time
        "sliding"    -> training window has fixed size

    Returns
    -------
    splits : list of tuples
        Each tuple is (train_idx, test_idx), where each is an array of positions.
    """
    n = len(series)
    splits = []

    start = initial_window

    while start + horizon <= n:
        if window_type == "expanding":
            # Training window always starts at 0
            train_idx = np.arange(0, start)

        elif window_type == "sliding":
            # Fixed-size training window
            train_idx = np.arange(start - initial_window, start)

        else:
            raise ValueError("window_type must be 'expanding' or 'sliding'")

        test_idx = np.arange(start, start + horizon)
        splits.append((train_idx, test_idx))

        start += step

    return splits

In [7]:
# gebruikelijke evaluation metrics voor time series forecasting
def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

def mape(y_true, y_pred):
    return 100 * np.mean(np.abs(y_pred - y_true) / (np.abs(y_true))
    )

In [8]:
# functies voor time series cross-validation
def cv_single_arima(series, order, seasonal_order, splits, metric=rmse):
    """
    Evaluate one ARIMA specification across CV splits.
    
    Returns the average error across folds.
    """
    errors = []

    for train_idx, test_idx in splits:
        train = series.iloc[train_idx]
        test = series.iloc[test_idx]

        model = ARIMA(train, order=order, seasonal_order=seasonal_order)
        fitted = model.fit()

        fc = fitted.forecast(steps=len(test))
        error = metric(test.values, fc.values)
        errors.append(error)

    return np.mean(errors)

def cv_arima_candidates(series, candidates, splits, metric=rmse):
    """
    Evaluate multiple ARIMA candidates.
    
    Parameters
    ----------
    candidates : list of dicts
        Example:
        [
            {"order": (1,1,1), "seasonal_order": (0,1,1,12)},
            {"order": (2,1,0), "seasonal_order": (1,1,0,12)}
        ]
    """
    results = []

    for spec in candidates:
        avg_error = cv_single_arima(
            series,
            order=spec["order"],
            seasonal_order=spec["seasonal_order"],
            splits=splits,
            metric=metric
        )
        results.append({
            "order": spec["order"],
            "seasonal_order": spec["seasonal_order"],
            "cv_error": avg_error
        })

    return pd.DataFrame(results)

In [9]:
# functies voor visualiseren fouten
def horizon_errors(series, order, seasonal_order, H, splits, metric):
    """
    Compute horizon-wise forecast errors for ARIMA using any CV splits.
    
    Parameters
    ----------
    series : pd.Series
    order : tuple
    seasonal_order : tuple
    H : int
        Maximum forecast horizon.
    splits : list of (train_idx, test_idx)
        Output of make_time_series_splits (expanding or sliding).
    """
    errors = {h: [] for h in range(1, H+1)}

    for train_idx, test_idx in splits:
        train = series.iloc[train_idx]
        test = series.iloc[test_idx]

        model = ARIMA(train, order=order, seasonal_order=seasonal_order)
        fitted = model.fit()

        fc = fitted.forecast(steps=H)

        for h in range(1, H+1): 
            e = metric(test.iloc[h-1], fc.iloc[h-1]) 
            errors[h].append(e)

    avg_errors = {h: np.mean(errors[h]) for h in errors}
    return avg_errors, errors

def plot_horizon_boxplot(all_errors):
    """
    Boxplot of horizon-wise forecast errors.
    """
    horizons = list(all_errors.keys())
    data = [all_errors[h] for h in horizons]

    plt.figure(figsize=(10, 5))
    plt.boxplot(data, labels=horizons, showfliers=True)
    plt.xlabel("Forecast horizon (steps ahead)")
    plt.ylabel("")
    plt.title("Forecast error distribution")
    plt.grid(axis='y', alpha=0.3)
    plt.show()

In [13]:
kandidaatmodellen = [
    # Non-seasonal rond jullie beste (4,1,0)
    {"order": (4,1,0), "seasonal_order": (0,0,0,0)},  # beste tot nu toe
    {"order": (3,1,0), "seasonal_order": (0,0,0,0)},
    {"order": (5,1,0), "seasonal_order": (0,0,0,0)},
    {"order": (2,1,0), "seasonal_order": (0,0,0,0)},
    {"order": (6,1,0), "seasonal_order": (0,0,0,0)},

    {"order": (4,1,1), "seasonal_order": (0,0,0,0)},
    {"order": (4,1,2), "seasonal_order": (0,0,0,0)},
    {"order": (3,1,1), "seasonal_order": (0,0,0,0)},
    {"order": (5,1,1), "seasonal_order": (0,0,0,0)},
]

In [14]:
df = pd.read_csv('helpdesk.csv')
df['end_date'] = pd.to_datetime(df['end_date'])

# dagserie met missende dagen op 0
aantallen = (df.groupby('end_date')
               .size()
               .asfreq('D', fill_value=0)
               .rename('aantal'))


In [15]:
series = aantallen  # dit is nu een pd.Series met DateTimeIndex + freq=D
splits = make_time_series_splits(series, initial_window=70, horizon=30, window_type="sliding")
cv_arima_candidates(series, kandidaatmodellen, splits)




Unnamed: 0,order,seasonal_order,cv_error
0,"(4, 1, 0)","(0, 0, 0, 0)",7.696693
1,"(3, 1, 0)","(0, 0, 0, 0)",7.794263
2,"(5, 1, 0)","(0, 0, 0, 0)",7.754671
3,"(2, 1, 0)","(0, 0, 0, 0)",7.938803
4,"(6, 1, 0)","(0, 0, 0, 0)",7.6738
5,"(4, 1, 1)","(0, 0, 0, 0)",7.639246
6,"(4, 1, 2)","(0, 0, 0, 0)",7.674939
7,"(3, 1, 1)","(0, 0, 0, 0)",7.623455
8,"(5, 1, 1)","(0, 0, 0, 0)",7.771287


In [37]:
kandidaatmodellen = [
    {"order": (1,0,1), "seasonal_order": (0,1,1,7)},
    {"order": (0,0,0), "seasonal_order": (1,1,0,7)},
    {"order": (0,0,0), "seasonal_order": (0,1,0,7)},
    
    {"order": (1,0,0), "seasonal_order": (0,1,0,7)},
    {"order": (1,0,0), "seasonal_order": (1,1,0,7)},
    
    {"order": (2,0,1), "seasonal_order": (0,1,0,7)},
    {"order": (1,0,2), "seasonal_order": (0,1,0,7)},
    {"order": (1,0,1), "seasonal_order": (0,1,0,7)},
    
    {"order": (0,0,1), "seasonal_order": (1,1,0,7)},
    {"order": (2,0,1), "seasonal_order": (1,1,0,7)},
    {"order": (0,0,2), "seasonal_order": (0,1,1,7)},

    {"order": (0,0,1), "seasonal_order": (0,1,1,7)},
    {"order": (0,0,2), "seasonal_order": (0,1,1,7)},
    {"order": (1,0,1), "seasonal_order": (0,1,1,7)},
]

In [38]:
series = aantallen
splits = make_time_series_splits(series, initial_window=70, horizon=30, window_type="sliding")
cv_arima_candidates(series, kandidaatmodellen, splits)




Unnamed: 0,order,seasonal_order,cv_error
0,"(1, 0, 1)","(0, 1, 1, 7)",25.180459
1,"(0, 0, 0)","(1, 1, 0, 7)",26.613962
2,"(0, 0, 0)","(0, 1, 0, 7)",31.838496
3,"(1, 0, 0)","(0, 1, 0, 7)",31.882346
4,"(1, 0, 0)","(1, 1, 0, 7)",26.628562
5,"(2, 0, 1)","(0, 1, 0, 7)",31.79339
6,"(1, 0, 2)","(0, 1, 0, 7)",32.260014
7,"(1, 0, 1)","(0, 1, 0, 7)",31.809619
8,"(0, 0, 1)","(1, 1, 0, 7)",26.634469
9,"(2, 0, 1)","(1, 1, 0, 7)",26.837487


In [39]:
kandidaatmodellen = [
    {"order": (1,0,1), "seasonal_order": (0,1,1,30)},
    {"order": (0,0,0), "seasonal_order": (1,1,0,30)},
    {"order": (0,0,0), "seasonal_order": (0,1,0,30)},
    
    {"order": (1,0,0), "seasonal_order": (0,1,0,30)},
    {"order": (1,0,0), "seasonal_order": (1,1,0,30)},
    
    {"order": (2,0,1), "seasonal_order": (0,1,0,30)},
    {"order": (1,0,2), "seasonal_order": (0,1,0,30)},
    {"order": (1,0,1), "seasonal_order": (0,1,0,30)},
    
    {"order": (0,0,1), "seasonal_order": (1,1,0,30)},
    {"order": (2,0,1), "seasonal_order": (1,1,0,30)},
    {"order": (0,0,2), "seasonal_order": (0,1,1,30)},

    {"order": (0,0,1), "seasonal_order": (0,1,1,30)},
    {"order": (0,0,2), "seasonal_order": (0,1,1,30)},
    {"order": (1,0,1), "seasonal_order": (0,1,1,30)},
]

In [40]:
series = aantallen
splits = make_time_series_splits(series, initial_window=70, horizon=30, window_type="sliding")
cv_arima_candidates(series, kandidaatmodellen, splits)




LinAlgError: LU decomposition error.