# 0. Import Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.read_csv(r"C:\Users\alexs\Documents\TFM_MBD\TFM_MBD_2024_AlexSerra\preprocessed_data\other_europe.csv")

# 1. Defining train, validation, test

In [3]:
train_lim = int(df.shape[0]*0.7)
df_train = df.iloc[:train_lim]
df_test = df.iloc[train_lim:]

In [4]:
train_lim_meta = int(df_train.shape[0]*0.6)
df_train_meta = df_train.iloc[:train_lim_meta]
df_validation_meta = df_train.iloc[train_lim_meta:]

Let's define general datasets:

- df_train --> data that we are going to use to train our algorithms (prophet, autoarima, ....)
- df_test --> data that only will be used to evaluate scores of how the algorithm work.

Now, we also defined "meta" dataset. This data will be used to train the meta learner:

- df_train_meta --> son algorithm (prophet, autoarima,...) will be trained with this reduced amount of the train dataset.
- df_validation_meta --> this data will be used, after having son algorithms trained, to train the meta learner. For example, for a linear regression, it will be used to obtain the coefficients.

Once we have the metalearner trained, son algorithms will be trained with full df_train. And then using the metalearner to obtain final predictions.

# 2. Prophet

In [5]:
from prophet import Prophet

In [6]:
model_prophet = Prophet()

In [7]:
model_prophet.fit(df_train)

16:13:09 - cmdstanpy - INFO - Chain [1] start processing
16:13:09 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x21bdd116340>

In [8]:
forecast_prophet = model_prophet.predict(df_test[["ds"]])["yhat"].values

In [9]:
df_final_predictions= pd.DataFrame()

In [10]:
df_final_predictions["ds"] = df_test["ds"]

In [11]:
df_final_predictions ["prophet"] = forecast_prophet

In [12]:
def calculate_mape(actual, predicted):
    actual, predicted = np.array(actual), np.array(predicted)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [13]:
mape=calculate_mape(df_test["y"],forecast_prophet)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.35%


In [14]:
def prophet_predictions(df_tr, df_te):
    # df_tr should contain ds and y columns
    # df_te should contain ds column.
    m = Prophet()
    m.fit(df_tr)
    fore = m.predict(df_te[["ds"]])["yhat"].values
    return fore

# 3. Prophet adding regressors

In [15]:
model_prophet_multiva = Prophet()

In [16]:
for e in df_train.columns[2:]:
    model_prophet_multiva.add_regressor(e)

In [17]:
model_prophet_multiva.fit(df_train)

16:13:09 - cmdstanpy - INFO - Chain [1] start processing
16:13:10 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x21bdd3c2190>

In [18]:
forecast_prophet_multiva = model_prophet_multiva.predict(df_test.drop(columns="y"))["yhat"].values

In [19]:
mape=calculate_mape(df_test["y"],forecast_prophet_multiva)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.87%


In [20]:
df_final_predictions["prophet_multiva"] = forecast_prophet_multiva

In [21]:
def prophet_predictions_multiva(df_tr, df_te):
    # df_tr should contain ds and y columns
    # df_te should contain ds column.
    m = Prophet()
    for e in df_tr.columns[2:]:
        m.add_regressor(e)
    m.fit(df_tr)
    fore = m.predict(df_te.drop(columns="y"))["yhat"].values
    return fore

# 4. Autoarima

In [22]:
from pmdarima import auto_arima

In [23]:
df_train_arima = df_train.copy()
df_test_arima = df_test.copy()

df_train_arima.set_index('ds', inplace=True)
df_train_arima = df_train_arima[["y"]]

df_test_arima.set_index('ds', inplace=True)
df_test_arima = df_test_arima[["y"]]

In [24]:
# Train AutoARIMA model
model_autoarima = auto_arima(df_train_arima, 
                   seasonal=True,  # Change to True if you want to fit a seasonal ARIMA

                   stepwise=True,   # Set to False to perform a more exhaustive search
                   trace=True)      # Set to True to see the search progress

# Print the best model parameters
print(model_autoarima.summary())

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=268.565, Time=0.13 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=325.069, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=278.007, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=292.544, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=646.754, Time=0.01 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=267.768, Time=0.10 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=290.754, Time=0.06 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=265.856, Time=0.09 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=267.825, Time=0.20 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=274.014, Time=0.09 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=272.651, Time=0.06 sec

Best model:  ARIMA(1,0,1)(0,0,0)[0] intercept
Total fit time: 0.896 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  128
Model: 

In [25]:
forecast_autoarima = model_autoarima.predict(n_periods=df_test_arima.shape[0]).values

In [26]:
df_final_predictions["autoarima"] = forecast_autoarima

In [27]:
mape=calculate_mape(df_test["y"],forecast_autoarima)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.07%


In [28]:
def autoarima_predictions(df_tr, df_te):
    df_tr1=df_tr.copy()
    df_tr1.set_index('ds', inplace=True)
    df_tr1 = df_tr1[["y"]]
    
    df_te1 = df_te.copy()
    df_te1.set_index('ds', inplace=True)
    df_te1 = df_te1[["y"]]
    
    m_autoarima = auto_arima(df_tr1, 
                   seasonal=True,  # Change to True if you want to fit a seasonal ARIMA

                   stepwise=True,   # Set to False to perform a more exhaustive search
                   trace=False)      # Set to True to see the search progress
    
    fore = m_autoarima.predict(n_periods=df_te1.shape[0]).values
    return fore

# 5. Trying Timeseries test split

In [37]:
df_train

Unnamed: 0,ds,y,humidity,temp,gtrends_allergie,gtrends_pollen allergie,gtrends_heuschnupfen,gtrends_pollen,influenza,pollution_Zurich_no2,pollution_Zurich_o3,pollution_Zurich_pm10,pollution_Zurich_pm25,pollution_Zurich_so2
0,2020-01-06,3.506025,81.875606,274.946388,115,0,8,22,221.0,13.942857,5.657143,15.428571,3.000000,0.500000
1,2020-01-13,3.898515,77.805608,275.982323,176,0,36,39,3.0,16.328571,5.485714,12.285714,3.714286,0.885714
2,2020-01-20,4.174013,73.820989,275.575986,140,0,5,27,530.5,14.828571,11.000000,10.714286,2.571429,0.914286
3,2020-01-27,3.568473,80.112456,274.589907,150,0,5,83,911.5,18.314286,2.685714,24.142857,5.000000,0.785714
4,2020-02-03,3.507797,79.792778,279.608504,98,0,32,10,772.0,6.985714,20.357143,6.428571,1.285714,0.371429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,2022-05-16,1.877216,65.656746,292.554001,130,160,210,441,17.0,5.242857,28.657143,13.571429,1.714286,0.300000
124,2022-05-23,2.989111,63.478175,294.610013,134,173,232,326,0.0,5.014286,28.842857,13.428571,1.857143,0.314286
125,2022-05-30,3.002538,64.182540,289.707639,123,43,179,195,10.0,3.385714,28.628571,7.857143,1.000000,0.200000
126,2022-06-06,2.186763,74.434524,292.106534,151,11,70,130,0.0,4.800000,23.842857,11.714286,1.571429,0.257143


In [38]:
from sklearn.model_selection import TimeSeriesSplit

In [49]:
tscv = TimeSeriesSplit()

l_index = []
l_prophet = []
l_prophet_multiva = []
l_autoarima = []
for i, (train_index, test_index) in enumerate(tscv.split(df_train)):
    print(f"Fold {i}:")
    
    train = df_train.iloc[train_index]
    test = df_train.iloc[test_index]
    
    l_prophet.extend(prophet_predictions(train, test))
    l_prophet_multiva.extend(prophet_predictions_multiva(train, test))
    l_autoarima.extend(autoarima_predictions(train, test))
    
    l_index.extend(test_index)

16:22:13 - cmdstanpy - INFO - Chain [1] start processing


Fold 0:


16:22:13 - cmdstanpy - INFO - Chain [1] done processing
16:22:13 - cmdstanpy - INFO - Chain [1] start processing
16:22:14 - cmdstanpy - INFO - Chain [1] done processing
16:22:14 - cmdstanpy - INFO - Chain [1] start processing


Fold 1:


16:22:14 - cmdstanpy - INFO - Chain [1] done processing
16:22:14 - cmdstanpy - INFO - Chain [1] start processing
16:22:14 - cmdstanpy - INFO - Chain [1] done processing
16:22:15 - cmdstanpy - INFO - Chain [1] start processing


Fold 2:


16:22:15 - cmdstanpy - INFO - Chain [1] done processing
16:22:15 - cmdstanpy - INFO - Chain [1] start processing
16:22:16 - cmdstanpy - INFO - Chain [1] done processing
16:22:16 - cmdstanpy - INFO - Chain [1] start processing


Fold 3:


16:22:16 - cmdstanpy - INFO - Chain [1] done processing
16:22:17 - cmdstanpy - INFO - Chain [1] start processing
16:22:17 - cmdstanpy - INFO - Chain [1] done processing
16:22:19 - cmdstanpy - INFO - Chain [1] start processing
16:22:19 - cmdstanpy - INFO - Chain [1] done processing


Fold 4:


16:22:19 - cmdstanpy - INFO - Chain [1] start processing
16:22:19 - cmdstanpy - INFO - Chain [1] done processing


In [51]:
df_timeseries = pd.DataFrame()
df_timeseries["ds"] = df_train.iloc[l_index]["ds"]
df_timeseries.index = l_index
df_timeseries["prophet"] = l_prophet
df_timeseries["prophet_multiva"] = l_prophet_multiva
df_timeseries["autoarima"] = l_autoarima
df_timeseries["real_values"] = df_train.iloc[l_index]["y"]

In [59]:
for e in df_timeseries.columns[1:]:
    mape=calculate_mape(df_timeseries["real_values"],df_timeseries[e])
    print(f'MAPE of {e}: {mape:.2f}%')

MAPE of prophet: 33.96%
MAPE of prophet_multiva: 52.61%
MAPE of autoarima: 27.85%
MAPE of real_values: 0.00%


In [60]:
tscv = TimeSeriesSplit(n_splits=3)

l_index = []
l_prophet = []
l_prophet_multiva = []
l_autoarima = []
for i, (train_index, test_index) in enumerate(tscv.split(df_train)):
    print(f"Fold {i}:")
    
    train = df_train.iloc[train_index]
    test = df_train.iloc[test_index]
    
    l_prophet.extend(prophet_predictions(train, test))
    l_prophet_multiva.extend(prophet_predictions_multiva(train, test))
    l_autoarima.extend(autoarima_predictions(train, test))
    
    l_index.extend(test_index)

df_timeseries = pd.DataFrame()
df_timeseries["ds"] = df_train.iloc[l_index]["ds"]
df_timeseries.index = l_index
df_timeseries["prophet"] = l_prophet
df_timeseries["prophet_multiva"] = l_prophet_multiva
df_timeseries["autoarima"] = l_autoarima
df_timeseries["real_values"] = df_train.iloc[l_index]["y"]

for e in df_timeseries.columns[1:]:
    mape=calculate_mape(df_timeseries["real_values"],df_timeseries[e])
    print(f'MAPE of {e}: {mape:.2f}%')

Fold 0:


16:52:06 - cmdstanpy - INFO - Chain [1] start processing
16:52:06 - cmdstanpy - INFO - Chain [1] done processing
16:52:06 - cmdstanpy - INFO - Chain [1] start processing
16:52:06 - cmdstanpy - INFO - Chain [1] done processing
16:52:07 - cmdstanpy - INFO - Chain [1] start processing


Fold 1:


16:52:07 - cmdstanpy - INFO - Chain [1] done processing
16:52:07 - cmdstanpy - INFO - Chain [1] start processing
16:52:07 - cmdstanpy - INFO - Chain [1] done processing
16:52:08 - cmdstanpy - INFO - Chain [1] start processing


Fold 2:


16:52:08 - cmdstanpy - INFO - Chain [1] done processing
16:52:08 - cmdstanpy - INFO - Chain [1] start processing
16:52:09 - cmdstanpy - INFO - Chain [1] done processing


MAPE of prophet: 47.53%
MAPE of prophet_multiva: 49.66%
MAPE of autoarima: 27.08%
MAPE of real_values: 0.00%
