# 0. Import Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.read_csv(r"C:\Users\alexs\Documents\TFM_MBD\TFM_MBD_2024_AlexSerra\preprocessed_data\other_europe.csv")

# 1. Defining train, validation, test

In [3]:
train_lim = int(df.shape[0]*0.7)
df_train = df.iloc[:train_lim]
df_test = df.iloc[train_lim:]

In [4]:
train_lim_meta = int(df_train.shape[0]*0.6)
df_train_meta = df_train.iloc[:train_lim_meta]
df_validation_meta = df_train.iloc[train_lim_meta:]

Let's define general datasets:

- df_train --> data that we are going to use to train our algorithms (prophet, autoarima, ....)
- df_test --> data that only will be used to evaluate scores of how the algorithm work.

Now, we also defined "meta" dataset. This data will be used to train the meta learner:

- df_train_meta --> son algorithm (prophet, autoarima,...) will be trained with this reduced amount of the train dataset.
- df_validation_meta --> this data will be used, after having son algorithms trained, to train the meta learner. For example, for a linear regression, it will be used to obtain the coefficients.

Once we have the metalearner trained, son algorithms will be trained with full df_train. And then using the metalearner to obtain final predictions.

# 2. Prophet

In [5]:
from prophet import Prophet

In [6]:
model_prophet = Prophet()

In [7]:
model_prophet.fit(df_train)

18:53:09 - cmdstanpy - INFO - Chain [1] start processing
18:53:09 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2b8b0135190>

In [8]:
forecast_prophet = model_prophet.predict(df_test[["ds"]])["yhat"].values

In [9]:
df_final_predictions= pd.DataFrame()

In [10]:
df_final_predictions["ds"] = df_test["ds"]

In [11]:
df_final_predictions ["prophet"] = forecast_prophet

In [12]:
def calculate_mape(actual, predicted):
    actual, predicted = np.array(actual), np.array(predicted)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [13]:
mape=calculate_mape(df_test["y"],forecast_prophet)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.35%


# 3. Prophet adding regressors

In [14]:
model_prophet_multiva = Prophet()

In [15]:
for e in df_train.columns[2:]:
    model_prophet_multiva.add_regressor(e)

In [16]:
model_prophet_multiva.fit(df_train)

18:53:09 - cmdstanpy - INFO - Chain [1] start processing
18:53:10 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2b8b03e3c10>

In [17]:
forecast_prophet_multiva = model_prophet_multiva.predict(df_test.drop(columns="y"))["yhat"].values

In [18]:
mape=calculate_mape(df_test["y"],forecast_prophet_multiva)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.87%


In [19]:
df_final_predictions["prophet_multiva"] = forecast_prophet_multiva

# 4. Autoarima

In [20]:
from pmdarima import auto_arima

In [21]:
df_train_arima = df_train.copy()
df_test_arima = df_test.copy()

df_train_arima.set_index('ds', inplace=True)
df_train_arima = df_train_arima[["y"]]

df_test_arima.set_index('ds', inplace=True)
df_test_arima = df_test_arima[["y"]]

In [22]:
# Train AutoARIMA model
model_autoarima = auto_arima(df_train_arima, 
                   seasonal=True,  # Change to True if you want to fit a seasonal ARIMA

                   stepwise=True,   # Set to False to perform a more exhaustive search
                   trace=True)      # Set to True to see the search progress

# Print the best model parameters
print(model_autoarima.summary())

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=268.565, Time=0.13 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=325.069, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=278.007, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=292.544, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=646.754, Time=0.01 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=267.768, Time=0.08 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=290.754, Time=0.04 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=265.856, Time=0.07 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=267.825, Time=0.16 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=274.014, Time=0.08 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=272.651, Time=0.04 sec

Best model:  ARIMA(1,0,1)(0,0,0)[0] intercept
Total fit time: 0.740 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  128
Model: 

In [23]:
forecast_autoarima = model_autoarima.predict(n_periods=df_test_arima.shape[0]).values

In [24]:
df_final_predictions["autoarima"] = forecast_autoarima

In [25]:
mape=calculate_mape(df_test["y"],forecast_autoarima)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.07%


# 5. Ensembling (preparing data)

In [26]:
model_prophet_meta = Prophet()

model_prophet_meta.fit(df_train_meta)

forecast_prophet_meta = model_prophet_meta.predict(df_validation_meta[["ds"]])["yhat"].values

df_meta= pd.DataFrame()

df_meta["ds"] = df_validation_meta["ds"]

df_meta ["prophet"] = forecast_prophet_meta

18:53:12 - cmdstanpy - INFO - Chain [1] start processing
18:53:12 - cmdstanpy - INFO - Chain [1] done processing


In [27]:
model_prophet_multiva_meta = Prophet()

for e in df_train_meta.columns[2:]:
    model_prophet_multiva_meta.add_regressor(e)

model_prophet_multiva_meta.fit(df_train_meta)

forecast_prophet_multiva_meta = model_prophet_multiva_meta.predict(df_validation_meta.drop(columns="y"))["yhat"].values

df_meta["prophet_multiva"] = forecast_prophet_multiva_meta

18:53:12 - cmdstanpy - INFO - Chain [1] start processing
18:53:12 - cmdstanpy - INFO - Chain [1] done processing


In [28]:
df_train_arima_meta = df_train_meta.copy()
df_test_arima_meta = df_validation_meta.copy()

df_train_arima_meta.set_index('ds', inplace=True)
df_train_arima_meta = df_train_arima_meta[["y"]]

df_test_arima_meta.set_index('ds', inplace=True)
df_test_arima_meta = df_test_arima_meta[["y"]]

# Train AutoARIMA model
model_autoarima_meta = auto_arima(df_train_arima, 
                   seasonal=True,  # Change to True if you want to fit a seasonal ARIMA

                   stepwise=True,   # Set to False to perform a more exhaustive search
                   trace=True)      # Set to True to see the search progress

# Print the best model parameters
print(model_autoarima_meta.summary())

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=268.565, Time=0.12 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=325.069, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=278.007, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=292.544, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=646.754, Time=0.01 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=267.768, Time=0.08 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=290.754, Time=0.04 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=265.856, Time=0.06 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=267.825, Time=0.16 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=274.014, Time=0.07 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=272.651, Time=0.04 sec

Best model:  ARIMA(1,0,1)(0,0,0)[0] intercept
Total fit time: 0.713 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  128
Model: 

In [29]:
forecast_autoarima_meta = model_autoarima_meta.predict(n_periods=df_validation_meta.shape[0]).values

df_meta["autoarima"] = forecast_autoarima_meta

In [30]:
df_meta["average_predictions"] = df_meta[['prophet', 'prophet_multiva', 'autoarima']].mean(axis=1)

In [31]:
df_meta["real_values"] = df_validation_meta["y"]

In [32]:
mape=calculate_mape(df_test["y"],forecast_autoarima)

print(f'MAPE: {mape:.2f}%')

MAPE: 21.07%


In [33]:
for e in df_meta.columns[1:-1]:
    mape=calculate_mape(df_meta["real_values"],df_meta[e])
    print(f'MAPE of {e}: {mape:.2f}%')

MAPE of prophet: 27.26%
MAPE of prophet_multiva: 18.22%
MAPE of autoarima: 21.16%
MAPE of average_predictions: 20.45%


# 6. Ensembling (Training Linear Regresion)

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
df_meta = df_meta.reset_index(drop=True)

In [42]:
lr = LinearRegression()

In [51]:
lim=int(df_meta.shape[0]*0.7)
X_train = df_meta[["prophet", "prophet_multiva", "autoarima"]].iloc[:lim]
y_train = df_meta [["real_values"]].iloc[:lim]

X_test=df_meta[["prophet", "prophet_multiva", "autoarima"]].iloc[lim:]
y_test = df_meta [["real_values"]].iloc[lim:]

In [53]:
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [54]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.3515840287877502
R^2 Score: 0.23368631676424279


In [55]:
y_pred = lr.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.39001008221362243
R^2 Score: 0.38343398507475457


# 7. Final Predictions

In [57]:
df_final_predictions["average_predictions"] = df_final_predictions[['prophet', 'prophet_multiva', 'autoarima']].mean(axis=1)

In [59]:
testing_X = df_final_predictions[['prophet', 'prophet_multiva', 'autoarima']]

In [None]:
# We decide to train the algorithm with full data of the validation

In [72]:
X_train = df_meta[["prophet", "prophet_multiva", "autoarima"]]
y_train = df_meta [["real_values"]]
lr.fit(X_train, y_train)

In [73]:
lr_predictions = lr.predict(testing_X)

In [74]:
df_final_predictions["lr_predictions"] = lr_predictions

In [75]:
print('Coefficients:', lr.coef_)
print('Intercept:', lr.intercept_)

Coefficients: [[8.6365826  0.39039091 8.07040303]]
Intercept: [-41.78173911]


In [76]:
for e in df_final_predictions.columns[1:]:
    mape=calculate_mape(df_test["y"],df_final_predictions[e])
    print(f'MAPE of {e}: {mape:.2f}%')

MAPE of prophet: 21.35%
MAPE of prophet_multiva: 21.87%
MAPE of autoarima: 21.07%
MAPE of average_predictions: 17.84%
MAPE of lr_predictions: 289.60%


In [78]:
df_final_predictions

Unnamed: 0,ds,prophet,prophet_multiva,autoarima,average_predictions,lr_predictions
128,2022-06-20,2.643313,2.753302,2.579606,2.65874,2.940777
129,2022-06-27,2.670553,2.420241,2.613922,2.568239,3.32295
130,2022-07-04,2.72076,2.702383,2.644592,2.689245,4.114238
131,2022-07-11,2.700996,3.220187,2.672004,2.864396,4.366915
132,2022-07-18,2.586916,3.249357,2.696504,2.844259,3.590764
133,2022-07-25,2.456163,2.668714,2.718401,2.614426,2.411554
134,2022-08-01,2.41395,2.820286,2.737972,2.657403,2.264093
135,2022-08-08,2.502929,3.168919,2.755464,2.809104,3.309835
136,2022-08-15,2.692913,3.286053,2.771098,2.916688,5.122545
137,2022-08-22,2.940951,2.914187,2.78507,2.880069,7.23234
