In [75]:
!pip install pycaret



In [76]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import imageio
import os
from statsmodels.graphics.tsaplots import plot_acf
import pandas as pd
from pycaret.time_series import *

In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [102]:
# Cargar el archivo Excel
file_path = '/content/drive/MyDrive/Fundamentos/df_pivot_top10.xlsx'  # Cambia esto por la ruta de tu archivo
df = pd.read_excel(file_path)

# Ver las primeras filas del DataFrame para entender su estructura
print(df)

      mes_año  MLA1625102786  MLA1658680930  MLA875271085  MLA896621470  \
0  2022-05-01             81             16            18            23   
1  2022-06-01             59              4             6            14   
2  2022-07-01             65             47            25            25   
3  2022-08-01             72             87             4            36   
4  2022-09-01            142             60            17            32   
5  2022-10-01            265            189            31            47   
6  2022-11-01            349            213            52            50   
7  2022-12-01            386            284            44            58   
8  2023-01-01             71             85            32            83   
9  2023-02-01             83             52            30            59   
10 2023-03-01             62             71            47            94   
11 2023-04-01             54             85            34           110   
12 2023-05-01            

In [103]:
import plotly.express as px
fig = px.line(df, x="mes_año", y=["MLA896621681"], template = 'plotly_dark')
fig.show()

In [104]:
# extract month and year from dates**
df['Mes'] = [i.month for i in df['mes_año']]
df['Año'] = [i.year for i in df['mes_año']]

# create a sequence of numbers
df['Series'] = np.arange(1,len(df)+1)

# drop unnecessary columns and re-arrange
df.drop(['mes_año'], axis=1, inplace=True)
df = df[['Series', 'Año', 'Mes', 'MLA896621681']]

# check the head of the dataset**
df.head()


Unnamed: 0,Series,Año,Mes,MLA896621681
0,1,2022,5,85
1,2,2022,6,72
2,3,2022,7,95
3,4,2022,8,136
4,5,2022,9,168


In [105]:
# split data into train-test set
train = df[df['Año'] < 2024]
test = df[df['Año'] >= 2024]

# check shape
train.shape, test.shape

((20, 4), (8, 4))

In [106]:
print(train.info())
print(test.info())


<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Series        20 non-null     int64
 1   Año           20 non-null     int64
 2   Mes           20 non-null     int64
 3   MLA896621681  20 non-null     int64
dtypes: int64(4)
memory usage: 800.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 20 to 27
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Series        8 non-null      int64
 1   Año           8 non-null      int64
 2   Mes           8 non-null      int64
 3   MLA896621681  8 non-null      int64
dtypes: int64(4)
memory usage: 320.0 bytes
None


In [126]:
# import the regression module
from pycaret.regression import *


In [109]:
# Inicializar el setup sin mezclar los datos para series temporales
s = setup(data=train,
          test_data=test,
          target='MLA896621681',
          fold_strategy='timeseries',
          numeric_features=['Año', 'Mes', 'Series'],
          fold=3,
          transform_target=True,
          session_id=123,
          data_split_shuffle=False,  # Asegurarse de no mezclar los datos
          fold_shuffle=False)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,MLA896621681
2,Target type,Regression
3,Original data shape,"(28, 4)"
4,Transformed data shape,"(28, 4)"
5,Transformed train set shape,"(20, 4)"
6,Transformed test set shape,"(8, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


In [110]:
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,394.6,235147.9333,466.1711,-2.4398,0.7795,0.4403,0.06
gbr,Gradient Boosting Regressor,421.0957,252710.2044,475.0879,-2.9592,0.8172,0.4726,0.1
xgboost,Extreme Gradient Boosting,436.9766,271606.4245,494.7944,-3.354,0.8833,0.4916,0.0733
et,Extra Trees Regressor,438.3861,272900.1698,504.9152,-3.4195,0.8427,0.4826,0.1433
ada,AdaBoost Regressor,438.3888,272980.4363,501.1342,-3.4318,0.8622,0.4878,0.1167
rf,Random Forest Regressor,473.7092,295914.0529,529.6434,-3.851,0.9088,0.5158,0.3067
knn,K Neighbors Regressor,499.6336,315737.7462,550.3764,-3.8236,0.9736,0.5318,0.1033
par,Passive Aggressive Regressor,584.2192,467580.9265,646.8452,-7.5427,1.9412,0.6673,0.0533
en,Elastic Net,615.3607,573676.2588,686.3957,-8.9966,0.9238,0.5998,0.0633
lightgbm,Light Gradient Boosting Machine,755.5878,668780.8176,805.0677,-11.8041,1.4455,0.7393,0.0633


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [111]:
prediction_holdout = predict_model(best);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Decision Tree Regressor,1560.125,2503222.875,1582.1577,-35.1566,2.5593,17.4211


In [112]:
# generate predictions on the original dataset
predictions = predict_model(best, data=df)

# add a date column in the dataset**
predictions['mes_año'] = pd.date_range(start='2022-05-01', end = '2024-08-01', freq = 'MS')

print(predictions)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Decision Tree Regressor,445.75,715206.5357,845.6988,-1.6068,1.368,4.9774


    Series   Año  Mes  MLA896621681  prediction_label    mes_año
0        1  2022    5            85              85.0 2022-05-01
1        2  2022    6            72              72.0 2022-06-01
2        3  2022    7            95              95.0 2022-07-01
3        4  2022    8           136             136.0 2022-08-01
4        5  2022    9           168             168.0 2022-09-01
5        6  2022   10           654             654.0 2022-10-01
6        7  2022   11           742             742.0 2022-11-01
7        8  2022   12          1354            1354.0 2022-12-01
8        9  2023    1           359             359.0 2023-01-01
9       10  2023    2           408             408.0 2023-02-01
10      11  2023    3           623             623.0 2023-03-01
11      12  2023    4           820             820.0 2023-04-01
12      13  2023    5           999             999.0 2023-05-01
13      14  2023    6           956             956.0 2023-06-01
14      15  2023    7    

In [113]:
print(test)

    Series   Año  Mes  MLA896621681
20      21  2024    1           862
21      22  2024    2           511
22      23  2024    3           160
23      24  2024    4            37
24      25  2024    5            34
25      26  2024    6           173
26      27  2024    7           189
27      28  2024    8           377


In [114]:
# line plot
fig = px.line(predictions, x='mes_año', y=["MLA896621681", "prediction_label"], template = 'plotly_dark')

# add a vertical rectange for test-set separation**
fig.add_vrect(x0="2024-01-01", x1="2024-08-01", fillcolor="grey", opacity=0.25, line_width=0)
fig.show()


In [115]:
final_best = finalize_model(best)

In [121]:
future_dates = pd.date_range(start = '2024-09-01', end = '2024-11-01', freq = 'MS')

future_df = pd.DataFrame()

future_df['Mes'] = [i.month for i in future_dates]
future_df['Año'] = [i.year for i in future_dates]
future_df['Series'] = np.arange(28, 28 + len(future_dates))
future_df.head()



Unnamed: 0,Mes,Año,Series
0,9,2024,28
1,10,2024,29
2,11,2024,30


In [122]:
predictions_future = predict_model(final_best, data=future_df)
predictions_future.head()

Unnamed: 0,Mes,Año,Series,prediction_label
0,9,2024,28,377.0
1,10,2024,29,377.0
2,11,2024,30,377.0


In [125]:
concat_df = pd.concat([df,predictions_future], axis=0)
concat_df_i = pd.date_range(start='2022-05-01', end = '2024-11-01', freq = 'MS')
concat_df.set_index(concat_df_i, inplace=True)
fig = px.line(concat_df, x=concat_df.index, y=["MLA896621681", "prediction_label"], template = 'plotly_dark')
fig.show()