In [38]:
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from __future__ import division

In [39]:
import warnings
warnings.filterwarnings("ignore")

In [40]:
import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go

In [41]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [42]:
df_sales = pd.read_csv('/Users/lfarias/Desktop/Renner/Renner-ML/input/clean_data/renner_web_treino.csv')

In [43]:
df_sales['data_semana_comercial'] = pd.to_datetime(df_sales['data_semana_comercial'])

In [44]:
#represent month in date field as its first day
df_sales['data_semana_comercial'] = df_sales['data_semana_comercial'].dt.year.astype('str') + '-' + df_sales['data_semana_comercial'].dt.month.astype('str') + '-01'
df_sales['data_semana_comercial'] = pd.to_datetime(df_sales['data_semana_comercial'])

In [45]:
#groupby date and sum the sales
df_sales = df_sales.groupby('data_semana_comercial').venda.sum().reset_index()

In [46]:
df_sales.head()

Unnamed: 0,data_semana_comercial,venda
0,2017-01-01,66
1,2017-02-01,155
2,2017-03-01,232
3,2017-04-01,783
4,2017-05-01,804


In [47]:
#plot monthly sales
plot_data = [
    go.Scatter(
        x=df_sales['data_semana_comercial'],
        y=df_sales['venda'],
    )
]

plot_layout = go.Layout(
        title='Montly Sales'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [63]:
#create a new dataframe to model the difference
df_diff = df_sales.copy()

In [64]:
#add previous sales to the next row
df_diff['prev_sales'] = df_diff['venda'].shift(1)

In [65]:
df_diff.head()

Unnamed: 0,data_semana_comercial,venda,prev_sales
0,2017-01-01,66,
1,2017-02-01,155,66.0
2,2017-03-01,232,155.0
3,2017-04-01,783,232.0
4,2017-05-01,804,783.0


In [66]:
#drop the null values and calculate the difference
df_diff = df_diff.dropna()

In [68]:
df_diff['diff'] = (df_diff['venda'] - df_diff['prev_sales'])

In [69]:
df_diff.head(10)

Unnamed: 0,data_semana_comercial,venda,prev_sales,diff
1,2017-02-01,155,66.0,89.0
2,2017-03-01,232,155.0,77.0
3,2017-04-01,783,232.0,551.0
4,2017-05-01,804,783.0,21.0
5,2017-06-01,432,804.0,-372.0
6,2017-07-01,950,432.0,518.0
7,2017-08-01,607,950.0,-343.0
8,2017-09-01,293,607.0,-314.0
9,2017-10-01,569,293.0,276.0
10,2017-11-01,485,569.0,-84.0


In [70]:
#plot sales diff
plot_data = [
    go.Scatter(
        x=df_diff['data_semana_comercial'],
        y=df_diff['diff'],
    )
]

plot_layout = go.Layout(
        title='Montly Sales Diff'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [71]:
#create new dataframe from transformation from time series to supervised
df_supervised = df_diff.drop(['prev_sales'],axis=1)

In [72]:
#adding lags
for inc in range(1,13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)

In [73]:
df_supervised.head(10)

Unnamed: 0,data_semana_comercial,venda,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
1,2017-02-01,155,89.0,,,,,,,,,,,,
2,2017-03-01,232,77.0,89.0,,,,,,,,,,,
3,2017-04-01,783,551.0,77.0,89.0,,,,,,,,,,
4,2017-05-01,804,21.0,551.0,77.0,89.0,,,,,,,,,
5,2017-06-01,432,-372.0,21.0,551.0,77.0,89.0,,,,,,,,
6,2017-07-01,950,518.0,-372.0,21.0,551.0,77.0,89.0,,,,,,,
7,2017-08-01,607,-343.0,518.0,-372.0,21.0,551.0,77.0,89.0,,,,,,
8,2017-09-01,293,-314.0,-343.0,518.0,-372.0,21.0,551.0,77.0,89.0,,,,,
9,2017-10-01,569,276.0,-314.0,-343.0,518.0,-372.0,21.0,551.0,77.0,89.0,,,,
10,2017-11-01,485,-84.0,276.0,-314.0,-343.0,518.0,-372.0,21.0,551.0,77.0,89.0,,,


In [74]:
df_supervised.tail(6)

Unnamed: 0,data_semana_comercial,venda,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
62,2022-03-01,2160,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0,638.0,-3174.0,2311.0,1956.0,413.0
63,2022-04-01,2796,636.0,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0,638.0,-3174.0,2311.0,1956.0
64,2022-05-01,5292,2496.0,636.0,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0,638.0,-3174.0,2311.0
65,2022-06-01,2886,-2406.0,2496.0,636.0,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0,638.0,-3174.0
66,2022-07-01,3160,274.0,-2406.0,2496.0,636.0,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0,638.0
67,2022-08-01,3449,289.0,274.0,-2406.0,2496.0,636.0,900.0,-354.0,576.0,-1599.0,170.0,1723.0,-2499.0,-1047.0


In [75]:
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)

In [76]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf 

# Define the regression formula
model = smf.ols(formula='diff ~ lag_1', data=df_supervised)

# Fit the regression
model_fit = model.fit()

# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

-0.0023416393662498702


In [77]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf 

# Define the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5', data=df_supervised)

# Fit the regression
model_fit = model.fit()

# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.12350578539817958


In [78]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf 

# Define the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5 + lag_6 + lag_7 + lag_8 + lag_9 + lag_10 + lag_11 + lag_12', data=df_supervised)

# Fit the regression
model_fit = model.fit()

# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.15238555696271816


In [80]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['venda','data_semana_comercial'],axis=1)

In [81]:
#split train and test set
train_set, test_set = df_model[0:-6].values, df_model[-6:].values

In [82]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   diff    55 non-null     float64
 1   lag_1   55 non-null     float64
 2   lag_2   55 non-null     float64
 3   lag_3   55 non-null     float64
 4   lag_4   55 non-null     float64
 5   lag_5   55 non-null     float64
 6   lag_6   55 non-null     float64
 7   lag_7   55 non-null     float64
 8   lag_8   55 non-null     float64
 9   lag_9   55 non-null     float64
 10  lag_10  55 non-null     float64
 11  lag_11  55 non-null     float64
 12  lag_12  55 non-null     float64
dtypes: float64(13)
memory usage: 5.7 KB


In [83]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)

# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [84]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])

In [85]:
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [86]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=100, batch_size=1, verbose=1, shuffle=False)

2023-06-06 19:30:42.236713: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-06 19:30:42.246380: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-06 19:30:42.248823: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/r_/502n_mrx325cyl3pchdhn5380000gn/T/ipykernel_41784/642241112.py", line 5, in <module>
    model.fit(X_train, y_train, nb_epoch=100, batch_size=1, verbose=1, shuffle=False)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
    filtered_tb = _process_traceback_frames(e.__traceback__)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: Model.fit() got an unexpected keyword argument 'nb_epoch'

During handling of the above except

In [87]:
y_pred = model.predict(X_test,batch_size=1)

2023-06-06 19:31:04.760866: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-06 19:31:04.768550: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-06 19:31:04.775373: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [88]:
y_pred

array([[0.04974004],
       [0.06915949],
       [0.05891997],
       [0.08079572],
       [0.06340417],
       [0.10682566]], dtype=float32)

In [89]:
y_test

array([[ 0.17451365],
       [ 0.15566661],
       [ 0.28845261],
       [-0.06150277],
       [ 0.12982331],
       [ 0.13089416]])

In [90]:
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])

In [92]:
#rebuild test set for inverse transform
pred_test_set = []
for index in range(0,len(y_pred)):
    print(np.concatenate([y_pred[index],X_test[index]],axis=1))
    pred_test_set.append(np.concatenate([y_pred[index],X_test[index]],axis=1))

[[ 0.04974004  0.08499018  0.15138319 -0.00389077  0.12239871  0.23326789
  -0.06814207  0.03551669  0.15580939 -0.11633054  0.2752454   0.24990184
   0.13974656]]
[[ 0.06915949  0.17451365  0.08499018  0.15138319 -0.00389077  0.12239871
   0.23326789 -0.06814207  0.03551669  0.15580939 -0.11633054  0.2752454
   0.24990184]]
[[ 0.05891997  0.15566661  0.17451365  0.08499018  0.15138319 -0.00389077
   0.12239871  0.23326789 -0.06814207  0.03551669  0.15580939 -0.11633054
   0.2752454 ]]
[[ 0.08079572  0.28845261  0.15566661  0.17451365  0.08499018  0.15138319
  -0.00389077  0.12239871  0.23326789 -0.06814207  0.03551669  0.15580939
  -0.11633054]]
[[ 0.06340417 -0.06150277  0.28845261  0.15566661  0.17451365  0.08499018
   0.15138319 -0.00389077  0.12239871  0.23326789 -0.06814207  0.03551669
   0.15580939]]
[[ 0.10682566  0.12982331 -0.06150277  0.28845261  0.15566661  0.17451365
   0.08499018  0.15138319 -0.00389077  0.12239871  0.23326789 -0.06814207
   0.03551669]]


In [93]:
pred_test_set[0]

array([[ 0.04974004,  0.08499018,  0.15138319, -0.00389077,  0.12239871,
         0.23326789, -0.06814207,  0.03551669,  0.15580939, -0.11633054,
         0.2752454 ,  0.24990184,  0.13974656]])

In [94]:
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])

In [95]:
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

In [99]:
#create dataframe that shows the predicted sales
result_list = []
sales_dates = list(df_sales[-7:].data_semana_comercial)
act_sales = list(df_sales[-7:].venda)
for index in range(0,len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])
    result_dict['data_semana_comercial'] = sales_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)

In [100]:
df_result

Unnamed: 0,pred_value,data_semana_comercial
0,412,2022-03-01
1,1584,2022-04-01
2,2076,2022-05-01
3,4879,2022-06-01
4,2229,2022-07-01
5,3111,2022-08-01


In [101]:
df_sales.head()

Unnamed: 0,data_semana_comercial,venda
0,2017-01-01,66
1,2017-02-01,155
2,2017-03-01,232
3,2017-04-01,783
4,2017-05-01,804


In [102]:
#merge with actual sales dataframe
df_sales_pred = pd.merge(df_sales,df_result,on='data_semana_comercial',how='left')

In [107]:
df_sales_pred.tail(10)

Unnamed: 0,data_semana_comercial,venda,pred_value
58,2021-11-01,2637,
59,2021-12-01,1038,
60,2022-01-01,1614,
61,2022-02-01,1260,
62,2022-03-01,2160,412.0
63,2022-04-01,2796,1584.0
64,2022-05-01,5292,2076.0
65,2022-06-01,2886,4879.0
66,2022-07-01,3160,2229.0
67,2022-08-01,3449,3111.0


In [105]:
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['data_semana_comercial'],
        y=df_sales_pred['venda'],
        name='Atual'
    ),
        go.Scatter(
        x=df_sales_pred['data_semana_comercial'],
        y=df_sales_pred['pred_value'],
        name='Predito'
    )
    
]

plot_layout = go.Layout(
        title='Predição de Vendas'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)