### 1. Importing all the libraries
### Models used - Random Forest Regressor and XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


!pip install plotly
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots



### 2. Read the CSV

In [2]:
# Load the pre-processed training data.
# NOTE(review): the recorded output of this cell echoes this source line, which
# is how Python reports a warning raised here -- presumably pandas' mixed-dtype
# warning; confirm. low_memory=False makes pandas infer each column's dtype from
# the whole file instead of chunk by chunk, so a column cannot end up holding a
# mixture of types.
df_train = pd.read_csv('train_preprocessed', low_memory=False)

  df_train = pd.read_csv('train_preprocessed')


In [3]:
# Load the pre-processed hold-out data (called "validation" later on).
# NOTE(review): the recorded output of this cell echoes this source line, which
# is how Python reports a warning raised here -- presumably pandas' mixed-dtype
# warning; confirm. low_memory=False makes pandas infer each column's dtype from
# the whole file instead of chunk by chunk, so a column cannot end up holding a
# mixture of types.
df_test = pd.read_csv('test_preprocessed', low_memory=False)

  df_test = pd.read_csv('test_preprocessed')


In [4]:
# Load the frame whose rows are scored in the final forecasting section.
forecasting_df = pd.read_csv(filepath_or_buffer='forecasting_df')

In [5]:
# One boolean per column: True when the column contains at least one missing value.
df_train.isnull().any(axis=0)

Store                        False
DayOfWeek                    False
Date                         False
Sales                        False
Customers                    False
Open                         False
Promo                        False
StateHoliday                 False
SchoolHoliday                False
StoreType                    False
Assortment                   False
CompetitionDistance          False
CompetitionOpenSinceMonth    False
CompetitionOpenSinceYear     False
Promo2                       False
Promo2SinceWeek              False
Promo2SinceYear              False
PromoInterval                False
sales_lag7                    True
sales_lag12                   True
sales_lag14                   True
sales_lag24                   True
encoded_Assortment            True
encoded_StoreType             True
encoded_PromoInterval         True
dtype: bool

### 3.1 Model Running (Random Forest Regressor)

In [6]:
# Convert 'Date' column to datetime format
# Parse the 'Date' column of both frames into pandas datetimes.
for frame in (df_train, df_test):
    frame['Date'] = pd.to_datetime(frame['Date'])

# Separate the target ('Sales') from the remaining columns of each frame.
X_train, y_train = df_train.drop(columns='Sales'), df_train['Sales']
X_test, y_test = df_test.drop(columns='Sales'), df_test['Sales']

In [7]:
# Baseline random forest: fit on rows with complete features and report
# RMSPE / MAE / RMSE / R2 on both the train and the hold-out frame.

# Single source of truth for the feature columns (order is kept as-is because
# the feature-importance table pairs importances with these names by position).
rf_features = ['sales_lag7', 'sales_lag14', 'sales_lag12', 'sales_lag24',
               'DayOfWeek', 'encoded_StoreType', 'encoded_Assortment', 'Promo']

# Drop rows with any missing feature, then pick the matching targets by index label.
X_train_all = df_train[rf_features].dropna()
y_train = df_train.loc[X_train_all.index, 'Sales']
X_test_all = df_test[rf_features].dropna()
y_test = df_test.loc[X_test_all.index, 'Sales']

# Initialize the model (seeded so the fit is reproducible)
model = RandomForestRegressor(n_estimators=100, max_features=0.5, random_state=42)
# Fit the model
model.fit(X_train_all, y_train)

# Make predictions
train_preds = model.predict(X_train_all)
test_preds = model.predict(X_test_all)

# Calculate RMSPE (in percent); the denominator is clipped to at least 1 so a
# zero target cannot cause a division by zero.
train_rmspe = np.sqrt(np.mean(np.square((y_train - train_preds) / np.maximum(1, y_train)))) * 100
test_rmspe = np.sqrt(np.mean(np.square((y_test - test_preds) / np.maximum(1, y_test)))) * 100

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

print(f'RMSPE on train set: {train_rmspe}%')
print(f'RMSPE on test set: {test_rmspe}%')
print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'R2 on train set: {train_r2}')
print(f'R2 on test set: {test_r2}')

RMSPE on train set: 10.218785919573039%
RMSPE on test set: 22.707318484529974%
MAE on train set: 285.28869983344674
MAE on test set: 940.9397651420045

RMSE on train set: 416.3735632052112
RMSE on test set: 1350.7092240098104

R2 on train set: 0.9810491512430659
R2 on test set: 0.8055576195333052


In [8]:
# Pair each feature name with the importance reported by the fitted model,
# then display the table ordered from most to least important.
df_imp = pd.DataFrame({
    'Feature': X_train_all.columns,
    'Importance': model.feature_importances_,
})
df_imp.sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
2,sales_lag12,0.382531
3,sales_lag24,0.271131
1,sales_lag14,0.117087
0,sales_lag7,0.10918
7,Promo,0.073899
4,DayOfWeek,0.032207
5,encoded_StoreType,0.008842
6,encoded_Assortment,0.005123


### 3.2 Model Running (XGBoost)

In [9]:
# XGBoost baseline: fit on rows with complete features and report
# RMSPE / MAE / RMSE / R2 / adjusted R2 on the train and hold-out frames.

# Single source of truth for the feature columns used by this model.
xgb_features = ['sales_lag7', 'sales_lag14', 'sales_lag12', 'sales_lag24', 'DayOfWeek', 'Promo']

# Drop rows with any missing feature, then pick the matching targets by index label.
X_train = df_train[xgb_features].dropna()
y_train = df_train.loc[X_train.index, 'Sales']
X_test = df_test[xgb_features].dropna()
y_test = df_test.loc[X_test.index, 'Sales']

# Initialize the XGBoost model (seeded so the fit is reproducible)
model = XGBRegressor(n_estimators=100, max_depth=10, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Calculate RMSPE (in percent); the denominator is clipped to at least 1 so a
# zero target cannot cause a division by zero.
train_rmspe = np.sqrt(np.mean(np.square((y_train - train_preds) / np.maximum(1, y_train)))) * 100
test_rmspe = np.sqrt(np.mean(np.square((y_test - test_preds) / np.maximum(1, y_test)))) * 100

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Calculate Adjusted R2 scores: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of rows scored and p = number of feature columns.
n_train = len(y_train)
p_train = X_train.shape[1]
train_adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p_train - 1)

n_test = len(y_test)
p_test = X_test.shape[1]
test_adj_r2 = 1 - (1 - test_r2) * (n_test - 1) / (n_test - p_test - 1)

print(f'RMSPE on train set: {train_rmspe}%')
print(f'RMSPE on test set: {test_rmspe}%')
print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'R2 on train set: {train_r2}')
print(f'R2 on test set: {test_r2}')
print(f'Adjusted R2 on train set: {train_adj_r2}')
print(f'Adjusted R2 on test set: {test_adj_r2}\n')

RMSPE on train set: 18.258539093440525%
RMSPE on test set: 23.77416715569446%
MAE on train set: 577.3570774274503
MAE on test set: 989.3784019194981

RMSE on train set: 808.2582215736248
RMSE on test set: 1426.6461266534486

R2 on train set: 0.9285892751360765
R2 on test set: 0.7830799373530548
Adjusted R2 on train set: 0.9285869042227193
Adjusted R2 on test set: 0.7830722297096329



### 4. Hyperparameter Tuning (RFR)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per random-forest hyperparameter.
param_grid = {
    'n_estimators': [50, 100],
    'max_features': np.arange(0.1, 1, 0.5),
    'max_depth': [10, 15],
}

# Size of the full grid: the product of the number of candidates per parameter.
combs = 1
for name in param_grid:
    combs *= len(param_grid[name])
print(combs, "parameter combinations being tested by Random Search:")
param_grid

In [None]:
# Three-fold time-series splitter used as the cross-validation scheme of the search.
# NOTE(review): this presumes the rows passed to the search are in chronological
# order -- confirm.
tscv = TimeSeriesSplit(3)

In [None]:
# Randomized hyperparameter search for the random forest, cross-validated with
# the time-series splitter. Both the estimator and the sampler are seeded so
# the selected parameters and scores are reproducible across runs, matching
# the random_state=42 used by the other model cells.
# NOTE(review): X_train / y_train / X_test / y_test are whatever the most
# recently executed modelling cell left behind -- confirm they hold the
# intended feature set before trusting the result.
rs = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=tscv,
    n_jobs=-1,
    n_iter=8,
    random_state=42,
)
rs.fit(X_train, y_train)
print("Best parameters found:", rs.best_params_)
print("Mean CV score of best parameters:", rs.best_score_)
# Before calculating the score, the model is refit using all training data.
print("Score of best parameters on test data:", rs.score(X_test, y_test))

Refitting the Random Forest model with the adjusted hyperparameters

In [10]:
# Depth-limited random forest (max_depth=10) with 'Store' added as a feature:
# fit on rows with complete features and report RMSPE / MAE / RMSE / R2 /
# adjusted R2 on the train and hold-out frames.

# Single source of truth for the feature columns used by this model.
tuned_features = ['Store', 'sales_lag7', 'sales_lag14', 'sales_lag12', 'sales_lag24', 'DayOfWeek', 'Promo']

# Drop rows with any missing feature, then pick the matching targets by index label.
X_train = df_train[tuned_features].dropna()
y_train = df_train.loc[X_train.index, 'Sales']
X_test = df_test[tuned_features].dropna()
y_test = df_test.loc[X_test.index, 'Sales']

# Initialize the model (seeded so the fit is reproducible)
model = RandomForestRegressor(n_estimators=100, max_features=0.5, max_depth=10, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Calculate RMSPE (in percent); the denominator is clipped to at least 1 so a
# zero target cannot cause a division by zero.
train_rmspe = np.sqrt(np.mean(np.square((y_train - train_preds) / np.maximum(1, y_train)))) * 100
test_rmspe = np.sqrt(np.mean(np.square((y_test - test_preds) / np.maximum(1, y_test)))) * 100

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Calculate Adjusted R2 scores: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of rows scored and p = number of feature columns.
n_train = len(y_train)
p_train = X_train.shape[1]
train_adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p_train - 1)

n_test = len(y_test)
p_test = X_test.shape[1]
test_adj_r2 = 1 - (1 - test_r2) * (n_test - 1) / (n_test - p_test - 1)


print(f'RMSPE on train set: {train_rmspe}%')
print(f'RMSPE on test set: {test_rmspe}%')
print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'R2 on train set: {train_r2}')
print(f'R2 on test set: {test_r2}')
print(f'Adjusted R2 on train set: {train_adj_r2}')
print(f'Adjusted R2 on test set: {test_adj_r2}\n')

RMSPE on train set: 26.65517296971346%
RMSPE on test set: 22.85234575906463%
MAE on train set: 804.089070337907
MAE on test set: 924.7570524937908

RMSE on train set: 1134.0313216370778
RMSE on test set: 1328.9793158057792

R2 on train set: 0.8594233210159308
R2 on test set: 0.8117635857708128
Adjusted R2 on train set: 0.8594178758049208
Adjusted R2 on test set: 0.8117557825320374



Re-running the feature importance analysis on the model after hyperparameter tuning

In [11]:
# Pair each feature name with the importance reported by the fitted model,
# then display the table ordered from most to least important.
df_imp = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_,
})
df_imp.sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
3,sales_lag12,0.37555
4,sales_lag24,0.287306
1,sales_lag7,0.120307
6,Promo,0.095766
2,sales_lag14,0.094611
5,DayOfWeek,0.023417
0,Store,0.003042


### 5. Making Predictions on the Training Dataset

In [12]:
# Score every row of the training frame with the current `model` and store the
# result in a new 'predicted_sales' column.

# Replace missing values with 0 in both frames (in place).
# NOTE(review): the modelling cells fit only on rows whose features were all
# present (dropna), so rows that get a 0 here are scored on inputs the model
# did not see during fitting -- their predictions should be read with care.
# After this fill, re-running an earlier dropna()-based cell will no longer
# drop those rows.
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# Predict on the *training* frame. The result is kept under its own name so
# the hold-out predictions stored in `test_preds` are not overwritten with
# train-frame values.
train_frame_preds = model.predict(
    df_train[['Store', 'sales_lag7', 'sales_lag14', 'sales_lag12', 'sales_lag24', 'DayOfWeek', 'Promo']]
)

# Add the predictions to the DataFrame
df_train['predicted_sales'] = train_frame_preds

# Show date, store, actual sales and predicted sales side by side
print(df_train[['Date', 'Store', 'Sales', 'predicted_sales']])


             Date  Store  Sales  predicted_sales
0      2013-01-01     85   4220      2828.277336
1      2013-01-01    259   6851      2803.372613
2      2013-01-01    262  17267      2803.372613
3      2013-01-01    274   3102      2803.372613
4      2013-01-01    335   2401      2855.116632
...           ...    ...    ...              ...
207479 2013-08-13    670   9071      9889.046206
207480 2013-08-13    671   5335      7443.007870
207481 2013-08-13    672  10309     11113.808939
207482 2013-08-13    673   7482      8741.900582
207483 2013-08-13    674   9229      8805.707838

[207484 rows x 4 columns]


### 6. Plotting Predicted Sales for the Training Dataset

In [13]:
# Daily mean of actual and predicted sales over all rows of the training frame.
train_result_agg = (
    df_train.set_index('Date')[['Sales', 'predicted_sales']]
    .resample('D')
    .mean()
    .reset_index()
)

In [14]:
# Line chart of the daily mean actual sales ...
fig = px.line(
    train_result_agg,
    x='Date',
    y='Sales',
    title='Actual vs Prediction Sales on Train dataset',
)
# ... with the daily mean predicted sales overlaid as a second line.
fig.add_scatter(
    x=train_result_agg['Date'],
    y=train_result_agg['predicted_sales'],
    mode='lines',
    name='predicted',
)

# Fixed canvas size for the rendered figure
fig.update_layout(width=1000, height=350)

fig.show()

In [15]:
# Small multiples: actual vs predicted sales per store on the training frame.

# One row per (store, day) with the mean actual and predicted sales.
store_train = df_train.groupby(['Store', 'Date'])[['Sales', 'predicted_sales']].mean().reset_index()
store_df = store_train

# 18 x 3 grid = 54 panels, one store per panel.
n_rows, n_cols = 18, 3

# Take the store ids for BOTH the panel titles and the plotted data from the
# same list. Previously the titles came from the stores present in the data
# while the panels were filled by counting 1, 2, 3, ..., so any gap in the
# store ids put one store's data under another store's title.
plot_stores = store_train['Store'].unique()[:n_rows * n_cols]

fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=[f'Store {store}' for store in plot_stores])

for panel, store in enumerate(plot_stores):
    # Row-major placement, 1-based as plotly expects.
    grid_row, grid_col = divmod(panel, n_cols)
    store_rows = store_df[store_df['Store'] == store]

    # Build the two lines (actual, predicted) with plotly express, then move
    # the traces into the matching cell of the subplot grid.
    px_fig = px.line(store_rows, x='Date', y='Sales')
    px_fig.add_scatter(x=store_rows['Date'], y=store_rows['predicted_sales'], mode='lines')

    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Stores on Train dataset')

fig.show()


### --------------------------------------------------------------------------------

### 7. Making Predictions on the Validation Dataset

In [16]:
# Replace missing values with 0 in both frames (in place).
for frame in (df_train, df_test):
    frame.fillna(0, inplace=True)

# Score every row of the hold-out frame with the current `model`.
validation_inputs = df_test[['Store','sales_lag7','sales_lag14','sales_lag12', 'sales_lag24','DayOfWeek','Promo']]
test_preds = model.predict(validation_inputs)

# Keep the predictions next to the actual values.
df_test['predicted_sales'] = test_preds

# Show date, actual sales, store and predicted sales side by side.
print(df_test[['Date', 'Sales','Store', 'predicted_sales']])


             Date  Sales  Store  predicted_sales
0      2015-01-30   8120    627      6775.940032
1      2015-01-30  11129    628      7106.208716
2      2015-01-30   6175    629      7995.053752
3      2015-01-30   7505    630      7567.960476
4      2015-01-30   7192    631      5657.013408
...           ...    ...    ...              ...
168863 2015-07-31   5723   1111      5008.385533
168864 2015-07-31   9626   1112      9218.097595
168865 2015-07-31   7289   1113      7678.181369
168866 2015-07-31  27508   1114     23139.074885
168867 2015-07-31   8680   1115      7745.044657

[168868 rows x 4 columns]


In [17]:
# Daily mean of actual and predicted sales over all rows of the hold-out frame.
test_result_agg = (
    df_test.set_index('Date')[['Sales', 'predicted_sales']]
    .resample('D')
    .mean()
    .reset_index()
)

In [18]:
# Line chart of the daily mean actual sales ...
fig2 = px.line(
    test_result_agg,
    x='Date',
    y='Sales',
    title='Actual vs Prediction Sales on Validation set',
)
# ... with the daily mean predicted sales overlaid as a second line.
fig2.add_scatter(
    x=test_result_agg['Date'],
    y=test_result_agg['predicted_sales'],
    mode='lines',
    name='predicted',
)

# Fixed canvas size for the rendered figure
fig2.update_layout(width=1000, height=350)

fig2.show()


In [19]:
# Small multiples: actual vs predicted sales per store on the validation frame.

# One row per (store, day) with the mean actual and predicted sales.
store_validation = df_test.groupby(['Store', 'Date'])[['Sales', 'predicted_sales']].mean().reset_index()
store_df = store_validation

# 18 x 3 grid = 54 panels, one store per panel.
n_rows, n_cols = 18, 3

# Take the store ids for BOTH the panel titles and the plotted data from the
# same list. Previously the titles came from the stores present in the data
# while the panels were filled by counting 1, 2, 3, ..., so any gap in the
# store ids put one store's data under another store's title.
plot_stores = store_validation['Store'].unique()[:n_rows * n_cols]

fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=[f'Store {store}' for store in plot_stores])

for panel, store in enumerate(plot_stores):
    # Row-major placement, 1-based as plotly expects.
    grid_row, grid_col = divmod(panel, n_cols)
    store_rows = store_df[store_df['Store'] == store]

    # Build the two lines (actual, predicted) with plotly express, then move
    # the traces into the matching cell of the subplot grid.
    px_fig = px.line(store_rows, x='Date', y='Sales')
    px_fig.add_scatter(x=store_rows['Date'], y=store_rows['predicted_sales'], mode='lines')

    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=4000, width=2000, title_text='Sales Prediction by Stores on Validation dataset')

fig.show()


### 8. Making Predictions on the Test Dataset

In [20]:
# Parse the 'Date' column into pandas datetimes.
forecasting_df['Date'] = forecasting_df['Date'].pipe(pd.to_datetime)

# Replace every missing value in the frame with 0 (in place).
# NOTE(review): this fills ALL columns, 'Sales' included, so any 0.0 shown as
# "actual" sales below may simply be a filled-in missing value -- confirm.
forecasting_df.fillna(value=0, inplace=True)

# Score every row of the forecasting frame with the current `model`.
forecasting_inputs = forecasting_df[['Store','sales_lag7','sales_lag14','sales_lag12', 'sales_lag24','DayOfWeek','Promo']]
forecasting_preds = model.predict(forecasting_inputs)

# Keep the predictions next to the other columns.
forecasting_df['predicted_sales'] = forecasting_preds

# Show date, sales, store and predicted sales side by side.
print(forecasting_df[['Date', 'Sales','Store', 'predicted_sales']])


            Date  Sales  Store  predicted_sales
0     2015-08-03    0.0      1      6473.815472
1     2015-08-10    0.0      1      4087.545771
2     2015-08-17    0.0      1      6310.527660
3     2015-08-24    0.0      1      4208.019686
4     2015-08-31    0.0      1      6104.395770
...          ...    ...    ...              ...
41083 2015-08-16    0.0   1115      1703.118095
41084 2015-08-23    0.0   1115      1703.118095
41085 2015-08-30    0.0   1115      1703.118095
41086 2015-09-06    0.0   1115      1703.118095
41087 2015-09-13    0.0   1115      1703.118095

[41088 rows x 4 columns]


In [21]:
# Line chart of the 'Sales' column of the forecasting frame against 'Date'.
# Only 'Sales' is drawn here; the predicted series is not part of this figure.
figMV = px.line(
    data_frame=forecasting_df,
    x='Date',
    y=['Sales'],
    title='Actual vs Prediction Sales on Forecasting dataset',
)
figMV.show()


In [22]:
# Daily mean of the 'Sales' and 'predicted_sales' columns over all rows of the forecasting frame.
forecast_result_agg = (
    forecasting_df.set_index('Date')[['Sales', 'predicted_sales']]
    .resample('D')
    .mean()
    .reset_index()
)

In [23]:
# Line chart of the daily mean 'Sales' ...
fig3 = px.line(
    forecast_result_agg,
    x='Date',
    y='Sales',
    title='Actual vs Prediction Sales on Forecasting dataset',
)
# ... with the daily mean predicted sales overlaid as a second line.
fig3.add_scatter(
    x=forecast_result_agg['Date'],
    y=forecast_result_agg['predicted_sales'],
    mode='lines',
    name='predicted',
)

# Fixed canvas size for the rendered figure
fig3.update_layout(width=1000, height=350)
fig3.show()


In [24]:
# Small multiples: 'Sales' vs predicted sales per store on the forecasting frame.

# One row per (store, day) with the mean 'Sales' and predicted sales.
store_forecast = forecasting_df.groupby(['Store', 'Date'])[['Sales', 'predicted_sales']].mean().reset_index()
store_df = store_forecast

# 18 x 3 grid = 54 panels, one store per panel.
n_rows, n_cols = 18, 3

# Take the store ids for BOTH the panel titles and the plotted data from the
# same list. Previously the titles came from the stores present in the data
# while the panels were filled by counting 1, 2, 3, ..., so any gap in the
# store ids put one store's data under another store's title.
plot_stores = store_forecast['Store'].unique()[:n_rows * n_cols]

fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=[f'Store {store}' for store in plot_stores])

for panel, store in enumerate(plot_stores):
    # Row-major placement, 1-based as plotly expects.
    grid_row, grid_col = divmod(panel, n_cols)
    store_rows = store_df[store_df['Store'] == store]

    # Build the two lines ('Sales', predicted) with plotly express, then move
    # the traces into the matching cell of the subplot grid.
    px_fig = px.line(store_rows, x='Date', y='Sales')
    px_fig.add_scatter(x=store_rows['Date'], y=store_rows['predicted_sales'], mode='lines')

    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Stores on Forecasting dataset')

fig.show()


# END OF CODE