### 1. Importing all the libraries
### Model used - Random Forest Regressor

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

!pip install plotly
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

### 2. Read the CSV

In [None]:
# Mount Google Drive so the CSVs stored there can be read.
# `google.colab` only exists inside Colab; when the notebook runs elsewhere the
# import fails, so the mount is skipped instead of aborting "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available - skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

In [None]:
# Load the preprocessed training set.
# The working-directory copy is preferred; if it is missing, the Google Drive
# copy is used so this cell does not fail on a fresh Colab kernel.
train_candidates = [
    Path('train_preprocessed'),
    Path('/content/drive/MyDrive/_COURSES_/PBA/Preprocessing/train_preprocessed'),
]
# Fall back to the preferred path when nothing exists so read_csv reports it.
train_path = next((p for p in train_candidates if p.exists()), train_candidates[0])
df_train = pd.read_csv(train_path)

# Per-column flag: does the column contain any missing value?
df_train.isna().any()

In [None]:
# Load the preprocessed training set.
# The Google Drive copy is preferred; if Drive is not mounted (e.g. a local
# run), the copy in the working directory is used instead.
train_candidates = [
    Path('/content/drive/MyDrive/_COURSES_/PBA/Preprocessing/train_preprocessed'),
    Path('train_preprocessed'),
]
# Fall back to the preferred path when nothing exists so read_csv reports it.
train_path = next((p for p in train_candidates if p.exists()), train_candidates[0])
df_train = pd.read_csv(train_path)

# Per-column flag: does the column contain any missing value?
df_train.isna().any()

In [None]:
# Peek at the last rows of the training set (tail() shows 5 rows by default)
df_train.tail()

In [None]:
# Load the preprocessed test set.
# The Google Drive copy is preferred; if Drive is not mounted (e.g. a local
# run), the copy in the working directory is used instead.
test_candidates = [
    Path('/content/drive/MyDrive/_COURSES_/PBA/Preprocessing/test_preprocessed'),
    Path('test_preprocessed'),
]
# Fall back to the preferred path when nothing exists so read_csv reports it.
test_path = next((p for p in test_candidates if p.exists()), test_candidates[0])
df_test = pd.read_csv(test_path)

# Per-column flag: does the column contain any missing value?
df_test.isna().any()

In [None]:
# Load the preprocessed test set.
# The working-directory copy is preferred; if it is missing, the Google Drive
# copy is used so this cell does not fail on a fresh Colab kernel.
test_candidates = [
    Path('test_preprocessed'),
    Path('/content/drive/MyDrive/_COURSES_/PBA/Preprocessing/test_preprocessed'),
]
# Fall back to the preferred path when nothing exists so read_csv reports it.
test_path = next((p for p in test_candidates if p.exists()), test_candidates[0])
df_test = pd.read_csv(test_path)

# Per-column flag: does the column contain any missing value?
df_test.isna().any()

In [None]:
# Peek at the last rows of the test set (tail() shows 5 rows by default)
df_test.tail()

In [None]:
# Peek at the first rows of the training set (head() shows 5 rows by default)
df_train.head()

### 3. Model Running

In [None]:
# Parse the 'date' column of both frames into datetime values
for frame in (df_train, df_test):
    frame['date'] = pd.to_datetime(frame['date'])

# 'sales' is the target; every remaining column stays on the feature side
X_train, y_train = df_train.drop(columns='sales'), df_train['sales']
X_test, y_test = df_test.drop(columns='sales'), df_test['sales']

Summary of the steps below:

1. We choose 'special_offer', 'day of week', 'store_nbr', 'encoded_product_type', 'sales_lag365', 'sales_lag21' and 'rolling_means7' as the predictors of sales.

2. Set the training and test dataframe (usually we will split the datasets in this phase, yet we already did that during EDA phase).

3. Initialize the model. We initialize the RandomForestRegressor (RFR) model with 100 estimators, a max_features of 0.1, and a random state of 42 for reproducibility. These values were chosen with a hyperparameter search (a tool for finding the best hyperparameters; see the Hyperparameter Tuning section); modelling and parameter tuning is a back-and-forth process.

4. Fits the model: The model is trained using the fit method on the training data.

5. Makes predictions: The trained model is used to make predictions on both the training and testing data.

6. The Mean Absolute Error (MAE) is calculated for both the training and testing predictions. This is a measure of the differences between the predicted and actual values.

BRIEF CONCLUSION! Based on the MAE, RMSE and adjusted R² scores below, the model demonstrates relatively good performance: the relatively low MAE values on the training and test datasets suggest that it captures a significant portion of the variance in the data.

In [None]:
# Predictors of the baseline model. Rows with a missing value in any of them
# are dropped, and the targets are re-selected by index so X and y stay aligned.
feature_cols_all = ['special_offer', 'day of week', 'store_nbr', 'encoded_product_type',
                    'sales_lag365', 'sales_lag21', 'rolling_means7']
X_train_all = df_train[feature_cols_all].dropna()
y_train = df_train.loc[X_train_all.index, 'sales']
X_test_all = df_test[feature_cols_all].dropna()
y_test = df_test.loc[X_test_all.index, 'sales']

# Initialize the model (fixed random_state keeps the forest reproducible)
model = RandomForestRegressor(n_estimators=100, max_features=0.1, random_state=42)

# Fit the model
model.fit(X_train_all, y_train)

# Make predictions
train_preds = model.predict(X_train_all)
test_preds = model.predict(X_test_all)

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Calculate Adjusted R2 scores: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of rows and p = number of predictors
n_train = len(y_train)
p_train = X_train_all.shape[1]
train_adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p_train - 1)

n_test = len(y_test)
p_test = X_test_all.shape[1]
test_adj_r2 = 1 - (1 - test_r2) * (n_test - 1) / (n_test - p_test - 1)

print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'Adjusted R2 on train set: {train_adj_r2}')
print(f'Adjusted R2 on test set: {test_adj_r2}\n')

The code below performs one operation:

1. Sorting the feature by its importance to the model performance, resulting that rolling_means7 as the most important feature, followed by sales_lag21, sales_lag365, special_offer, and day_of_week.

BRIEF CONCLUSION!
rolling_means7 is the most important feature, suggesting that the rolling mean (moving average) over a 7-day period has the most substantial influence on predicting sales (in general, across all stores). Moreover, sales lagged by 3 weeks also seem to be a significant factor in determining future sales patterns, meaning that the sales pattern from three weeks earlier is one of the most important trends.

In [None]:
# Pair every predictor of the baseline model with its importance score
importance_columns = {
    'Feature': X_train_all.columns,
    'Importance': model.feature_importances_,
}
df_imp = pd.DataFrame(importance_columns)

# Show the table with the most important feature first (df_imp itself stays unsorted)
df_imp.sort_values('Importance', ascending=False)

### 4. Hyperparameter Tuning

Code below running 1 operations:

1. RandomizedSearchCV: this is a parameter-tuning operation. As explained before, it is usually run after the first model has been created, when we are not yet sure whether the previous parameters were already the best ones. Sampling from the 8 parameter combinations defined below (2 × 2 × 2), the search recommends setting the RFR model to {'max_features': 0.1, 'n_estimators': 100, 'max_depth': 15}.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter explored by the search
param_grid = {
    'n_estimators': [50, 100],
    'max_features': np.arange(0.1, 1, 0.5),
    'max_depth': [10, 15],
}

# Size of the full grid = product of the number of candidates per parameter
combs = int(np.prod([len(candidates) for candidates in param_grid.values()]))
print(combs, "parameter combinations being tested by Random Search:")
param_grid

In [None]:
# Time-ordered cross-validation splitter with three folds.
# NOTE(review): the splits follow row order, so this presumably relies on the
# training frame being sorted by date - confirm.
N_CV_SPLITS = 3
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)

In [None]:
# Run the search on the numeric, NaN-free baseline feature matrix.
# In notebook order `X_train` still holds every column of df_train (including
# the datetime 'date', the text 'product_type' and rows with NaN) and no longer
# has the same length as `y_train`, so fitting on it raises.
# The targets are re-selected by index so they always line up with the features.
y_search_train = df_train.loc[X_train_all.index, 'sales']
y_search_test = df_test.loc[X_test_all.index, 'sales']

rs = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),  # seeded so the forests are reproducible
    param_grid,
    cv=tscv,
    n_jobs=-1,
    n_iter=8,
    random_state=42,  # seeded so the sampled candidates are reproducible
)
rs.fit(X_train_all, y_search_train)
print("Best parameters found:", rs.best_params_)
print("Mean CV score of best parameters:", rs.best_score_)
# Before calculating the score, the model is refit using all training data.
print("Score of best parameters on test data:", rs.score(X_test_all, y_search_test))

Re-tuning the Hyperparameter on the Models

In [None]:
# Predictors of the re-tuned model. Rows with a missing value in any of them
# are dropped, and the targets are re-selected by index so X and y stay aligned.
feature_cols = ['special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21', 'rolling_means7']
X_train = df_train[feature_cols].dropna()
y_train = df_train.loc[X_train.index, 'sales']
X_test = df_test[feature_cols].dropna()
y_test = df_test.loc[X_test.index, 'sales']

# Initialize the model (fixed random_state keeps the forest reproducible)
model = RandomForestRegressor(n_estimators=50, max_features=0.6, max_depth=10, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Calculate Adjusted R2 scores: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of rows and p = number of predictors
n_train = len(y_train)
p_train = X_train.shape[1]
train_adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p_train - 1)

n_test = len(y_test)
p_test = X_test.shape[1]
test_adj_r2 = 1 - (1 - test_r2) * (n_test - 1) / (n_test - p_test - 1)

print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'Adjusted R2 on train set: {train_adj_r2}')
print(f'Adjusted R2 on test set: {test_adj_r2}\n')

Re-running the feature importance after hyperparameter tuning to the model

In [None]:
# Pair every predictor of the re-tuned model with its importance score
importance_columns = {
    'Feature': X_train.columns,
    'Importance': model.feature_importances_,
}
df_imp = pd.DataFrame(importance_columns)

# Show the table with the most important feature first (df_imp itself stays unsorted)
df_imp.sort_values('Importance', ascending=False)

### 5. Making Prediction at Training dataset

Code below running 2 operations:

1. Makes predictions for the training dataset, using 'special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21' and 'rolling_means7' as predictors and storing the result in 'predicted_sales'.

2. Displays the date and the predicted value of each product at each store: it prints the 'date', 'store_nbr', 'encoded_product_type', 'sales', and 'predicted_sales' columns of the df_train dataframe. This gives us a view of the actual and predicted sales for each product at each store on each date.

In [None]:
# Replace missing values with 0 so the model can score every row
# (the lag / rolling features are NaN where no history is available).
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# Make predictions for the whole training set. They get their own name so the
# `test_preds` used for the evaluation metrics is not overwritten with
# train-set predictions.
train_preds_all = model.predict(
    df_train[['special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21', 'rolling_means7']]
)

# Add the predictions to the DataFrame
df_train['predicted_sales'] = train_preds_all

# Display the date, predicted value of each product at each store
print(df_train[['date', 'store_nbr', 'encoded_product_type', 'sales', 'predicted_sales']])


### 6. Setting Predicted sales of Training dataset to Plots.

In [None]:
# Daily mean of actual and predicted sales over the training period
daily_train = df_train.resample('D', on='date')[['sales', 'predicted_sales']]
train_result_agg = daily_train.mean().reset_index()

In [None]:
# Actual daily sales as the base line chart ...
fig = px.line(
    train_result_agg,
    x='date',
    y='sales',
    title='Actual vs Prediction Sales on Train dataset',
)
# ... with the model's daily predictions overlaid as a second line
fig.add_scatter(
    x=train_result_agg['date'],
    y=train_result_agg['predicted_sales'],
    mode='lines',
    name='predicted',
)
fig.show()


In [None]:
# Daily mean of actual and predicted sales per store (training data)
store_train = df_train.groupby(['store_nbr', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
store_df = store_train

# Build the grid from the stores actually present instead of assuming that the
# ids are exactly 1..54: every title is then guaranteed to sit above the
# matching data and a different number of stores cannot break the layout.
store_ids = store_train['store_nbr'].unique()
n_cols = 3
n_rows = max(1, int(np.ceil(len(store_ids) / n_cols)))
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'Store {store}' for store in store_ids])

for position, store in enumerate(store_ids):
    grid_row, grid_col = divmod(position, n_cols)  # 0-based cell in the grid
    store_slice = store_df[store_df['store_nbr'] == store]

    # One small figure per store: actual sales plus the predicted line
    px_fig = px.line(store_slice, x='date', y='sales')
    px_fig.add_scatter(x=store_slice['date'], y=store_slice['predicted_sales'], mode='lines')

    # Copy both traces into the matching cell (plotly rows/cols are 1-based)
    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Keep the original height per row (4000 px for 18 rows)
fig.update_layout(height=max(400, round(4000 * n_rows / 18)), width=2000,
                  title_text='Sales Prediction by Stores on Train dataset')

fig.show()


In [None]:
# Daily mean of actual and predicted sales per product type (training data)
product_train = df_train.groupby(['encoded_product_type', 'product_type', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
product_df = product_train

# Build the grid from the (code, name) pairs actually present instead of
# assuming the codes are exactly 0..32: every title is then guaranteed to sit
# above the matching data and a different number of products cannot break the
# layout.
product_pairs = product_train[['encoded_product_type', 'product_type']].drop_duplicates()
n_cols = 3
n_rows = max(1, int(np.ceil(len(product_pairs) / n_cols)))
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'{product}' for product in product_pairs['product_type']])

for position, product_code in enumerate(product_pairs['encoded_product_type']):
    grid_row, grid_col = divmod(position, n_cols)  # 0-based cell in the grid
    product_slice = product_df[product_df['encoded_product_type'] == product_code]

    # One small figure per product: actual sales plus the predicted line
    px_fig = px.line(product_slice, x='date', y='sales')
    px_fig.add_scatter(x=product_slice['date'], y=product_slice['predicted_sales'], mode='lines')

    # Copy both traces into the matching cell (plotly rows/cols are 1-based)
    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Keep the original height per row (4000 px for 11 rows)
fig.update_layout(height=max(400, round(4000 * n_rows / 11)), width=2000,
                  title_text='Sales Prediction by Product on Train dataset')

fig.show()


### --------------------------------------------------------------------------------

### 7. Making Prediction at Test dataset (31 July - 15 August)

In [None]:
# Replace every missing value with 0, in place, in both frames
for frame in (df_train, df_test):
    frame.fillna(0, inplace=True)

# Score the test set with the predictors the current model was trained on
test_feature_cols = ['special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21', 'rolling_means7']
test_preds = model.predict(df_test[test_feature_cols])

# Keep the predictions next to the actual values
df_test['predicted_sales'] = test_preds

# Show date, store, product, actual and predicted sales side by side
report_cols = ['date', 'store_nbr', 'encoded_product_type', 'sales', 'predicted_sales']
print(df_test[report_cols])


In [None]:
# Daily mean of actual and predicted sales over the test period
daily_test = df_test.resample('D', on='date')[['sales', 'predicted_sales']]
test_result_agg = daily_test.mean().reset_index()

In [None]:
# Actual daily sales as the base line chart ...
fig2 = px.line(
    test_result_agg,
    x='date',
    y='sales',
    title='Actual vs Prediction Sales on Test dataset',
)
# ... with the model's daily predictions overlaid as a second line
fig2.add_scatter(
    x=test_result_agg['date'],
    y=test_result_agg['predicted_sales'],
    mode='lines',
    name='predicted',
)
fig2.show()


### 8. Setting Predicted sales of Test dataset to Plots.

In [None]:
# Daily mean of actual and predicted sales per store (test data)
store_test = df_test.groupby(['store_nbr', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
store_df = store_test

# Build the grid from the stores actually present instead of assuming that the
# ids are exactly 1..54: every title is then guaranteed to sit above the
# matching data and a different number of stores cannot break the layout.
store_ids = store_test['store_nbr'].unique()
n_cols = 3
n_rows = max(1, int(np.ceil(len(store_ids) / n_cols)))
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'Store {store}' for store in store_ids])

for position, store in enumerate(store_ids):
    grid_row, grid_col = divmod(position, n_cols)  # 0-based cell in the grid
    store_slice = store_df[store_df['store_nbr'] == store]

    # One small figure per store: actual sales plus the predicted line
    px_fig = px.line(store_slice, x='date', y='sales')
    px_fig.add_scatter(x=store_slice['date'], y=store_slice['predicted_sales'], mode='lines')

    # Copy both traces into the matching cell (plotly rows/cols are 1-based)
    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Keep the original height per row (4000 px for 18 rows)
fig.update_layout(height=max(400, round(4000 * n_rows / 18)), width=2000,
                  title_text='Sales Prediction by Stores on Test dataset')

fig.show()


In [None]:
# Daily mean of actual and predicted sales per product type (test data)
product_test = df_test.groupby(['encoded_product_type', 'product_type', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
product_df = product_test

# Build the grid from the (code, name) pairs actually present instead of
# assuming the codes are exactly 0..32: every title is then guaranteed to sit
# above the matching data and a different number of products cannot break the
# layout.
product_pairs = product_test[['encoded_product_type', 'product_type']].drop_duplicates()
n_cols = 3
n_rows = max(1, int(np.ceil(len(product_pairs) / n_cols)))
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'{product}' for product in product_pairs['product_type']])

for position, product_code in enumerate(product_pairs['encoded_product_type']):
    grid_row, grid_col = divmod(position, n_cols)  # 0-based cell in the grid
    product_slice = product_df[product_df['encoded_product_type'] == product_code]

    # One small figure per product: actual sales plus the predicted line
    px_fig = px.line(product_slice, x='date', y='sales')
    px_fig.add_scatter(x=product_slice['date'], y=product_slice['predicted_sales'], mode='lines')

    # Copy both traces into the matching cell (plotly rows/cols are 1-based)
    for trace in px_fig['data']:
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Keep the original height per row (4000 px for 11 rows)
fig.update_layout(height=max(400, round(4000 * n_rows / 11)), width=2000,
                  title_text='Sales Prediction by Product on Test dataset')

fig.show()


Break the MAE down by store and by product to get more granular performance metrics.

In [None]:
# Create a mapping from encoded_product_type to product_type.
# Only one row per code is kept before building the dict (the last one, which
# is the entry the dict would end up with anyway), instead of indexing every row.
product_mapping = (
    df_train[['encoded_product_type', 'product_type']]
    .drop_duplicates('encoded_product_type', keep='last')
    .set_index('encoded_product_type')['product_type']
    .to_dict()
)

# Predictors the current model was fitted on
mae_feature_cols = ['special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21', 'rolling_means7']


def _subset_mae(features, target):
    """Return the model's MAE on the rows of `features`, or NaN if there are none.

    `target` is aligned through the index of `features`. The empty case is
    handled explicitly because `model.predict` raises on zero rows, which
    happens when a store/product has no usable rows in one of the splits.
    """
    if features.empty:
        return float('nan')
    return mean_absolute_error(target.loc[features.index], model.predict(features))


# Calculate and print MAE by store
for store_nbr in df_train['store_nbr'].unique():
    # Subset the data (store_nbr is only used as a filter, not as a predictor)
    X_train_subset = X_train_all.loc[X_train_all['store_nbr'] == store_nbr, mae_feature_cols]
    X_test_subset = X_test_all.loc[X_test_all['store_nbr'] == store_nbr, mae_feature_cols]

    # Predict and calculate MAE
    train_mae_subset = _subset_mae(X_train_subset, y_train)
    test_mae_subset = _subset_mae(X_test_subset, y_test)

    # Print MAE
    print(f'MAE on train set for store {store_nbr}: {train_mae_subset}')
    print(f'MAE on test set for store {store_nbr}: {test_mae_subset}\n')

# Calculate and print MAE by product
for encoded_product_type in df_train['encoded_product_type'].unique():
    # Subset the data
    X_train_subset = X_train[X_train['encoded_product_type'] == encoded_product_type]
    X_test_subset = X_test[X_test['encoded_product_type'] == encoded_product_type]

    # Predict and calculate MAE
    train_mae_subset = _subset_mae(X_train_subset, y_train)
    test_mae_subset = _subset_mae(X_test_subset, y_test)

    # Get the corresponding product_type
    product_type = product_mapping.get(encoded_product_type, 'Unknown')

    # Print MAE
    print(f'MAE on train set for product {product_type}: {train_mae_subset}')
    print(f'MAE on test set for product {product_type}: {test_mae_subset}\n')


In [None]:
# Export the per-store and per-product MAE to an Excel file

# Predictors the current model was fitted on
export_feature_cols = ['special_offer', 'encoded_product_type', 'sales_lag365', 'sales_lag21', 'rolling_means7']


def _mae_or_nan(features, target):
    """Return the model's MAE on the rows of `features`, or NaN if there are none.

    `target` is aligned through the index of `features`. The empty case is
    handled explicitly because `model.predict` raises on zero rows.
    """
    if features.empty:
        return float('nan')
    return mean_absolute_error(target.loc[features.index], model.predict(features))


# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop).
records = []

# MAE by store (store_nbr is only used as a filter, not as a predictor)
for store_nbr in df_train['store_nbr'].unique():
    X_train_subset = X_train_all.loc[X_train_all['store_nbr'] == store_nbr, export_feature_cols]
    X_test_subset = X_test_all.loc[X_test_all['store_nbr'] == store_nbr, export_feature_cols]
    records.append({
        'Store/Product': f'Store {store_nbr}',
        'Train MAE': _mae_or_nan(X_train_subset, y_train),
        'Test MAE': _mae_or_nan(X_test_subset, y_test),
    })

# MAE by product
for encoded_product_type in df_train['encoded_product_type'].unique():
    X_train_subset = X_train[X_train['encoded_product_type'] == encoded_product_type]
    X_test_subset = X_test[X_test['encoded_product_type'] == encoded_product_type]

    # Get the corresponding product_type
    product_type = product_mapping.get(encoded_product_type, 'Unknown')

    records.append({
        'Store/Product': f'Product {product_type}',
        'Train MAE': _mae_or_nan(X_train_subset, y_train),
        'Test MAE': _mae_or_nan(X_test_subset, y_test),
    })

# `columns=` fixes the column order and keeps the header even if there are no rows
results = pd.DataFrame(records, columns=['Store/Product', 'Train MAE', 'Test MAE'])

# Export to Excel
results.to_excel('mae_results.xlsx', index=False)


In [None]:
# Mean predicted sales per store on the test set, written to an Excel sheet
store_means = df_test.groupby('store_nbr')['predicted_sales'].mean()
avgPredStore = store_means.reset_index()
avgPredStore.to_excel('avg_store.xlsx', index=False)

In [None]:
# Mean predicted sales per product type on the test set, written to an Excel sheet
product_means = df_test.groupby('product_type')['predicted_sales'].mean()
avgPredProd = product_means.reset_index()
avgPredProd.to_excel('avg_product.xlsx', index=False)