### 1. Importing all the libraries
### Model used - XGBoost Regressor (`xgb.XGBRegressor`)

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit

import plotly.express as px
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise Plotly's notebook integration.
# NOTE(review): connected=False presumably embeds plotly.js in the notebook
# instead of loading it from a CDN — confirm against the plotly.offline docs.
init_notebook_mode(connected=False)
# Use the 'colab' renderer for every fig.show() call in this notebook.
pio.renderers.default = 'colab'
# Apply the 'ggplot2' template to every figure created below.
pio.templates.default = 'ggplot2'

from plotly.subplots import make_subplots

### 2. Read the CSV

In [None]:
# Load the preprocessed training data, then show for each column whether it
# contains at least one missing value (the last expression is the cell output).
train_path = 'train_preprocessed'
df_train = pd.read_csv(train_path)
df_train.isna().any()

In [None]:
# Load the preprocessed test data, then show for each column whether it
# contains at least one missing value (the last expression is the cell output).
test_path = 'test_preprocessed'
df_test = pd.read_csv(test_path)
df_test.isna().any()

### 3. Model Running

In [None]:
# Parse the 'date' column of both frames into datetime values (in place).
for frame in (df_train, df_test):
  frame['date'] = pd.to_datetime(frame['date'])

# 'sales' is the target; every remaining column stays on the feature side.
X_train, y_train = df_train.drop(columns='sales'), df_train['sales']
X_test, y_test = df_test.drop(columns='sales'), df_test['sales']

In [None]:
# Fit an XGBoost regressor on the selected predictors and report MAE, RMSE and
# adjusted R2 for both the train and the test set.

# Predictor columns, shared by the train and test selections below.
feature_cols = ['special_offer', 'day of week', 'store_nbr', 'encoded_product_type',
                'sales_lag365', 'sales_lag21', 'rolling_means7']

# Keep only rows where every predictor is present; the targets are aligned to
# the surviving rows through the shared index.
X_train = df_train[feature_cols].dropna()
y_train = df_train.loc[X_train.index, 'sales']
X_test = df_test[feature_cols].dropna()
y_test = df_test.loc[X_test.index, 'sales']

# Initialize the model
model = xgb.XGBRegressor(n_estimators=1000, objective='reg:squarederror')

# Fit the model
model.fit(X_train, y_train, verbose=False)

# Make predictions
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Calculate MAE
train_mae = mean_absolute_error(y_train, train_preds)
test_mae = mean_absolute_error(y_test, test_preds)

# Calculate RMSE as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so the square root is taken explicitly to work on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

# Calculate R2 scores
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

# Calculate Adjusted R2 scores: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of rows and p the number of predictors.
n_train = len(y_train)
p_train = X_train.shape[1]
train_adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p_train - 1)

n_test = len(y_test)
p_test = X_test.shape[1]
test_adj_r2 = 1 - (1 - test_r2) * (n_test - 1) / (n_test - p_test - 1)

print(f'MAE on train set: {train_mae}')
print(f'MAE on test set: {test_mae}\n')
print(f'RMSE on train set: {train_rmse}')
print(f'RMSE on test set: {test_rmse}\n')
print(f'Adjusted R2 on train set: {train_adj_r2}')
print(f'Adjusted R2 on test set: {test_adj_r2}\n')

In [None]:
# Pair each predictor name with the importance score reported by the fitted
# model, then display the table ordered from most to least important.
df_imp = pd.DataFrame({'Feature': X_train.columns, 'Importance': model.feature_importances_})
df_imp.sort_values('Importance', ascending=False)

### 4. Making Predictions on the Training Dataset

The code below performs two operations:

1. Makes predictions for the training dataset, using ‘special_offer’, ‘day of week’, ‘store_nbr’, ‘encoded_product_type’, ‘sales_lag365’, ‘sales_lag21’, and ‘rolling_means7’ as predictors, and stores the result in a new ‘predicted_sales’ column.

2. Displays the date and the predicted value of each product at each store: it prints the ‘date’, ‘store_nbr’, ‘encoded_product_type’, ‘sales’, and ‘predicted_sales’ columns of the df_train dataframe. This gives us a view of the actual and predicted sales for each product at each store on each date.

In [None]:
# Score every row of the training set and attach the result as 'predicted_sales'.

# Replace the remaining missing values with 0 so that every row can be scored.
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# Predictor columns, in the order used throughout this notebook.
feature_cols = ['special_offer', 'day of week', 'store_nbr', 'encoded_product_type',
                'sales_lag365', 'sales_lag21', 'rolling_means7']

# Make predictions for the training set. A dedicated variable is used so the
# test-set predictions held in `test_preds` are not overwritten.
train_full_preds = model.predict(df_train[feature_cols])

# Add the predictions to the DataFrame
df_train['predicted_sales'] = train_full_preds

# Display the date, actual and predicted value of each product at each store
print(df_train[['date', 'store_nbr', 'encoded_product_type','sales', 'predicted_sales']])


### 5. Plotting Actual vs Predicted Sales for the Training Dataset

In [None]:
# Daily mean of actual and predicted sales over the whole training set.
daily_train = df_train.set_index('date').resample('D')
train_result_agg = daily_train[['sales', 'predicted_sales']].mean().reset_index()

In [None]:
# Line chart of the daily actual sales with the daily predicted sales overlaid.
fig = px.line(
  train_result_agg,
  x='date',
  y='sales',
  title='Actual vs Prediction Sales on Train dataset',
)
predicted_line = go.Scatter(
  x=train_result_agg['date'],
  y=train_result_agg['predicted_sales'],
  mode='lines',
)
fig.add_trace(predicted_line)
fig.show()


In [None]:
# One subplot per store: daily mean of actual vs predicted sales (train set).
store_train = df_train.groupby(['store_nbr', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
store_df = store_train

# Take the panels and the grid size from the data instead of assuming stores
# 1..54 on an 18x3 grid, so each title always sits above its own store.
stores = store_df['store_nbr'].unique()
n_cols = 3
n_rows = max(1, -(-len(stores) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'Store {store}' for store in stores])

for idx, store in enumerate(stores):
  # Panels are filled row by row, in the same order as the titles.
  row, col = divmod(idx, n_cols)
  df = store_df[store_df['store_nbr'] == store]

  px_fig = px.line(df, x='date', y='sales')
  px_fig.add_scatter(x=df['date'], y=df['predicted_sales'], mode='lines')

  # Copy both traces (actual and predicted) into this store's panel.
  for trace in px_fig['data']:
    fig.add_trace(trace, row=row + 1, col=col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Stores on Train dataset')

fig.show()


In [None]:
# One subplot per product type: daily mean of actual vs predicted sales (train set).
product_test = df_train.groupby(['encoded_product_type', 'product_type', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
product_df = product_test

# Take the panels and the grid size from the data instead of assuming codes
# 0..32 on an 11x3 grid, so each title always sits above its own product.
products = product_df[['encoded_product_type', 'product_type']].drop_duplicates()
n_cols = 3
n_rows = max(1, -(-len(products) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'{product}' for product in products['product_type']])

for idx, (code, product) in enumerate(products.itertuples(index=False, name=None)):
  # Panels are filled row by row, in the same order as the titles.
  row, col = divmod(idx, n_cols)
  df = product_df[(product_df['encoded_product_type'] == code) & (product_df['product_type'] == product)]

  px_fig = px.line(df, x='date', y='sales')
  px_fig.add_scatter(x=df['date'], y=df['predicted_sales'], mode='lines')

  # Copy both traces (actual and predicted) into this product's panel.
  for trace in px_fig['data']:
    fig.add_trace(trace, row=row + 1, col=col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Product on Train dataset')

fig.show()


### --------------------------------------------------------------------------------

### 6. Making Predictions on the Test Dataset (31 July - 15 August)

In [None]:
# Replace missing values with 0 in both frames before scoring.
for frame in (df_train, df_test):
  frame.fillna(0, inplace=True)

# Score every row of the test set with the fitted model.
predictor_cols = [
  'special_offer',
  'day of week',
  'store_nbr',
  'encoded_product_type',
  'sales_lag365',
  'sales_lag21',
  'rolling_means7',
]
test_preds = model.predict(df_test[predictor_cols])

# Store the predictions next to the actual values.
df_test['predicted_sales'] = test_preds

# Show the date plus the actual and predicted value of each product at each store.
report_cols = ['date', 'store_nbr', 'encoded_product_type', 'sales', 'predicted_sales']
print(df_test[report_cols])


In [None]:
# Daily mean of actual and predicted sales over the whole test set.
daily_test = df_test.set_index('date').resample('D')
test_result_agg = daily_test[['sales', 'predicted_sales']].mean().reset_index()

In [None]:
# Line chart of the daily actual sales with the daily predicted sales overlaid.
fig2 = px.line(
  test_result_agg,
  x='date',
  y='sales',
  title='Actual vs Prediction Sales on Test dataset',
)
predicted_line = go.Scatter(
  x=test_result_agg['date'],
  y=test_result_agg['predicted_sales'],
  mode='lines',
)
fig2.add_trace(predicted_line)
fig2.show()


### 7. Plotting Actual vs Predicted Sales for the Test Dataset

In [None]:
# One subplot per store: daily mean of actual vs predicted sales (test set).
# The name `store_train` is kept, although it holds test data here, so that
# any other cell reading it keeps working.
store_train = df_test.groupby(['store_nbr', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
store_df = store_train

# Take the panels and the grid size from the data instead of assuming stores
# 1..54 on an 18x3 grid, so each title always sits above its own store.
stores = store_df['store_nbr'].unique()
n_cols = 3
n_rows = max(1, -(-len(stores) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'Store {store}' for store in stores])

for idx, store in enumerate(stores):
  # Panels are filled row by row, in the same order as the titles.
  row, col = divmod(idx, n_cols)
  df = store_df[store_df['store_nbr'] == store]

  px_fig = px.line(df, x='date', y='sales')
  px_fig.add_scatter(x=df['date'], y=df['predicted_sales'], mode='lines')

  # Copy both traces (actual and predicted) into this store's panel.
  for trace in px_fig['data']:
    fig.add_trace(trace, row=row + 1, col=col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Stores on Test dataset')

fig.show()


In [None]:
# One subplot per product type: daily mean of actual vs predicted sales (test set).
product_test = df_test.groupby(['encoded_product_type', 'product_type', 'date'])[['sales', 'predicted_sales']].mean().reset_index()
product_df = product_test

# Take the panels and the grid size from the data instead of assuming codes
# 0..32 on an 11x3 grid, so each title always sits above its own product.
products = product_df[['encoded_product_type', 'product_type']].drop_duplicates()
n_cols = 3
n_rows = max(1, -(-len(products) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[f'{product}' for product in products['product_type']])

for idx, (code, product) in enumerate(products.itertuples(index=False, name=None)):
  # Panels are filled row by row, in the same order as the titles.
  row, col = divmod(idx, n_cols)
  df = product_df[(product_df['encoded_product_type'] == code) & (product_df['product_type'] == product)]

  px_fig = px.line(df, x='date', y='sales')
  px_fig.add_scatter(x=df['date'], y=df['predicted_sales'], mode='lines')

  # Copy both traces (actual and predicted) into this product's panel.
  for trace in px_fig['data']:
    fig.add_trace(trace, row=row + 1, col=col + 1)

fig.update_layout(height=4000, width=2000, title_text = 'Sales Prediction by Product on Test dataset')

fig.show()
