# 1. Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load combined_df from ../data/processed
combined_df = pd.read_csv('../data/processed/combined_data.csv')
combined_df

Unnamed: 0,item_id,store_id,d,sales_qty,date,wm_yr_wk,sell_price
0,HOBBIES_1_001,CA_1,d_1,0,2011-01-29,11101,
1,HOBBIES_1_002,CA_1,d_1,0,2011-01-29,11101,
2,HOBBIES_1_003,CA_1,d_1,0,2011-01-29,11101,
3,HOBBIES_1_004,CA_1,d_1,0,2011-01-29,11101,
4,HOBBIES_1_005,CA_1,d_1,0,2011-01-29,11101,
...,...,...,...,...,...,...,...
46985085,FOODS_3_823,WI_3,d_1541,2,2015-04-18,11512,2.88
46985086,FOODS_3_824,WI_3,d_1541,0,2015-04-18,11512,2.00
46985087,FOODS_3_825,WI_3,d_1541,0,2015-04-18,11512,3.98
46985088,FOODS_3_826,WI_3,d_1541,0,2015-04-18,11512,1.28


In [3]:
# Drop columns 'item_id', 'store_id', 'd','wm_yr_wk' from combined_df
combined_df.drop(['item_id', 'store_id', 'd','wm_yr_wk'], axis=1, inplace=True)
combined_df

Unnamed: 0,sales_qty,date,sell_price
0,0,2011-01-29,
1,0,2011-01-29,
2,0,2011-01-29,
3,0,2011-01-29,
4,0,2011-01-29,
...,...,...,...
46985085,2,2015-04-18,2.88
46985086,0,2015-04-18,2.00
46985087,0,2015-04-18,3.98
46985088,0,2015-04-18,1.28


In [4]:
# Load calendar_events from ../data/processed
df_event = pd.read_csv('../data/raw/calendar_events.csv')
df_event

Unnamed: 0,date,event_name,event_type
0,2011-02-06,SuperBowl,Sporting
1,2011-02-14,ValentinesDay,Cultural
2,2011-02-21,PresidentsDay,National
3,2011-03-09,LentStart,Religious
4,2011-03-16,LentWeek2,Religious
...,...,...,...
162,2016-05-30,MemorialDay,National
163,2016-06-02,NBAFinalsStart,Sporting
164,2016-06-07,Ramadan starts,Religious
165,2016-06-19,Father's day,Cultural


In [7]:
# Drop column event_type
df_event.drop('event_type', axis=1, inplace=True)

In [8]:
# Merge combined_df and df_event on common column 'date' with left join
final_df = pd.merge(combined_df, df_event, on='date', how='left')
final_df 

Unnamed: 0,sales_qty,date,sell_price,event_name
0,0,2011-01-29,,
1,0,2011-01-29,,
2,0,2011-01-29,,
3,0,2011-01-29,,
4,0,2011-01-29,,
...,...,...,...,...
47107045,2,2015-04-18,2.88,
47107046,0,2015-04-18,2.00,
47107047,0,2015-04-18,3.98,
47107048,0,2015-04-18,1.28,


In [9]:
# Save final_df to folder data/processed
pd.DataFrame(final_df).to_csv('../data/processed/final_df_forecast.csv', index=False)

# 2. Data Preprocessing

In [53]:
# Make a copy of final_df and save as final_df_clean
final_df_clean = final_df.copy()

In [54]:
final_df_clean.isna().sum()

sales_qty            0
date                 0
sell_price    12291876
event_name    43143350
dtype: int64

In [55]:
# Replace NaN values in column 'sell_price' with value 0
final_df_clean['sell_price'] = final_df_clean['sell_price'].replace(np.nan, 0, regex=True)

In [56]:
# Create a column named sales_revenue
final_df_clean['sales_revenue'] = final_df_clean['sales_qty'] * final_df_clean['sell_price']

In [57]:
# Drop 'sales_qty', 'sell_price' in combined_df_clean
final_df_clean = final_df_clean.drop(['sales_qty', 'sell_price'], axis=1)
final_df_clean

Unnamed: 0,date,event_name,sales_revenue
0,2011-01-29,,0.00
1,2011-01-29,,0.00
2,2011-01-29,,0.00
3,2011-01-29,,0.00
4,2011-01-29,,0.00
...,...,...,...
47107045,2015-04-18,,5.76
47107046,2015-04-18,,0.00
47107047,2015-04-18,,0.00
47107048,2015-04-18,,0.00


In [58]:
# Sort the final_df_clean by date to ensure its chronological order
final_df_clean.sort_values(by='date', inplace=True)

In [59]:
# Shift the 'sales_revenue' column by 7 days to create the target variable
final_df_clean['target_revenue'] = final_df_clean['sales_revenue'].shift(-7)
final_df_clean

Unnamed: 0,date,event_name,sales_revenue,target_revenue
0,2011-01-29,,0.00,0.0
20334,2011-01-29,,0.00,0.0
20333,2011-01-29,,0.00,0.0
20332,2011-01-29,,18.90,0.0
20331,2011-01-29,,2.42,0.0
...,...,...,...,...
47086716,2015-04-18,,0.00,
47086715,2015-04-18,,4.52,
47086714,2015-04-18,,0.00,
47086726,2015-04-18,,0.00,


In [60]:
# Replace NaN values in column 'target_revenue' with value 0
final_df_clean['target_revenue'] = final_df_clean['target_revenue'].replace(np.nan, 0, regex=True)

In [61]:
# Drop 'sales_revenue' in final_df_clean
final_df_clean = final_df_clean.drop(['sales_revenue'], axis=1)
final_df_clean

Unnamed: 0,date,event_name,target_revenue
0,2011-01-29,,0.0
20334,2011-01-29,,0.0
20333,2011-01-29,,0.0
20332,2011-01-29,,0.0
20331,2011-01-29,,0.0
...,...,...,...
47086716,2015-04-18,,0.0
47086715,2015-04-18,,0.0
47086714,2015-04-18,,0.0
47086726,2015-04-18,,0.0


In [62]:
# Replace NaN values in column 'event_name' with value 'NotApplicable'
final_df_clean['event_name'] = final_df_clean['event_name'].replace(np.nan, 'NotApplicable', regex=True)
final_df_clean

Unnamed: 0,date,event_name,target_revenue
0,2011-01-29,NotApplicable,0.0
20334,2011-01-29,NotApplicable,0.0
20333,2011-01-29,NotApplicable,0.0
20332,2011-01-29,NotApplicable,0.0
20331,2011-01-29,NotApplicable,0.0
...,...,...,...
47086716,2015-04-18,NotApplicable,0.0
47086715,2015-04-18,NotApplicable,0.0
47086714,2015-04-18,NotApplicable,0.0
47086726,2015-04-18,NotApplicable,0.0


In [8]:
# Downsample df_clean by using every 5th row and save as df_clean_sample
# df_clean_sample = combined_df_clean.iloc[::5, :]

In [19]:
# Set date column to be dataframe index
# final_df_clean.set_index('date')

Unnamed: 0_level_0,event_name,sales_revenue
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-29,NotApplicable,0.00
2011-01-29,NotApplicable,0.00
2011-01-29,NotApplicable,0.00
2011-01-29,NotApplicable,0.00
2011-01-29,NotApplicable,0.00
...,...,...
2015-04-18,NotApplicable,5.76
2015-04-18,NotApplicable,0.00
2015-04-18,NotApplicable,0.00
2015-04-18,NotApplicable,0.00


In [63]:
# Convert the 'date' column to a datetime type
final_df_clean['date'] = pd.to_datetime(final_df_clean['date'])
final_df_clean

Unnamed: 0,date,event_name,target_revenue
0,2011-01-29,NotApplicable,0.0
20334,2011-01-29,NotApplicable,0.0
20333,2011-01-29,NotApplicable,0.0
20332,2011-01-29,NotApplicable,0.0
20331,2011-01-29,NotApplicable,0.0
...,...,...,...
47086716,2015-04-18,NotApplicable,0.0
47086715,2015-04-18,NotApplicable,0.0
47086714,2015-04-18,NotApplicable,0.0
47086726,2015-04-18,NotApplicable,0.0


In [64]:
# Import datetime as dt
import datetime as dt

In [65]:
# Create features 'year', 'quarter', 'month', 'day_of_week' from column 'date'
final_df_clean['year'] = final_df_clean['date'].dt.year
final_df_clean['quarter'] = final_df_clean['date'].dt.quarter
final_df_clean['month'] = final_df_clean['date'].dt.month
final_df_clean['day_of_week'] = final_df_clean['date'].dt.dayofweek  # 0=Monday, 1=Tuesday, ..., 6=Sunday

final_df_clean

Unnamed: 0,date,event_name,target_revenue,year,quarter,month,day_of_week
0,2011-01-29,NotApplicable,0.0,2011,1,1,5
20334,2011-01-29,NotApplicable,0.0,2011,1,1,5
20333,2011-01-29,NotApplicable,0.0,2011,1,1,5
20332,2011-01-29,NotApplicable,0.0,2011,1,1,5
20331,2011-01-29,NotApplicable,0.0,2011,1,1,5
...,...,...,...,...,...,...,...
47086716,2015-04-18,NotApplicable,0.0,2015,2,4,5
47086715,2015-04-18,NotApplicable,0.0,2015,2,4,5
47086714,2015-04-18,NotApplicable,0.0,2015,2,4,5
47086726,2015-04-18,NotApplicable,0.0,2015,2,4,5


# 4. Train/ Validation Split

In [66]:
# Import TimeSeriesSplit from sklearn.model_selection 
from sklearn.model_selection import TimeSeriesSplit

In [67]:
# Initiate TimeSeriesSplit class with n_splits=5
tscv = TimeSeriesSplit(n_splits=5)

In [68]:
# Split df_clean_sample to train_data and val_data
for train_index, val_index in tscv.split(final_df_clean):
    train_data = final_df_clean.iloc[train_index]
    val_data = final_df_clean.iloc[val_index]

MemoryError: Unable to allocate 59.9 MiB for an array with shape (15702350, 1) and data type int32

In [17]:
# Save the target variable of train_data and val_data as y_train and y_val respectively
y_train = train_data['sales_revenue']
y_val = val_data['sales_revenue']

# 5. Baseline model

In [18]:
# Find the mean value of target variable and save as y_mean
y_mean = train_data['sales_revenue'].mean()

In [19]:
# Create a numpy array with the same dimensions as for the train_data called y_base filled with this value
y_base = np.full(y_train.shape, y_mean)

In [20]:
# Import mean_squared_error from sklearn.metrics
from sklearn.metrics import mean_squared_error as mse

In [21]:
# Print the recall score of this baseline model on the training dataset
print(mse(y_train, y_base, squared=False))

9.097263900281192


# 6. Train and evaluate XGBoost model

In [22]:
# Import Pipeline from sklearn.pipeline, import StandardScaler from sklearn.preprocessing, import xgb as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

In [23]:
# Create a Pipeline called num_transformer with one step that contains StandardScaler
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [24]:
# Create a list called num_cols that will contain the list of columns that are numeric type
num_cols = ['year', 'quarter', 'month', 'day_of_week', 'lag7days']

In [25]:
# Import ColumnTransformer from sklearn.compose
from sklearn.compose import ColumnTransformer

In [26]:
# Create a ColumnTransformer called preprocessor containing the following steps

# num_transformer for num_cols
# cat_transformer for cat_cols

preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols)
    ]
)

In [27]:
# Create a Pipeline called sgd_pipe that contains 2 steps preprocessor and another that instantiate a SGDClassifier with same parameters as previously
sgd_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', xgb.XGBRegressor())
    ]
)

In [28]:
# Fit xgb_pipe with train_data
sgd_pipe.fit(train_data, y_train)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  if is_sparse(data):


In [29]:
# Make predictions on train_data and save as y_train_preds
y_train_preds = sgd_pipe.predict(train_data)

In [30]:
# Make predictions on val_data and save as y_val_preds
y_val_preds = sgd_pipe.predict(val_data)

In [31]:
# Display the RMSE score on the training set
print(mse(y_train, y_train_preds, squared=False))

9.07647293440825


In [32]:
# Display the RMSE score on the testing set
print(mse(y_val, y_val_preds, squared=False))

10.013246967160121


In [33]:
# Import dump from joblib package and save sgd_pipe into models folder
from joblib import dump

dump(sgd_pipe,  '../models/forecasting/sgd_pipe.joblib')

['../models/forecasting/xgb_pipeline.joblib']