# 1. Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load combined_df from ../data/processed
combined_df = pd.read_csv('../data/processed/combined_data.csv')

# 2. Data Preprocessing

In [3]:
# Make a copy of combined_df and save as combined_df_clean
combined_df_clean = combined_df.copy()

In [4]:
# Replace NaN values in column 'sell_price' with value 0
combined_df_clean['sell_price'] = combined_df_clean['sell_price'].replace(np.nan, 0, regex=True)

In [5]:
# Create a column named sales_revenue
combined_df_clean['sales_revenue'] = combined_df_clean['sales_qty'] * combined_df_clean['sell_price']

In [6]:
# Drop 'item_id', 'store_id', 'd', 'sales_qty', 'wm_yr_wk', 'sell_price' in combined_df_clean
combined_df_clean = combined_df_clean.drop(['item_id', 'store_id', 'd', 'sales_qty', 'wm_yr_wk', 'sell_price'], axis=1)
combined_df_clean

Unnamed: 0,date,sales_revenue
0,2011-01-29,0.00
1,2011-01-29,0.00
2,2011-01-29,0.00
3,2011-01-29,0.00
4,2011-01-29,0.00
...,...,...
46985085,2015-04-18,5.76
46985086,2015-04-18,0.00
46985087,2015-04-18,0.00
46985088,2015-04-18,0.00


In [7]:
combined_df_clean.isnull().sum()

date             0
sales_revenue    0
dtype: int64

In [8]:
# Downsample df_clean by using every 5th row and save as df_clean_sample
df_clean_sample = combined_df_clean.iloc[::5, :]

In [9]:
# Set date column to be dataframe index
df_clean_sample.set_index('date')

Unnamed: 0_level_0,sales_revenue
date,Unnamed: 1_level_1
2011-01-29,0.00
2011-01-29,0.00
2011-01-29,0.00
2011-01-29,3.50
2011-01-29,0.00
...,...
2015-04-18,0.00
2015-04-18,0.00
2015-04-18,4.50
2015-04-18,0.00


In [10]:
# Convert the 'date' column to a datetime type
df_clean_sample['date'] = pd.to_datetime(df_clean_sample['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['date'] = pd.to_datetime(df_clean_sample['date'])


# 3. Create time series and lag features

In [11]:
# Import datetime as dt
import datetime as dt

In [12]:
# Create features 'year', 'quarter', 'month', 'day_of_week', 'weekofyear', 'dayofyear', 'dayofmonth' from column 'date'
df_clean_sample['year'] = df_clean_sample['date'].dt.year
df_clean_sample['quarter'] = df_clean_sample['date'].dt.quarter
df_clean_sample['month'] = df_clean_sample['date'].dt.month
df_clean_sample['day_of_week'] = df_clean_sample['date'].dt.dayofweek  # 0=Monday, 1=Tuesday, ..., 6=Sunday
df_clean_sample['weekofyear'] = df_clean_sample['date'].dt.isocalendar().week
df_clean_sample['dayofyear'] = df_clean_sample['date'].dt.dayofyear
df_clean_sample['dayofmonth'] = df_clean_sample['date'].dt.day

df_clean_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['year'] = df_clean_sample['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['quarter'] = df_clean_sample['date'].dt.quarter
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['month'] = df_clean_sample['date'].dt.month
A value is trying to be

Unnamed: 0,date,sales_revenue,year,quarter,month,day_of_week,weekofyear,dayofyear,dayofmonth
0,2011-01-29,0.00,2011,1,1,5,4,29,29
5,2011-01-29,0.00,2011,1,1,5,4,29,29
10,2011-01-29,0.00,2011,1,1,5,4,29,29
15,2011-01-29,3.50,2011,1,1,5,4,29,29
20,2011-01-29,0.00,2011,1,1,5,4,29,29
...,...,...,...,...,...,...,...,...,...
46985065,2015-04-18,0.00,2015,2,4,5,16,108,18
46985070,2015-04-18,0.00,2015,2,4,5,16,108,18
46985075,2015-04-18,4.50,2015,2,4,5,16,108,18
46985080,2015-04-18,0.00,2015,2,4,5,16,108,18


In [13]:
# Create lag features based on time series index.
target_map = df_clean_sample['sales_revenue'].to_dict()
df_clean_sample['lag1'] = (df_clean_sample['date'] - pd.Timedelta('7 days')).map(target_map)
df_clean_sample['lag2'] = (df_clean_sample['date'] - pd.Timedelta('14 days')).map(target_map)
df_clean_sample['lag3'] = (df_clean_sample['date'] - pd.Timedelta('28 days')).map(target_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['lag1'] = (df_clean_sample['date'] - pd.Timedelta('7 days')).map(target_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['lag2'] = (df_clean_sample['date'] - pd.Timedelta('14 days')).map(target_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_sample['l

# 4. Train/ Validation Split

In [14]:
# Import TimeSeriesSplit from sklearn.model_selection 
from sklearn.model_selection import TimeSeriesSplit

In [15]:
# Initiate TimeSeriesSplit class with n_splits=5
tscv = TimeSeriesSplit(n_splits=5)

In [16]:
# Split df_clean_sample to train_data and val_data
for train_index, val_index in tscv.split(df_clean_sample):
    train_data = df_clean_sample.iloc[train_index]
    val_data = df_clean_sample.iloc[val_index]

In [17]:
# Save the target variable of train_data and val_data as y_train and y_val respectively
y_train = train_data['sales_revenue']
y_val = val_data['sales_revenue']

# 5. Baseline model

In [18]:
# Find the mean value of target variable and save as y_mean
y_mean = train_data['sales_revenue'].mean()

In [19]:
# Create a numpy array with the same dimensions as for the train_data called y_base filled with this value
y_base = np.full(y_train.shape, y_mean)

In [20]:
# Import mean_squared_error from sklearn.metrics
from sklearn.metrics import mean_squared_error as mse

In [21]:
# Print the recall score of this baseline model on the training dataset
print(mse(y_train, y_base, squared=False))

9.097263900281192


# 6. Train and evaluate XGBoost model

In [46]:
# Import Pipeline from sklearn.pipeline, import StandardScaler from sklearn.preprocessing, import xgb as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [47]:
# Create a Pipeline called num_transformer with one step that contains StandardScaler
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [48]:
# Create a Pipeline called cat_transformer with one step that contains OneHotEncoder
# cat_transformer = Pipeline(
#     steps=[
#         ('one_hot_encoder', OneHotEncoder())
#     ]
# )

In [49]:
# Create a list called num_cols that will contain the list of columns that are numeric type
num_cols = ['year', 'quarter', 'month', 'weekofyear', 'dayofyear', 'dayofmonth', 'day_of_week', 'lag1', 'lag2', 'lag3']

In [50]:
# cat_cols = ['item_id', 'store_id']

In [51]:
# Import ColumnTransformer from sklearn.compose
from sklearn.compose import ColumnTransformer

In [52]:
# Create a ColumnTransformer called preprocessor containing the following steps

# num_transformer for num_cols
# cat_transformer for cat_cols

preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols)
    ]
)

In [53]:
# Create a Pipeline called sgd_pipe that contains 2 steps preprocessor and another that instantiate a SGDClassifier with same parameters as previously
xgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', xgb.XGBRegressor())
    ]
)

In [54]:
# Fit xgb_pipe with train_data
xgb_pipe.fit(train_data, y_train)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  if is_sparse(data):


In [55]:
# Make predictions on train_data and save as y_train_preds
y_train_preds = xgb_pipe.predict(train_data)

In [56]:
# Make predictions on val_data and save as y_val_preds
y_val_preds = xgb_pipe.predict(val_data)

In [57]:
# Display the RMSE score on the training set
print(mse(y_train, y_train_preds, squared=False))

9.072826972278389


In [58]:
# Display the RMSE score on the testing set
print(mse(y_val, y_val_preds, squared=False))

10.008304662516279


In [59]:
# Import dump from joblib package and save sgd_pipe into models folder
from joblib import dump

dump(xgb_pipe,  '../models/forecasting/xgb_pipeline.joblib')

['../models/forecasting/xgb_pipeline.joblib']