# Dough Flow Model

## Loading Data Set

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw bakery transaction export, relative to the notebook.
DATA_PATH = 'dataset/bakery_sales_revised.csv'

# Read the CSV, taking column names from its first row.
data_set = pd.read_csv(DATA_PATH, header=0)

In [49]:
# Lower-case the two capitalised column names so they match the other columns.
column_renames = {"Transaction": "transaction", "Item": "item"}
data_set = data_set.rename(columns=column_renames)

# Print the frame itself, followed by its summary statistics.
print(data_set)
print(data_set.describe())

       transaction           item           date_time period_day  \
0                1          Bread 2016-10-30 09:58:00    morning   
1                2   Scandinavian 2016-10-30 10:05:00    morning   
2                2   Scandinavian 2016-10-30 10:05:00    morning   
3                3  Hot chocolate 2016-10-30 10:07:00    morning   
4                3            Jam 2016-10-30 10:07:00    morning   
...            ...            ...                 ...        ...   
20502         9682         Coffee 2017-04-09 14:32:00  afternoon   
20503         9682            Tea 2017-04-09 14:32:00  afternoon   
20504         9683         Coffee 2017-04-09 14:57:00  afternoon   
20505         9683         Pastry 2017-04-09 14:57:00  afternoon   
20506         9684      Smoothies 2017-04-09 15:04:00  afternoon   

      weekday_weekend        date  day_of_week  month  year  
0             weekend  2016-10-30            6     10  2016  
1             weekend  2016-10-30            6     10  2016

In [56]:
# Make sure the timestamp column has a datetime dtype before using `.dt`.
data_set['date_time'] = pd.to_datetime(data_set['date_time'])

# Derive calendar columns from every transaction timestamp.
stamp = data_set['date_time'].dt
data_set['date'] = stamp.date
data_set['day_of_week'] = stamp.dayofweek
data_set['month'] = stamp.month
data_set['year'] = stamp.year

# One row per (date, item) holding the number of rows recorded for that pair.
rows_per_day_item = data_set.groupby(['date', 'item']).size()
daily_sales = rows_per_day_item.reset_index(name='quantity')

# The five items with the largest total quantity over the whole period.
total_per_item = daily_sales.groupby('item')['quantity'].sum()
top_5_items = total_per_item.nlargest(5).index

print("Top 5 most sold items ", top_5_items.tolist())

# Keep only the rows that belong to one of those five items.
is_top_item = daily_sales['item'].isin(top_5_items)
daily_sales_top_items = daily_sales[is_top_item]

# Wide layout: one row per date, one column per item; a missing
# (date, item) combination becomes 0.
daily_sales_pivot = daily_sales_top_items.pivot_table(
    index='date',
    columns='item',
    values='quantity',
)
daily_sales_pivot = daily_sales_pivot.fillna(0)

# Calendar features, computed from the index once it is a DatetimeIndex.
daily_sales_pivot.index = pd.to_datetime(daily_sales_pivot.index)
day_index = daily_sales_pivot.index
daily_sales_pivot['day_of_week'] = day_index.dayofweek
daily_sales_pivot['month'] = day_index.month
daily_sales_pivot['year'] = day_index.year
daily_sales_pivot['day_of_year'] = day_index.dayofyear
daily_sales_pivot['week_of_year'] = day_index.isocalendar().week.astype(int)

# For each top-selling item: build a lag-feature table, fit a random forest on
# the chronologically first 80% of rows, report MAE/RMSE on the remaining 20%,
# forecast the day after the last observed date, and save two diagnostic plots.
for item in top_5_items:
    print("\nTraining model for", item)

    calendar_cols = ['day_of_week', 'month', 'year', 'day_of_year', 'week_of_year']
    item_df = daily_sales_pivot[[item] + calendar_cols].copy()
    item_df = item_df.rename(columns={item: 'quantity'})

    # Lag features: the quantity from 1..7 rows earlier.
    # NOTE(review): shift() offsets by rows, not by calendar days. If
    # daily_sales_pivot has no row for some dates, lag_i is not exactly
    # "i days ago" -- confirm whether the index should be made continuous.
    for i in range(1, 8):
        item_df[f"lag_{i}"] = item_df['quantity'].shift(i)

    # The first 7 rows have incomplete lag history and are dropped.
    item_df = item_df.dropna()

    # Split data chronologically (no shuffling) so the test set is the most
    # recent 20% of rows.
    X = item_df.drop('quantity', axis=1)
    y = item_df['quantity']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Model training
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluation on the held-out rows
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)  # mean absolute error
    rmse = np.sqrt(mean_squared_error(y_test, predictions))  # root mean square error
    print(f"Mean absolute error for {item}: {mae}")
    print(f"Root mean square error for {item}: {rmse}")

    # Forecasting the next day.
    # All calendar features are derived from the actual next date. The previous
    # hand-rolled increments left week_of_year unchanged, only updated month on
    # a year rollover, and treated day 366 of a leap year as a rollover.
    next_date = X.index[-1] + pd.Timedelta(days=1)
    last_day_features = X.iloc[-1:].copy()  # copy keeps column order and dtypes
    last_day_features['day_of_week'] = next_date.dayofweek
    last_day_features['month'] = next_date.month
    last_day_features['year'] = next_date.year
    last_day_features['day_of_year'] = next_date.dayofyear
    # isocalendar() -> (ISO year, ISO week, ISO weekday); index 1 is the week,
    # matching the DatetimeIndex.isocalendar().week used for training rows.
    last_day_features['week_of_year'] = int(next_date.isocalendar()[1])

    # Roll the lag window forward by one row: the last observed quantity becomes
    # lag_1 and every older lag moves one slot back. Values are read from the
    # unmodified X, so earlier assignments in this loop do not leak into later ones.
    last_day_features['lag_1'] = y.iloc[-1]
    for i in range(2, 8):
        last_day_features[f'lag_{i}'] = X[f'lag_{i-1}'].iloc[-1]

    next_day_forecast = model.predict(last_day_features)
    print(f'Forecasted quantity for {item} for the next day: {int(round(next_day_forecast[0]))}')

    # Plot feature importances
    feature_importances = pd.Series(model.feature_importances_, index=X.columns)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=feature_importances, y=feature_importances.index, ax=ax)
    ax.set_title(f'Feature importances for {item}')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Features')
    fig.tight_layout()
    fig.savefig(f'feature_importance_{item.replace(" ", "_")}.png')
    plt.close(fig)  # release the figure; one is created per item

    # Plot predictions vs actual
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(y_test.index, y_test.values, label='Actual')
    ax.plot(y_test.index, predictions, label='Forecasted')
    ax.set_title(f'Actual vs Forecasted Sales for {item}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Quantity Sold')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'prediction_vs_actual_{item.replace(" ", "_")}.png')
    plt.close(fig)



Top 5 most sold items  ['Coffee', 'Bread', 'Tea', 'Cake', 'Pastry']

Training model for Coffee
Mean absolute error for Coffee: 4.998387096774194
Root mean square error for Coffee: 6.797072187536385
Forecasted quantity for Coffee for the next day: 29

Training model for Bread
Mean absolute error for Bread: 5.276129032258065
Root mean square error for Bread: 6.3950888414722264
Forecasted quantity for Bread for the next day: 18

Training model for Tea
Mean absolute error for Tea: 2.8545161290322585
Root mean square error for Tea: 3.3575101181602403
Forecasted quantity for Tea for the next day: 8

Training model for Cake
Mean absolute error for Cake: 3.3609677419354838
Root mean square error for Cake: 4.066753877083563
Forecasted quantity for Cake for the next day: 4

Training model for Pastry
Mean absolute error for Pastry: 2.56741935483871
Root mean square error for Pastry: 3.274845970275893
Forecasted quantity for Pastry for the next day: 6
