In [3]:
import pandas as pd
import numpy as np      
from datetime import timedelta
import random
import os

In [4]:
df_0 = pd.read_csv(r'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\201904_sales_reciepts.csv')

# Convert the 'transaction_date' from string to datetime
df_0['transaction_date'] = pd.to_datetime(df_0['transaction_date'])

# Subtract 5 months from the 'transaction_date'
df_0['transaction_date'] = df_0['transaction_date'] - pd.DateOffset(months=4)

sales_data = df_0[[
    'transaction_date',
    'quantity',
    'product_id',
]]

sales_data

Unnamed: 0,transaction_date,quantity,product_id
0,2018-12-01,1,52
1,2018-12-01,2,27
2,2018-12-01,2,46
3,2018-12-01,2,23
4,2018-12-01,1,34
...,...,...,...
49889,2018-12-29,1,30
49890,2018-12-29,2,25
49891,2018-12-29,1,31
49892,2018-12-29,2,44


In [5]:
# Mapping based on OCR results, corrected for obvious OCR mistakes
product_mapping = {
    87: 'Ouro Brasileiro Shot',
    34: 'Jamaican Coffee', 35: 'Jamaican Coffee', 36: 'Jamaican Coffee',
    44: 'Peppermint Tea', 45: 'Peppermint Tea',
    50: 'Earl Grey Tea', 51: 'Earl Grey Tea',
    40: 'Cappuccino', 41: 'Cappuccino'
}

# Assuming 'sales_data' is your pandas DataFrame containing the sales data
# You would first filter the DataFrame for rows where product_id is in the mapping
filtered_sales_data = sales_data[sales_data['product_id'].isin(product_mapping.keys())]

# Then you would replace the product_id with the corresponding product name
filtered_sales_data['product_name'] = filtered_sales_data['product_id'].map(product_mapping)

# Now drop the original 'product_id' column as it's no longer needed
filtered_sales_data = filtered_sales_data.drop('product_id', axis=1)

# This resulting dataframe 'filtered_sales_data' will have the 'product_name' column
# with product names instead of IDs and will only contain the filtered IDs.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sales_data['product_name'] = filtered_sales_data['product_id'].map(product_mapping)


In [6]:
df_agg = filtered_sales_data.groupby(['product_name','transaction_date']).sum()['quantity'].reset_index()
dfs = []
products = (df_agg['product_name'].unique()).tolist()
for product in products:
    temp = df_agg[df_agg['product_name'] == product]
    dfs.append(temp)


In [7]:
#SIMULATION

# Function to calculate weekly seasonality pattern for a given dataframe
def calculate_weekly_pattern(df):
    data = df.copy()
    data['transaction_date'] = pd.to_datetime(data['transaction_date'])
    data['day_of_week'] = data['transaction_date'].dt.dayofweek
    weekly_pattern = data.groupby('day_of_week')['quantity'].mean()
    
    # Normalize the pattern to make the average across the week equal to 1
    weekly_pattern = weekly_pattern / weekly_pattern.mean()
    
    return weekly_pattern.tolist()

def simulate_sales_data(df, base_sales=50, growth_factor=1.5, randomness_scale=0.15, spike_probability=0.1, spike_scale=0.32):
    data = df.copy()  # Work with a copy to avoid modifying the original DataFrame
    
    # Convert 'transaction_date' to datetime
    data['transaction_date'] = pd.to_datetime(data['transaction_date'])
    
    # Calculate the actual weekly seasonality pattern from the original data
    weekly_pattern = calculate_weekly_pattern(data)
    
    # Generate dates for the next 6 months
    last_date = data['transaction_date'].max()
    dates = pd.date_range(start=last_date + timedelta(days=1), periods=6*30, freq='D')
    
    # Prepare a dataframe for simulated data
    simulated_data = pd.DataFrame({'transaction_date': dates})
    
    # Extend the actual weekly pattern across our date range
    seasonal_multipliers = np.tile(weekly_pattern, len(dates) // 7 + 1)[:len(dates)]

    # Apply a linear growth trend across the 6 months
    linear_growth_trend = np.linspace(1, growth_factor, len(dates))
    
    # Combine base sales, seasonal pattern, and linear growth trend
    simulated_sales = base_sales * seasonal_multipliers * linear_growth_trend
    
    # Add day-to-day random fluctuations
    daily_fluctuation = np.random.normal(1, randomness_scale, len(dates))
    simulated_sales *= daily_fluctuation
    
    # Introduce random spikes or drops in sales
    for i in range(len(simulated_sales)):
        if random.random() < spike_probability:  # Chance of a random spike/drop occurring
            simulated_sales[i] *= np.random.normal(1, spike_scale)
    
    # Add simulated sales to the dataframe
    simulated_data['quantity'] = simulated_sales
    simulated_data['quantity'] = simulated_data['quantity'].round().astype(int)
    
    return simulated_data  # Return the simulated DataFrame

# Example usage with an existing dataframe to simulate additional data
simulated_dfs = [simulate_sales_data(df) for df in dfs]


In [8]:
# Iterate over the list of dataframes and their corresponding product IDs
output_dir = r'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders'

for df, products in zip(simulated_dfs, products):
    # Define the path for the CSV file
    file_path = os.path.join(output_dir, f'simulated_orders_{products}.csv')
    
    # Save the DataFrame to a CSV file
    df.to_csv(file_path, index=False)
        
    print(f"All files have been saved in the '{output_dir}' directory.")

All files have been saved in the 'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders' directory.
All files have been saved in the 'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders' directory.
All files have been saved in the 'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders' directory.
All files have been saved in the 'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders' directory.
All files have been saved in the 'C:\Users\Agishan\OneDrive - University of Waterloo\Getrid\334\Final\SimulatedOrders' directory.
