In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Read the raw grocery listing; the first CSV column becomes the row index.
source_csv = 'OriginalDataset/Australia_Grocery_2022Sep.csv'
aus_groc = pd.read_csv(source_csv, index_col=0)

# Price randomizer over geolocation

In [None]:
# Prepare data for synthesis: keep only the columns the price synthesis uses,
# ordered by SKU and then city, with a fresh 0..n-1 index.
prepare_syn = (
    aus_groc[['Sku', 'city', 'state', 'Package_price']]
    .sort_values(['Sku', 'city'])
    .reset_index(drop=True)
)

# Number of rows per SKU, kept under the column name 'city' because that is
# the name the randomizer cell reads.
# size() counts rows, whereas count() silently skips rows whose city is
# missing. dropna=False keeps rows without a Sku as their own (last) group,
# matching where sort_values places them. Together this guarantees the counts
# add up to len(prepare_syn), in the same order as the sorted frame.
group_sku_city = prepare_syn.groupby('Sku', dropna=False).size().to_frame('city')
group_sku_city

In [None]:
# Geolocation price randomizer.
# The first listing of every SKU keeps its price (offset 0); every other
# listing of that SKU is shifted by a random relative offset drawn uniformly
# from [-0.15, 0.15) and rounded to two decimals (i.e. whole percent).
GEO_NOISE_SEED = 2022  # fixed so Restart & Run All reproduces the same prices; use None for fresh noise
geo_rng = np.random.default_rng(GEO_NOISE_SEED)

# One offset per row of prepare_syn, drawn in a single vectorised call, so the
# array length always matches the frame (also when the frame is empty).
big_array = np.around(geo_rng.uniform(-0.15, 0.15, len(prepare_syn)), decimals=2)

# The reference listing of each SKU is its first occurrence in prepare_syn.
is_reference_row = ~prepare_syn['Sku'].duplicated().to_numpy()
big_array[is_reference_row] = 0.0

# Print the offset array's length as a sanity check
print(f"Total length of the concatenated array: {len(big_array)}")

# Apply the relative offsets positionally (big_array is a plain ndarray with
# one entry per row of prepare_syn).
new_price = prepare_syn.Package_price + prepare_syn.Package_price * big_array

In [None]:
# Replace the listed prices with their geolocation-adjusted values and show
# the resulting frame.
prepare_syn = prepare_syn.assign(Package_price=new_price)
prepare_syn

# Price changes over time

In [None]:
# Simulation calendar: one entry per day from start_date to end_date
start_date = pd.Timestamp("2022-11-09")
end_date = pd.Timestamp("2023-12-01")
date_range = pd.date_range(start_date, end_date, freq="D")

# Mean daily relative price change (drift); compounded over 365 days this
# rate gives roughly +5 %.
daily_drift = 0.00013368061711349633  # Account for 5% inflation rate

# Distinct store cities present in the prepared data
store_locations = pd.unique(prepare_syn["city"])

In [None]:
# Simulate a daily price path for every (city, SKU) pair as a multiplicative
# random walk that starts from the pair's current Package_price.
DAILY_VOLATILITY = 0.004  # standard deviation of the daily relative price change
PRICE_WALK_SEED = 2023    # fixed so the export is reproducible; use None for fresh randomness
walk_rng = np.random.default_rng(PRICE_WALK_SEED)


def _simulate_price_path(base_price, n_days, drift, volatility, rng):
    """Return ``n_days`` prices following a multiplicative random walk.

    Parameters
    ----------
    base_price : float
        Price on the first day; returned unchanged as the first entry.
    n_days : int
        Length of the path (must be at least 1).
    drift : float
        Mean of the daily relative change.
    volatility : float
        Standard deviation of the daily relative change.
    rng : numpy.random.Generator
        Source of the normally distributed daily changes.

    Returns
    -------
    numpy.ndarray
        ``price[t] = max(price[t-1] * (1 + change_t), 0)`` with
        ``change_t ~ Normal(drift, volatility)``.
    """
    factors = 1.0 + rng.normal(drift, volatility, size=n_days - 1)
    # A negative factor would push the price below zero; clamping the factor
    # to 0 makes the price hit zero and stay there, as the floor requires.
    np.clip(factors, 0.0, None, out=factors)
    # cumprod multiplies left to right: base, base*f1, base*f1*f2, ...
    path = np.cumprod(np.concatenate(([base_price], factors)))
    # Floor every simulated day at zero; the starting price is left as given.
    np.maximum(path[1:], 0.0, out=path[1:])
    return path


n_days = len(date_range)

# One small frame per (city, SKU) pair; combined once after the loop.
# Only names local to this cell are assigned inside the loop, so the
# notebook-level `new_price` series from the randomizer cell stays intact.
synthetic_frames = []

for location in store_locations:
    # Rows listed in this city
    location_data = prepare_syn[prepare_syn["city"] == location]

    # Visit each SKU once, in order of first appearance, without re-filtering
    # the city's rows for every SKU.
    for sku, sku_data in location_data.groupby("Sku", sort=False):
        # Base price to start the simulation: the SKU's first listed price
        base_price = sku_data["Package_price"].iloc[0]

        synthetic_frames.append(pd.DataFrame({
            "Sku": sku,
            "city": location,
            "Package_price": _simulate_price_path(
                base_price, n_days, daily_drift, DAILY_VOLATILITY, walk_rng
            ),
            "RunDate": date_range,
        }))

# Concatenate all synthetic data into a single data frame. pd.concat raises on
# an empty list, so fall back to an empty frame with the expected columns.
if synthetic_frames:
    synthetic_data_location = pd.concat(synthetic_frames, ignore_index=True)
else:
    synthetic_data_location = pd.DataFrame(
        columns=["Sku", "city", "Package_price", "RunDate"]
    )


# Exporting to CSV

In [None]:
# Write the synthetic dataset to disk. The output folder is created first so
# the export cannot fail (and discard the simulation) when it does not exist.
output_csv = Path('SynDataset/syn_data_loc_time.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)
synthetic_data_location.to_csv(output_csv)