In [32]:
import os
import pandas as pd
import openpyxl
from datetime import datetime

# Folder that is scanned for the sales-mix .xlsx workbooks
sales_mix_directory = 'sales_mix'

# Accumulator filled by the processing loop: {date key -> {product name -> total quantity}}
date_product_data = dict()

In [34]:
# Process each file
# Exact names that are excluded from the product totals. Defined once, before
# the loop, so it is not rebuilt for every row and so it exists even when the
# directory contains no .xlsx files (a later cell reads it).
names_to_remove = [
    "Total", "Smoothie", "Classic", "Hi Protein", "Spirit", "Superfood Plus",
    "Superfood", "Refresh", "Combo", "Regular Combo", "Snack Combo",
    "Ingredients (Smoothie)", "NO BOOSTER", "No Froyo", "No Yogurt",
    "Secret/Feature", "Specialty", "Shot", "Condiments (Booster Ball)",
    "Booster Ball", "Booster Blends", "Grilled Fresh", "Lunch", "Breakfast",
    "Merchandise", "Retail", "Fresh Juice", "Condiments (Fresh Juice)",
    "Instructions", "Condiments (Inst.)", "ALLERGY: tree nuts",
    "No Raspberry", "Special Prep", "whole wheat", "Allergy: ",
    "Split In 2 Cups", "Don't Make", "Almighty Acai Blend",
    "Condiments (Retail/Merchandise)", "Chipotle",
]

# Substrings that, when found anywhere in a name (case-insensitively), cause
# the row to be skipped.
# NOTE(review): 'add' and 'No' are plain substring matches, so they also drop
# any product whose name merely contains those letters - confirm this is intended.
skip_patterns = [pattern.lower() for pattern in ['allergy', 'combo', '$', 'add', 'No']]

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which date keys are first inserted (and so the output row order) reproducible.
for file_name in sorted(os.listdir(sales_mix_directory)):
    if not file_name.endswith('.xlsx'):
        continue
    file_path = os.path.join(sales_mix_directory, file_name)

    # Load the workbook (cached values instead of formulas) and take the active sheet
    wb = openpyxl.load_workbook(file_path, data_only=True)
    ws = wb.active

    # Find the header row. 'Name' and 'Quantity Sold' may be in any column.
    header_row_idx = None
    for rowIndex, row in enumerate(ws.iter_rows(values_only=True), start=1):
        if 'Name' in row and 'Quantity Sold' in row:
            header_row_idx = rowIndex
            # Take the positions from the unfiltered row: the data rows below
            # are indexed with these values, so dropping empty cells first
            # would shift them whenever a blank cell precedes a header.
            name_idx = row.index('Name')
            quantity_sold_idx = row.index('Quantity Sold')
            break

    if header_row_idx is None:
        raise ValueError(f"'Name' and 'Quantity Sold' columns not found in the file: {file_name}")

    # Parse the date key from the filename (3rd and 4th space-separated tokens)
    date_part = ' '.join(file_name.split(' ')[2:4]).replace(' - Copy', '')

    # Per-date totals; created on first sight of this date key
    day_totals = date_product_data.setdefault(date_part, {})

    # Accumulate the quantity for each product row below the header
    for row in ws.iter_rows(min_row=header_row_idx + 1, values_only=True):
        product_name = row[name_idx]

        # Blank rows yield None (or empty text) here; there is no product to
        # record, and calling .lower() on None would raise AttributeError.
        if product_name is None or not str(product_name).strip():
            continue
        product_name = str(product_name)

        quantity_sold = row[quantity_sold_idx] or 0  # Use 0 if None

        lowered_name = product_name.lower()
        if any(pattern in lowered_name for pattern in skip_patterns):
            continue
        if product_name in names_to_remove:
            continue

        # Add the quantity sold to the running total for this date
        day_totals[product_name] = day_totals.get(product_name, 0) + quantity_sold



In [35]:
# Build a DataFrame from the nested dict: each outer key (date) becomes a
# column and each inner key (product name) becomes a row label.
# No transpose happens in this cell.
product_sales_df = pd.DataFrame(date_product_data)


In [36]:
# Build the final table: drop unwanted product rows, then transpose so that
# rows are dates and columns are products.
# The frame is rebuilt from date_product_data instead of being transformed in
# place, so re-running this cell on its own cannot transpose the table a
# second time and silently flip its orientation.
product_sales_df = (
    pd.DataFrame.from_dict(date_product_data, orient='columns')
    .drop(index=names_to_remove, errors='ignore')
    .transpose()
)



In [38]:
# Now save the transposed DataFrame
output_excel_path = 'sales_mix_clean_open/combined_sales_data.xlsx'

# Make sure the output folder exists first; writing into a missing directory
# fails, which would break a run on a fresh checkout.
os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)

# The index (date keys) is written as the first column, labelled 'Date'
product_sales_df.to_excel(output_excel_path, index_label='Date')


In [39]:
# Convert the DataFrame to a JSON string keyed by its index labels
# (orient='index'). After the transpose in the earlier cell the index holds
# the date keys, so the JSON is keyed by date, not by product name.
output_json = product_sales_df.to_json(orient='index')

In [40]:
# Save the JSON to a file
output_json_path = 'sales_mix_clean_open/combined_sales_data.json'

# Create the output folder if it is missing so this cell does not depend on
# another cell (or a manual step) having created it.
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

# Explicit encoding keeps the written bytes independent of the platform default
with open(output_json_path, 'w', encoding='utf-8') as f:
    f.write(output_json)

# Output the path of the new JSON file
output_json_path

'sales_mix_clean_open/combined_sales_data.json'