In [15]:
import os
import pandas as pd
import openpyxl
from datetime import datetime

# Folder (relative to the notebook's working directory) that is scanned for workbooks
sales_mix_directory = "sales_mix"

# Accumulator filled by the ingestion loop; starts out empty
date_product_data = dict()

In [16]:
# Exact product names that are skipped during ingestion. Built once here rather
# than for every worksheet row, so the name exists even when no data row is read
# (a later cell also uses this list to drop rows from the DataFrame).
names_to_remove = [
    "Total", "Smoothie", "Classic", "Hi Protein", "Spirit", "Superfood Plus",
    "Superfood", "Refresh", "Combo", "Regular Combo", "Snack Combo",
    "Ingredients (Smoothie)", "NO BOOSTER", "No Froyo", "No Yogurt",
    "Secret/Feature", "Specialty", "Shot", "Condiments (Booster Ball)",
    "Booster Ball", "Booster Blends", "Grilled Fresh", "Lunch", "Breakfast",
    "Merchandise", "Retail", "Fresh Juice", "Condiments (Fresh Juice)",
    "Instructions", "Condiments (Inst.)", "ALLERGY: tree nuts",
    "No Raspberry", "Special Prep", "whole wheat", "Allergy: ", "Split In 2 Cups",
    "Don't Make", "Almighty Acai Blend", "Condiments (Retail/Merchandise)",
    "Chipotle",
]

# Lower-cased substrings; a product whose lower-cased name contains any of them
# is skipped.
# NOTE(review): 'no' and 'add' match anywhere inside a name, not just as whole
# words -- confirm that this broad match is intended.
excluded_substrings = ['allergy', 'combo', '$', 'add', 'no']

# Process each file
for file_name in os.listdir(sales_mix_directory):
    if not file_name.endswith('.xlsx'):
        continue

    file_path = os.path.join(sales_mix_directory, file_name)

    # Load the workbook (cached values instead of formulas) and take the active sheet
    wb = openpyxl.load_workbook(file_path, data_only=True)
    ws = wb.active

    # Find the header row: the first row containing both 'Name' and 'Quantity Sold'.
    header_row_idx = None
    for row_number, row in enumerate(ws.iter_rows(values_only=True), start=1):
        if 'Name' in row and 'Quantity Sold' in row:
            header_row_idx = row_number
            # Positions are taken from the full row tuple. Looking them up in a
            # list with the empty cells removed would shift the indices whenever
            # a blank cell sits to the left of either header, and the data rows
            # below would then be read from the wrong columns.
            name_idx = row.index('Name')
            quantity_sold_idx = row.index('Quantity Sold')
            break

    if header_row_idx is None:
        raise ValueError(f"'Name' and 'Quantity Sold' columns not found in the file: {file_name}")

    # Date label = 3rd and 4th space-separated tokens of the file name,
    # with any ' - Copy' marker stripped.
    date_part = ' '.join(file_name.split(' ')[2:4]).replace(' - Copy', '')

    # Per-day totals; created on first sight of this date label
    daily_totals = date_product_data.setdefault(date_part, {})

    # Accumulate the quantity of every product listed below the header row
    for row in ws.iter_rows(min_row=header_row_idx + 1, values_only=True):
        product_name = row[name_idx]

        # Blank rows (None) and non-text cells cannot be product names; skipping
        # them avoids calling .lower() on a non-string.
        if not isinstance(product_name, str):
            continue

        quantity_sold = row[quantity_sold_idx] or 0  # Use 0 if None

        lowered_name = product_name.lower()
        if any(pattern in lowered_name for pattern in excluded_substrings):
            continue
        if product_name in names_to_remove:
            continue

        # Add the quantity sold to the running total for this date and product
        daily_totals[product_name] = daily_totals.get(product_name, 0) + quantity_sold


  warn("Workbook contains no default style, apply openpyxl's default")


In [48]:
# Build a DataFrame from the nested dictionary: outer keys become the columns,
# inner keys become the row labels.
product_sales_df = pd.DataFrame.from_dict(
    data=date_product_data,
    orient='columns',
)

In [49]:
# Remove rows whose label is one of the unwanted product names (labels that are
# absent are ignored), then swap rows and columns.
rows_kept = product_sales_df.drop(index=names_to_remove, errors='ignore')
product_sales_df = rows_kept.T



In [50]:
# Parse the text labels as dates by appending a year and using an explicit format.
# The guard keeps the cell re-runnable: once the index is already a DatetimeIndex,
# concatenating ', 2024' onto it would raise instead of parsing.
if not isinstance(product_sales_df.index, pd.DatetimeIndex):
    product_sales_df.index = pd.to_datetime(product_sales_df.index + ', 2024', format='%b %d, %Y')


In [55]:
def add_correct_year(date_str, december_year=2023, other_year=2024):
    """Return the given date with its year replaced according to its month.

    Parameters
    ----------
    date_str : anything accepted by ``pd.to_datetime`` for a single value
        The date to adjust (a string or an existing timestamp).
    december_year : int, default 2023
        Year assigned to dates that fall in December.
    other_year : int, default 2024
        Year assigned to dates in every other month.

    Returns
    -------
    The parsed date with only its year changed; month, day and time are kept.
    """
    date = pd.to_datetime(date_str)
    # December dates get one year, everything else the other.
    target_year = december_year if date.month == 12 else other_year
    return date.replace(year=target_year)

# Coerce the index to datetime, then rewrite each entry's year
index_as_dates = pd.to_datetime(product_sales_df.index)
product_sales_df.index = index_as_dates.map(add_correct_year)

# Order the rows chronologically (in place)
product_sales_df.sort_index(inplace=True)

# Show the sorted frame
print(product_sales_df)


            Nuttin' Better - R  Banana's A Whey - R  Ripped Berry - R  \
2023-12-28                 5.0                 12.0              11.0   
2023-12-29                17.0                 10.0               7.0   
2023-12-30                 8.0                  9.0               8.0   
2023-12-31                 9.0                  4.0               7.0   
2024-01-01                 8.0                  8.0              11.0   
2024-01-02                11.0                  7.0               7.0   
2024-01-03                11.0                  7.0              12.0   
2024-01-04                15.0                 12.0              14.0   
2024-01-05                 8.0                  9.0              10.0   
2024-01-06                 6.0                  9.0               5.0   
2024-01-07                 9.0                  4.0               7.0   
2024-01-08                11.0                  8.0              15.0   
2024-01-09                20.0                  8.0

In [56]:
# Save the DataFrame as CSV, writing the index under a 'Date' column header
output_excel_path = 'sales_mix_clean_open/combined_sales_data.csv'
# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(output_excel_path), exist_ok=True)
product_sales_df.to_csv(output_excel_path, index_label='Date')

In [57]:
# NOTE(review): this repeats a definition that already appears earlier in the
# notebook; this later one is what the name refers to from here on. Keep the two
# identical or remove one of them.
def add_correct_year(date_str, december_year=2023, other_year=2024):
    """Return the given date with its year replaced according to its month.

    Dates in December receive ``december_year`` (default 2023); dates in any
    other month receive ``other_year`` (default 2024). ``date_str`` may be
    anything ``pd.to_datetime`` accepts for a single value. Month, day and
    time of the parsed date are left unchanged.
    """
    date = pd.to_datetime(date_str)
    if date.month == 12:
        return date.replace(year=december_year)
    return date.replace(year=other_year)

# Make sure the index holds datetimes, then map every entry through add_correct_year
datetime_labels = pd.to_datetime(product_sales_df.index)
product_sales_df.index = datetime_labels.map(add_correct_year)

# Sort rows by date, modifying the frame in place
product_sales_df.sort_index(inplace=True)

# Print the resulting frame
print(product_sales_df)


            Nuttin' Better - R  Banana's A Whey - R  Ripped Berry - R  \
2023-12-28                 5.0                 12.0              11.0   
2023-12-29                17.0                 10.0               7.0   
2023-12-30                 8.0                  9.0               8.0   
2023-12-31                 9.0                  4.0               7.0   
2024-01-01                 8.0                  8.0              11.0   
2024-01-02                11.0                  7.0               7.0   
2024-01-03                11.0                  7.0              12.0   
2024-01-04                15.0                 12.0              14.0   
2024-01-05                 8.0                  9.0              10.0   
2024-01-06                 6.0                  9.0               5.0   
2024-01-07                 9.0                  4.0               7.0   
2024-01-08                11.0                  8.0              15.0   
2024-01-09                20.0                  8.0

In [52]:
# Serialise the DataFrame to a JSON string keyed by column, with ISO-formatted dates
json_result = product_sales_df.to_json(orient='columns', date_format='iso')

# Write the JSON output to a file
json_output_path = 'sales_mix_clean_open/combined_sales_data.json'
# open() does not create missing folders, so make sure the target directory exists
os.makedirs(os.path.dirname(json_output_path), exist_ok=True)
with open(json_output_path, 'w') as json_file:
    json_file.write(json_result)

In [53]:
# Report the dtype of the index, a separator line, then the per-column dtypes
index_dtype = product_sales_df.index.dtype
column_dtypes = product_sales_df.head().dtypes

print(index_dtype)
print("_________________________")
print(column_dtypes)




datetime64[ns]
_________________________
Nuttin' Better - R               float64
Banana's A Whey - R              float64
Ripped Berry - R                 float64
Strawberry Storm - R             float64
High Impact Acai - R             float64
                                  ...   
Chipotle Steak Panini            float64
Wildberry Rush - S               float64
Go Mango - R                     float64
Canadian Maple - Booster Ball    float64
Grilled Cheese                   float64
Length: 92, dtype: object
