In [None]:
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import os



In [None]:
# List the raw Excel exports so the conversion input is visible in the cell output.
raw_files = os.listdir('data/rawpo/xlsx')
display(raw_files)

def convert_xlsx_to_csv(directory):
    """
    Write a .csv sibling for every .xlsx workbook found directly in ``directory``.

    A workbook whose .csv sibling already exists is left alone. A workbook that
    cannot be read or written is reported on stdout and the loop moves on to the
    next one, so one bad file does not stop the batch.

    Args:
        directory (str): Folder that holds the .xlsx files (not searched recursively).

    Returns:
        None. Progress is reported through print().
    """
    workbooks = [entry for entry in Path(directory).glob('*.xlsx')]

    if len(workbooks) == 0:
        print(f"No .xlsx files found in {directory}")
        return

    print(f"Found {len(workbooks)} .xlsx files to process...")

    for workbook in workbooks:
        # Same folder, same stem, .csv extension
        target = workbook.with_suffix('.csv')

        if not target.exists():
            try:
                print(f"\n===========================================Converting {workbook.name} to {target.name}...")
                # pd.read_excel is called without sheet_name, so pandas' default sheet selection applies
                pd.read_excel(workbook).to_csv(target, index=False, encoding='utf-8')
                print(f"Successfully created {target.name}")
            except Exception as e:
                # Best effort: report and continue with the remaining workbooks
                print(f"Error processing {workbook.name}: {str(e)}")
        else:
            print(f"Skipping {workbook.name} - {target.name} already exists")

    print("Conversion complete!")

# Run the conversion for the raw PO exports; each .csv is written next to its .xlsx source.
convert_xlsx_to_csv('data/rawpo/xlsx')

In [None]:
# Load the Padang PO export: a semicolon-separated CSV that uses decimal commas.
ori_df = pd.read_csv('data/rawpo/01 Miss Glam Padang.csv', sep=';', decimal=',')

# Work on a copy so the raw frame stays available, and switch to the English column name.
# (Columns that read_csv already parsed as numbers need no further conversion here;
# text columns are coerced later, right before the calculations.)
df = ori_df.copy()
df = df.rename(columns={'Stok': 'Stock'})

pd.set_option('display.max_columns', None)

# Inspect the full schema before narrowing. DataFrame.info() prints its report
# itself and returns None, so it is not wrapped in display().
df.info()

# extract only the columns we need
df = df[['Brand', 'SKU', 'Nama', 'Stock', 'Daily Sales', 'Max. Daily Sales', 'Lead Time', 'Max. Lead Time', 'Sedang PO', 'Min. Order', 'HPP']]

display(df)

# contribution dictionary for each store location
contribution_dict = {
    'payakumbuh': 0.47,
}

In [None]:
# Supplier master data, used further down to map a brand/SKU to a specific supplier.
# Missing cells are replaced by '' in the same step as the load.
raw_supplier_df = pd.read_csv('data/supplier.csv', sep=';', decimal=',').fillna('')

display(raw_supplier_df)

In [None]:
# Columns that must be numeric for the supply-chain calculations.
# 'Sedang PO', 'Min. Order' and 'HPP' are included because the metrics cell uses
# them in subtraction, comparison and multiplication; left as text they would
# break (or silently corrupt) those expressions.
numeric_columns = [
    'Stock', 'Daily Sales', 'Lead Time', 'Max. Daily Sales', 'Max. Lead Time',
    'Sedang PO', 'Min. Order', 'HPP',
]

df_clean = df.copy()
display('Raw DataFrame: ', df)

# Coerce the calculation columns to numbers; anything unparsable becomes NaN
for col in numeric_columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Replace every remaining NaN (in any column) with 0
df_clean = df_clean.fillna(0)

# Restore the identifier columns from the source frame so that missing values
# there are not turned into the number 0 by the fillna above
non_numeric_columns = ['Brand', 'SKU']  # Add other non-numeric columns if needed
for col in non_numeric_columns:
    df_clean[col] = df[col]  # Keep original values

# Lead time (in days) assumed for stock that is already on order ('Sedang PO')
df_clean['Lead Time Sedang PO'] = 5

# Display the cleaned DataFrame
print("Cleaned DataFrame:")
display(df_clean)

# Show info of the cleaned DataFrame
print("\nDataFrame Info: Expected to have maximal non-null values...")
df_clean.info()

### Add Supply Chain params for AutoPO

- Safety stock = (max sales x max lead time) - (avg sales x avg lead time)
- Reorder point = (avg sales x avg lead time) + safety stock
- Stock cover (in qty, for 30 days) = avg sales x 30
- RoP_Reference (1 -> RoP > Stock cover days, 0 -> RoP < Stock cover days)
- Current stock days cover = Current stock / avg sales
- Is_open_po (1 -> Current stock days cover <= 30 and Current stock <= Reorder point, 0 -> otherwise)
- Initial_Qty_PO = Stock cover 30 days - Current stock - Sedang PO (only when Is_open_po = 1, never below 0)

- Is_emergency_PO: 1 -> Current stock days cover <= max lead time

- Emergency_PO_Qty = (max lead time - Current stock days cover) x Avg sales

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)

# 'Sedang PO' (quantity already on order) feeds both the regular and the
# emergency PO formulas, so make sure the column exists before its first use.
if 'Sedang PO' not in df_clean.columns:
    df_clean['Sedang PO'] = 0  # Default to 0 if column doesn't exist

# 1. Safety stock = (max sales x max lead time) - (avg sales x avg lead time), rounded up
df_clean['Safety stock'] = (df_clean['Max. Daily Sales'] * df_clean['Max. Lead Time']) - (df_clean['Daily Sales'] * df_clean['Lead Time'])
df_clean['Safety stock'] = np.ceil(df_clean['Safety stock']).astype(int)

# 2. Reorder point = (avg sales x avg lead time) + safety stock
df_clean['Reorder point'] = np.ceil((df_clean['Daily Sales'] * df_clean['Lead Time']) +
                                   df_clean['Safety stock']).astype(int)

# 3. Stock cover (in Qty) for 30 days = avg sales x 30, rounded up
df_clean['Stock cover 30 days'] = np.ceil(df_clean['Daily Sales'] * 30).astype(int)

# 4. Current stock days cover (in days) = Current stock / avg sales
# Zero daily sales yields inf (or NaN for 0/0); those values are zeroed at the end of this cell.
df_clean['current_stock_days_cover'] = df_clean['Stock'].astype(float) / df_clean['Daily Sales'].astype(float)

# 5. is_open_po: 1 when the stock covers at most 30 days AND is at or below the reorder point
df_clean['is_open_po'] = np.where((df_clean['current_stock_days_cover'] <= 30) & (df_clean['Stock'] <= df_clean['Reorder point']), 1, 0)

# 6. initial_qty_po = Stock cover 30 days - Current stock - Sedang PO (only for open POs, never negative)
df_clean['initial_qty_po'] = df_clean['Stock cover 30 days'] - df_clean['Stock'] - df_clean['Sedang PO']
df_clean['initial_qty_po'] = np.where(df_clean['is_open_po'] == 1, df_clean['initial_qty_po'], 0)
# The element-wise form also maps NaN to 0 (NaN > 0 is False), which clip() would not do
df_clean['initial_qty_po'] = df_clean['initial_qty_po'].apply(lambda x: x if x > 0 else 0).astype(int)

# 7. emergency_po_qty
#    - something is already on order: cover only the gap until that order arrives
#      (Lead Time Sedang PO - current days cover) x avg sales, floored at 0
#    - otherwise: (max lead time - current days cover) x avg sales, rounded up
df_clean['emergency_po_qty'] = np.where(
    df_clean['Sedang PO'] > 0,  # If there is 'Sedang PO' quantity
    np.maximum(0, (df_clean['Lead Time Sedang PO'] - df_clean['current_stock_days_cover']) *
              df_clean['Daily Sales']),
    # Else use the original formula
    np.ceil((df_clean['Max. Lead Time'] - df_clean['current_stock_days_cover']) *
            df_clean['Daily Sales'])
)

# Infinite / NaN results (from zero daily sales) become 0 so the integer cast cannot fail
df_clean['emergency_po_qty'] = (
    df_clean['emergency_po_qty']
    .replace([np.inf, -np.inf], 0)  # Replace infinities with 0
    .fillna(0)                      # Fill any remaining NaNs with 0
    .astype(int)                    # Now safely convert to integers
)

# A quantity can never be negative
df_clean['emergency_po_qty'] = df_clean['emergency_po_qty'].clip(lower=0)

# 8. Regular PO quantity once the emergency part has been taken out (never negative)
df_clean['updated_regular_po_qty'] = df_clean['initial_qty_po'] - df_clean['emergency_po_qty']
df_clean['updated_regular_po_qty'] = df_clean['updated_regular_po_qty'].apply(lambda x: x if x > 0 else 0).astype(int)

# 9. Final regular PO: a non-zero quantity below Min. Order is raised to Min. Order
df_clean['final_updated_regular_po_qty'] = np.where((df_clean['updated_regular_po_qty'] > 0) & (df_clean['updated_regular_po_qty'] < df_clean['Min. Order']), df_clean['Min. Order'], df_clean['updated_regular_po_qty'])


# 10. Total cost (HPP * qty) for the emergency PO and the final regular PO
df_clean['total_cost_emergency_po'] = df_clean['emergency_po_qty'] * df_clean['HPP']
df_clean['total_cost_final_updated_regular_po'] = df_clean['final_updated_regular_po_qty'] * df_clean['HPP']

# Handle any NaN or infinite values by replacing them with 0
df_clean = df_clean.fillna(0)
df_clean = df_clean.replace([np.inf, -np.inf], 0)

# Display the updated DataFrame with new columns
df_clean

In [None]:
# Destination of the metrics table
csv_path = 'output/result.csv'

# The folder is created on first run; later runs reuse it
os.makedirs('output', exist_ok=True)

# Semicolon-separated, UTF-8 with BOM
df_clean.to_csv(csv_path, index=False, sep=';', encoding='utf-8-sig')
print(f"CSV file saved to: {csv_path}")

### Mapping Brand and SKU with supplier (add supplier column)

In [None]:
import pandas as pd
import numpy as np

# Attach supplier information to every PO row, preferring the supplier rows
# registered for the store 'Miss Glam Padang' and falling back to the first
# supplier row of any other store for brands that have no Padang entry.

# Make copies to avoid modifying originals
df_clean_trimmed = df_clean.copy()
raw_supplier_trimmed = raw_supplier_df.copy()

# Trim whitespace from brand names so the join keys compare equal
df_clean_trimmed['Brand'] = df_clean_trimmed['Brand'].str.strip()
raw_supplier_trimmed['Nama Brand'] = raw_supplier_trimmed['Nama Brand'].str.strip()

# First, get all Padang suppliers
padang_suppliers = raw_supplier_trimmed[
    raw_supplier_trimmed['Nama Store'] == 'Miss Glam Padang'
]

# Then get all other suppliers (non-Padang)
other_suppliers = raw_supplier_trimmed[
    raw_supplier_trimmed['Nama Store'] != 'Miss Glam Padang'
]

# Step 1: Left join with Padang suppliers first (priority)
# NOTE(review): padang_suppliers is not de-duplicated per brand, so a brand with
# several Padang supplier rows yields several output rows per PO line; the
# row counts printed below make that visible.
merged_df = pd.merge(
    df_clean_trimmed,
    padang_suppliers,
    left_on='Brand',
    right_on='Nama Brand',
    how='left',
    suffixes=('_clean', '_supplier')
)

# Step 2: For rows without Padang supplier, try to find other suppliers
# Get the indices of rows that didn't get a match with Padang suppliers
# (after a left join the right-hand key 'Nama Brand' is NaN for unmatched rows)
no_padang_match = merged_df[merged_df['Nama Brand'].isna()].index

if len(no_padang_match) > 0:
    # Get the brands that need non-Padang suppliers
    brands_needing_suppliers = merged_df.loc[no_padang_match, 'Brand'].unique()
    
    # Get the first matching supplier for each brand (you can change this logic if needed)
    first_supplier_per_brand = other_suppliers.drop_duplicates(subset='Nama Brand')
    
    # Update the rows that didn't have Padang suppliers
    for brand in brands_needing_suppliers:
        supplier_data = first_supplier_per_brand[first_supplier_per_brand['Nama Brand'] == brand]
        if not supplier_data.empty:
            # Only rows of this brand that are still unmatched are touched
            brand_mask = (merged_df['Brand'] == brand) & (merged_df['Nama Brand'].isna())
            # Copy every supplier field of the chosen row into those PO rows
            for col in supplier_data.columns:
                if col in merged_df.columns and col != 'Brand':  # Don't overwrite the Brand column
                    merged_df.loc[brand_mask, col] = supplier_data[col].values[0]

# Clean up: For any remaining NaN values in supplier columns, fill with empty string or as needed
supplier_columns = [
    'ID Supplier', 'Nama Supplier', 'ID Brand', 'ID Store', 
    'Nama Store', 'Hari Order', 'Min. Purchase', 'Trading Term',
    'Promo Factor', 'Delay Factor'
]

# Text columns get '', every other dtype gets 0
for col in supplier_columns:
    if col in merged_df.columns:
        if merged_df[col].dtype == 'object':
            merged_df[col] = merged_df[col].fillna('')
        else:
            merged_df[col] = merged_df[col].fillna(0)

# Show summary
print(f"Total rows in df_clean: {len(df_clean_trimmed)}")
print(f"Total rows after merge: {len(merged_df)}")

# Count how many rows got Padang suppliers vs other suppliers vs no suppliers
# (rows without any supplier have 'Nama Store' == '' after the fill above,
# provided that column ended up with object dtype)
padang_count = (merged_df['Nama Store'] == 'Miss Glam Padang').sum()
other_supplier_count = ((merged_df['Nama Store'] != 'Miss Glam Padang') & 
                       (merged_df['Nama Store'] != '')).sum()
no_supplier = (merged_df['Nama Store'] == '').sum()

print(f"\nSuppliers matched:")
print(f"- 'Miss Glam Padang' suppliers: {padang_count} rows")
print(f"- Other suppliers: {other_supplier_count} rows")
print(f"- No supplier data: {no_supplier} rows")

# Save the result
os.makedirs('output', exist_ok=True)
output_path = 'output/merged_with_suppliers.csv'
merged_df.to_csv(output_path, index=False, sep=';', encoding='utf-8-sig')
print(f"\nResults saved to: {output_path}")

# Show a sample of the results
print("\nSample of merged data (first 5 rows):")
display(merged_df.head())

In [None]:
# Diagnostic cell: (1) find brand/SKU pairs that map to more than one supplier,
# (2) look up a hand-picked list of SKUs in the merged result.

# Merge the trimmed PO rows with ALL supplier rows (every store) to see all supplier matches
all_suppliers_merge = pd.merge(
    df_clean_trimmed,
    raw_supplier_trimmed,
    left_on='Brand',
    right_on='Nama Brand',
    how='left'
)

# Group by Brand and SKU to count unique suppliers
supplier_counts = all_suppliers_merge.groupby(['Brand', 'SKU'])['Nama Supplier'].nunique().reset_index()
supplier_counts.columns = ['Brand', 'SKU', 'Supplier_Count']

# Filter for brands/SKUs with multiple suppliers
multi_supplier_items = supplier_counts[supplier_counts['Supplier_Count'] > 1]

print(f"Found {len(multi_supplier_items)} brand/SKU combinations with multiple suppliers")
print("\nSample of items with multiple suppliers:")
display(multi_supplier_items.head())

# If you want to see the actual supplier details for these items
if not multi_supplier_items.empty:
    print("\nDetailed supplier information for multi-supplier items:")
    # Inner join keeps only the rows of the multi-supplier brand/SKU pairs
    multi_supplier_details = all_suppliers_merge.merge(
        multi_supplier_items[['Brand', 'SKU']],
        on=['Brand', 'SKU']
    )
    display(multi_supplier_details[['Brand', 'SKU', 'Nama Supplier', 'Nama Store']].drop_duplicates().sort_values(['Brand', 'SKU']))

# List of SKUs to check (runs regardless of the multi-supplier result above)
skus_to_check = [
    '8995232702124',  # ACNEMED
    '8992821100293',  # ACNES
    '8992821100309',  # ACNES
    '8992821100323',  # ACNES
    '8992821100354'   # ACNES
]

# Convert SKUs to integers (since they appear as integers in df_clean)
# NOTE(review): if the SKU column is text, isin() below finds nothing; the
# fallback branch prints the column dtype to help spot that.
skus_to_check = [int(sku) for sku in skus_to_check]

# Check if these SKUs exist in the merged result (merged_df, derived from df_clean)
found_skus = merged_df[merged_df['SKU'].isin(skus_to_check)]

if not found_skus.empty:
    print("Found matching SKUs in df_clean:")
    display(found_skus[['Brand', 'SKU', 'Nama']])
else:
    print("None of these SKUs were found in df_clean.")
    print("\nChecking if there are any similar SKUs...")
    
    # Look for SKUs whose text contains the first 8 digits of a wanted SKU
    for sku in skus_to_check:
        similar = merged_df[merged_df['SKU'].astype(str).str.contains(str(sku)[:8])]
        if not similar.empty:
            print(f"\nSKUs similar to {sku}:")
            display(similar[['Brand', 'SKU', 'Nama']])
    
    # Check the data types to ensure we're comparing correctly
    print("\nData type of SKU column:", merged_df['SKU'].dtype)
    print("Sample SKUs from df_clean:", merged_df['SKU'].head().tolist())

### Find brands who are missing suppliers

In [None]:
# Find brands in df_clean that don't have a match in raw_supplier_df.
# Brand names are compared after trimming whitespace (the supplier merge trims
# them too), so a brand is not reported as missing only because of stray spaces.
# Missing brands are dropped first: a NaN in the set would make sorted() fail.
po_brand_names = df_clean['Brand'].dropna().astype(str).str.strip()
supplier_brand_names = raw_supplier_df['Nama Brand'].dropna().astype(str).str.strip()

po_brands = set(po_brand_names)
# raw_supplier_df had its NaNs replaced by '' when it was loaded, so the empty
# placeholder is removed explicitly instead of being counted as a brand
supplier_brands = set(supplier_brand_names) - {''}

missing_brands = po_brands - supplier_brands

print(f"Number of brands in df_clean: {len(po_brands)}")
print(f"Number of brands in raw_supplier_df: {len(supplier_brands)}")
print(f"\nNumber of brands missing supplier data: {len(missing_brands)}")
print("\nFirst 20 missing brands (alphabetical order):")
print(sorted(missing_brands)[:20])

# Count how many rows are affected per missing brand (value_counts sorts descending)
missing_brand_counts = po_brand_names[po_brand_names.isin(missing_brands)].value_counts()
print("\nTop 20 missing brands by row count:")
print(missing_brand_counts.head(20))

# Output grouped data per supplier and brand to separate files

In [None]:
import os
import pandas as pd

# Root folder for the per-supplier PO files
output_dir = 'output_po'

# Brands without a supplier are collected in their own sub-folder.
# os.makedirs creates missing parent folders too, so this one call also creates output_dir.
no_supplier_dir = os.path.join(output_dir, '0_no_suppliers')
os.makedirs(no_supplier_dir, exist_ok=True)

# Function to sanitize folder names
def sanitize_folder_name(name):
    """Return ``name`` as text that is safe to use as a file or folder name.

    Each of the characters ``< > : " / \\ | ? *`` is replaced by an underscore
    and surrounding whitespace is removed.

    Args:
        name: The raw name. Non-string values (for example a numeric brand code
            or NaN coming out of a DataFrame) are converted with ``str()``
            instead of raising AttributeError.

    Returns:
        str: The sanitized name.
    """
    invalid_chars = '<>:"/\\|?*'
    replacements = str.maketrans({char: '_' for char in invalid_chars})

    text = name if isinstance(name, str) else str(name)
    return text.translate(replacements).strip()

# Write one CSV per (supplier, brand) group:
#   output_po/<supplier id>_<supplier name>/<brand>.csv
# Groups without a usable supplier go to output_po/0_no_suppliers/<brand>.csv.
# NOTE(review): final_df is not defined in any cell shown above this one; on a
# fresh kernel this cell raises NameError unless another cell defines it first
# — confirm which frame is meant (the merged supplier frame has these columns).
# NOTE(review): groupby drops groups whose key is NaN by default, so the
# pd.isna(supplier_id) test can only matter if that default is changed; an empty
# supplier name is what actually routes a group to the no-supplier folder.
for (supplier_id, supplier_name, brand), group in final_df.groupby(['ID Supplier', 'Nama Supplier', 'Brand']):
    # Skip if no supplier (shouldn't happen as we replaced NaN with defaults)
    if pd.isna(supplier_id) or not supplier_name:
        # Save to no_supplier_dir
        brand_file = os.path.join(no_supplier_dir, f'{sanitize_folder_name(brand)}.csv')
        group.to_csv(brand_file, index=False, sep=';', encoding='utf-8-sig')
        continue
    
    # Create supplier directory
    # int() normalises an ID such as 12.0 to 12; it raises if the ID is not numeric
    supplier_dir = os.path.join(output_dir, f'{int(supplier_id)}_{sanitize_folder_name(supplier_name)}')
    os.makedirs(supplier_dir, exist_ok=True)
    
    # Save brand file (semicolon-separated, UTF-8 with BOM)
    brand_file = os.path.join(supplier_dir, f'{sanitize_folder_name(brand)}.csv')
    group.to_csv(brand_file, index=False, sep=';', encoding='utf-8-sig')

print("Data has been organized into supplier and brand-based folders in 'output_po'")

# Final batch process

In [25]:
# Names of the columns that are treated as numeric by the batch process.
# NOTE(review): the code that consumes this list is not in the visible part of
# the notebook — confirm it matches the headers of the raw PO export.
NUMERIC_COLUMNS = [
    'HPP', 'Harga', 'Ranking', 'Grade', 'Terjual', 'Stok', 'Lost Days',
    'Velocity Capped', 'Daily Sales', 'Lead Time', 'Max. Daily Sales',
    'Max. Lead Time', 'Min. Order', 'Safety Stok', 'ROP', '3W Cover',
    'Sedang PO', 'Suggested', 'Amount', 'Promo Factor', 'Delay Factor',
    'Stock Cover', 'Days to Backup', 'Qty to Backup'
]

# Cell texts that stand for "no value" / infinity. All entries are upper-case:
# _patch_openpyxl_number_casting compares against value.strip().upper().
NA_VALUES = {
    'NAN', 'NA', '#N/A', 'NULL', 'NONE', '', '?', '-', 'INF', '-INF',
    '+INF', 'INFINITY', '-INFINITY', '1.#INF', '-1.#INF', '1.#QNAN'
}

def _patch_openpyxl_number_casting():
    """Ensure openpyxl won't crash when encountering NAN/INF in numeric cells."""
    print("Calling _patch_openpyxl_number_casting...")

    try:
        from openpyxl.worksheet import _reader

        original_cast = _reader._cast_number

        def _safe_cast_number(value):  # pragma: no cover - monkey patch
            if isinstance(value, str):
                if value.strip().upper() in NA_VALUES:
                    return 0
            try:
                return original_cast(value)
            except (ValueError, TypeError):
                return 0 if value in (None, '') else value

        _reader._cast_number = _safe_cast_number
    except Exception:
        # If patch fails we continue; runtime reader will still attempt default behaviour
        pass

In [None]:
# Cell 1: Import libraries and setup
import pandas as pd
from pathlib import Path
import os
from IPython.display import display
import locale
from locale import atof
import numpy as np
from openpyxl.styles import numbers
from io import StringIO
import subprocess

# Install the tolerant openpyxl number cast (defined in the previous cell) at setup time.
_patch_openpyxl_number_casting()

# Apply the formatting to numeric columns in your final output
def format_dataframe_display(df):
    """Return a display copy of ``df`` with its int64/float64 columns formatted.

    Every non-missing value of those columns is passed through
    ``format_id_number(value, 2)``; missing values are kept as they are. The
    input frame itself is not modified.

    Args:
        df (pd.DataFrame): Frame to prepare for display.

    Returns:
        pd.DataFrame: The formatted copy.
    """
    def _render(value):
        # Missing values pass through untouched
        if pd.notna(value):
            return format_id_number(value, 2)
        return value

    formatted = df.copy()
    numeric_names = formatted.select_dtypes(include=['int64', 'float64']).columns

    for name in numeric_names:
        formatted[name] = formatted[name].apply(_render)

    return formatted

# Configuration
# The project root defaults to the original developer checkout. Set the
# AUTOPO_BASE_DIR environment variable to run the notebook from another
# machine or location without editing this cell.
BASE_DIR = Path(os.environ.get('AUTOPO_BASE_DIR', '/Users/andresuchitra/dev/missglam/autopo'))

# Inputs
SUPPLIER_PATH = BASE_DIR / 'data/supplier.csv'
RAWPO_DIR = BASE_DIR / 'data/rawpo/csv'
RAWPO_XLSX_DIR = BASE_DIR / 'data/rawpo/xlsx'
STORE_CONTRIBUTION_PATH = BASE_DIR / 'data/store_contribution.csv'

# Outputs
OUTPUT_DIR = BASE_DIR / 'output/complete'
OUTPUT_EXCEL_DIR = BASE_DIR / 'output/excel'
OUTPUT_M2_DIR = BASE_DIR / 'output/m2'
OUTPUT_EMERGENCY_DIR = BASE_DIR / 'output/emergency'

# Make sure every output folder exists before anything is written
for _output_dir in (OUTPUT_DIR, OUTPUT_EXCEL_DIR, OUTPUT_M2_DIR, OUTPUT_EMERGENCY_DIR):
    os.makedirs(_output_dir, exist_ok=True)

def load_store_contribution(store_contribution_path):
    """Read the header-less store contribution CSV.

    Args:
        store_contribution_path: Path (or file-like object) of a comma-separated
            file whose two columns are read as ``store`` and ``contribution_pct``.

    Returns:
        pd.DataFrame: Columns ``store``, ``contribution_pct`` and ``store_lower``
        (the lower-cased store name, used for case-insensitive lookups).
    """
    column_names = ['store', 'contribution_pct']
    contributions = pd.read_csv(store_contribution_path, header=None, names=column_names)
    return contributions.assign(store_lower=contributions['store'].str.lower())

def get_contribution_pct(location, store_contrib):
    """Look up the contribution percentage of a store, ignoring letter case.

    Args:
        location (str): Store name to look up.
        store_contrib (pd.DataFrame): Frame with ``store_lower`` and
            ``contribution_pct`` columns.

    Returns:
        The ``contribution_pct`` of the first matching row, or 100 (with a
        printed warning) when the store is not listed.
    """
    wanted = location.lower()
    matches = store_contrib.loc[store_contrib['store_lower'] == wanted, 'contribution_pct']

    if matches.empty:
        print(f"Warning: No contribution percentage found for {location}")
        return 100  # Default to 100% if not found

    return matches.values[0]

def load_supplier_data(supplier_path):
    """Read the supplier master file.

    Args:
        supplier_path: Path (or file-like object) of a semicolon-separated CSV
            that uses decimal commas and has a ``Nama Brand`` column.

    Returns:
        pd.DataFrame: The supplier rows with missing cells replaced by ``''``
        and surrounding whitespace removed from ``Nama Brand``.
    """
    print("Loading supplier data...")

    suppliers = pd.read_csv(supplier_path, sep=';', decimal=',')
    suppliers = suppliers.fillna('')
    suppliers['Nama Brand'] = suppliers['Nama Brand'].str.strip()

    return suppliers

def merge_with_suppliers(df_clean, supplier_df):
    """Merge PO data with supplier information.

    Matching happens in two passes:

    1. exact match on brand AND store (``Brand``/``Toko`` against
       ``Nama Brand``/``Nama Store``);
    2. for rows still unmatched, the first supplier row found for the brand in
       any store.

    Rows that match nothing keep empty supplier fields (``''`` for text
    columns, 0 for numeric ones). Neither input frame is modified.

    Args:
        df_clean (pd.DataFrame): PO rows; must contain ``Brand`` and ``Toko``.
        supplier_df (pd.DataFrame): Supplier rows; must contain ``Nama Brand``
            and ``Nama Store``.

    Returns:
        pd.DataFrame: PO rows with the supplier columns attached. When the
        fallback pass runs, directly matched rows come first and fallback rows
        after them, with a fresh 0..n-1 index.
    """
    print("Merging with suppliers...")

    # Clean supplier data
    supplier_clean = supplier_df.copy()
    supplier_clean['Nama Brand'] = supplier_clean['Nama Brand'].astype(str).str.strip()
    supplier_clean['Nama Store'] = supplier_clean['Nama Store'].astype(str).str.strip()

    # Deduplicate to prevent row explosion - Unique Brand+Store
    supplier_clean = supplier_clean.drop_duplicates(subset=['Nama Brand', 'Nama Store'])

    # Normalise the join keys on a copy so the caller's DataFrame is left untouched
    df_clean = df_clean.copy()
    df_clean['Brand'] = df_clean['Brand'].astype(str).str.strip()
    df_clean['Toko'] = df_clean['Toko'].astype(str).str.strip()

    # 1. Primary Merge: Match on Brand AND Store (Toko)
    # This prioritizes the specific supplier for that store
    merged_df = pd.merge(
        df_clean,
        supplier_clean,
        left_on=['Brand', 'Toko'],
        right_on=['Nama Brand', 'Nama Store'],
        how='left',
        suffixes=('_clean', '_supplier')
    )

    # 2. Fallback: For unmatched rows, try to find ANY supplier for that Brand
    # Identify rows where merge failed (Nama Brand is NaN)
    unmatched_mask = merged_df['Nama Brand'].isna()

    if unmatched_mask.any():
        print(f"Found {unmatched_mask.sum()} rows without direct store match. Attempting fallback...")

        # Get the unmatched rows and drop the empty supplier columns so the
        # second merge can add them again without name clashes
        unmatched_rows = merged_df[unmatched_mask].copy()
        supplier_cols = [col for col in supplier_clean.columns if col in unmatched_rows.columns and col != 'Brand']
        unmatched_rows = unmatched_rows.drop(columns=supplier_cols)

        # Create fallback supplier list (one per brand)
        # We take the first one found for each brand
        fallback_suppliers = supplier_clean.drop_duplicates(subset=['Nama Brand'])

        # Merge unmatched rows with fallback suppliers
        matched_fallback = pd.merge(
            unmatched_rows,
            fallback_suppliers,
            left_on='Brand',
            right_on='Nama Brand',
            how='left',
            suffixes=('_clean', '_supplier')
        )

        # Combine the initially matched rows with the fallback-matched rows
        matched_initial = merged_df[~unmatched_mask]
        merged_df = pd.concat([matched_initial, matched_fallback], ignore_index=True)

    # Clean up supplier columns
    supplier_columns = [
        'ID Supplier', 'Nama Supplier', 'ID Brand', 'ID Store',
        'Nama Store', 'Hari Order', 'Min. Purchase', 'Trading Term',
        'Promo Factor', 'Delay Factor'
    ]
    for col in supplier_columns:
        if col in merged_df.columns:
            # Numeric columns get 0, everything else ''. Testing for "numeric"
            # rather than for dtype == 'object' also covers pandas' dedicated
            # string dtypes, which cannot be filled with the number 0.
            fill_value = 0 if pd.api.types.is_numeric_dtype(merged_df[col]) else ''
            merged_df[col] = merged_df[col].fillna(fill_value)

    return merged_df

def calculate_inventory_metrics(df_clean):
    """
    Calculate various inventory metrics including safety stock, reorder points, and PO quantities.

    The stock column may be named either 'Stok' or 'Stock'. The input frame is
    not modified.

    Args:
        df_clean (pd.DataFrame): Input dataframe. Needs the stock column plus
            'Daily Sales', 'Max. Daily Sales', 'Lead Time', 'Max. Lead Time',
            'Lead Time Sedang PO', 'Min. Order' and 'HPP'; 'Sedang PO' is
            optional and treated as 0 when absent.

    Returns:
        pd.DataFrame: Copy of the input with the calculated columns
        ('Safety stock', 'Reorder point', 'Stock cover 30 days',
        'current_stock_days_cover', 'is_open_po', 'initial_qty_po',
        'emergency_po_qty', 'updated_regular_po_qty',
        'final_updated_regular_po_qty', 'emergency_po_cost',
        'final_updated_regular_po_cost') and remaining NaNs replaced by 0.
        If any step fails, the error is printed and the ORIGINAL ``df_clean``
        is returned without the calculated columns.
    """
    import numpy as np
    import pandas as pd

    # Ensure we're working with a copy to avoid SettingWithCopyWarning
    df = df_clean.copy()

    # Set display options (note: this changes the global pandas float format)
    pd.set_option('display.float_format', '{:.2f}'.format)

    # Normalise stock column name
    stock_col = 'Stok' if 'Stok' in df.columns else 'Stock'

    # Force the columns we need into numeric form; unparsable or missing values become 0.
    # 'Min. Order' is included because it is compared with a quantity in step 9.
    numeric_cols = [
        stock_col, 'Daily Sales', 'Max. Daily Sales', 'Lead Time',
        'Max. Lead Time', 'Sedang PO', 'HPP', 'Lead Time Sedang PO',
        'Min. Order'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    try:
        # 1. Safety stock = (max sales x max lead time) - (avg sales x avg lead time), rounded up
        df['Safety stock'] = (df['Max. Daily Sales'] * df['Max. Lead Time']) - (df['Daily Sales'] * df['Lead Time'])
        df['Safety stock'] = np.ceil(df['Safety stock']).fillna(0).astype(int)

        # 2. Reorder point = (avg sales x avg lead time) + safety stock
        df['Reorder point'] = np.ceil((df['Daily Sales'] * df['Lead Time']) + df['Safety stock']).fillna(0).astype(int)

        # 3. Stock cover for 30 days = avg sales x 30, rounded up
        df['Stock cover 30 days'] = np.ceil(df['Daily Sales'] * 30).fillna(0).astype(int)

        # 4. Current stock days cover = stock / avg sales (0 when there are no sales)
        df['current_stock_days_cover'] = np.where(
            df['Daily Sales'] > 0,
            df[stock_col] / df['Daily Sales'],
            0
        )

        # 5. Is open PO flag: stock covers less than 30 days AND is at or below the reorder point.
        # Uses stock_col so frames whose column is named 'Stock' work as well.
        df['is_open_po'] = np.where(
            (df['current_stock_days_cover'] < 30) &
            (df[stock_col] <= df['Reorder point']), 1, 0
        )

        # 6. Initial PO quantity = 30-day cover - stock - quantity already on order
        #    (only for open POs, never negative)
        df['initial_qty_po'] = df['Stock cover 30 days'] - df[stock_col] - df.get('Sedang PO', 0)
        df['initial_qty_po'] = (
            pd.Series(
                np.where(df['is_open_po'] == 1, df['initial_qty_po'], 0),
                index=df.index
            )
            .clip(lower=0)
            .astype(int)
        )

        # 7. Emergency PO quantity
        #    - something is already on order: cover the gap until that order arrives
        #    - otherwise: cover the gap until the maximum lead time has passed
        df['emergency_po_qty'] = np.where(
            df.get('Sedang PO', 0) > 0,
            np.maximum(0, (df['Lead Time Sedang PO'] - df['current_stock_days_cover']) * df['Daily Sales']),
            np.ceil((df['Max. Lead Time'] - df['current_stock_days_cover']) * df['Daily Sales'])
        )

        # Clean up emergency PO quantities
        df['emergency_po_qty'] = (
            df['emergency_po_qty']
            .replace([np.inf, -np.inf], 0)
            .fillna(0)
            .clip(lower=0)
            .astype(int)
        )

        # 8. Updated regular PO quantity (initial quantity minus the emergency part)
        df['updated_regular_po_qty'] = (df['initial_qty_po'] - df['emergency_po_qty']).clip(lower=0).astype(int)

        # 9. Final updated regular PO quantity (enforce minimum order)
        df['final_updated_regular_po_qty'] = np.where(
            (df['updated_regular_po_qty'] > 0) &
            (df['updated_regular_po_qty'] < df['Min. Order']),
            df['Min. Order'],
            df['updated_regular_po_qty']
        ).astype(int)

        # 10. Costs = quantity x HPP (unit cost)
        df['emergency_po_cost'] = (df['emergency_po_qty'] * df['HPP']).round(2)
        df['final_updated_regular_po_cost'] = (df['final_updated_regular_po_qty'] * df['HPP']).round(2)

        # Clean up any remaining NaN values
        df = df.fillna(0)

        return df

    except Exception as e:
        # Best effort: report the problem and hand back the untouched input
        print(f"Error in calculate_inventory_metrics: {str(e)}")
        return df_clean

def clean_po_data_v1(df, location, contribution_pct=100, padang_sales=None):
    """Clean and prepare PO data with contribution calculations.

    Steps, in order:
      1. keep only the required columns that exist in ``df``;
      2. clean 'Brand' and convert the numeric columns to float;
      3. add 'contribution_pct', 'contribution_ratio', 'Lead Time Sedang PO'
         (if absent) and the 'Is in Padang' flag;
      4. for stores that need it, replace 'Daily Sales' / 'Max. Daily Sales'
         by the Padang figures scaled with the contribution ratio, keeping the
         store's own figures in 'Orig Daily Sales' / 'Orig Max. Daily Sales'.

    Args:
        df (pd.DataFrame): Raw PO rows of one store.
        location (str): Store name; compared upper-cased against the exempt
            stores PADANG, SOETA and BALIKPAPAN.
        contribution_pct: Contribution of the store in percent (converted with float()).
        padang_sales (pd.DataFrame | None): Padang rows with 'SKU',
            'Daily Sales' and 'Max. Daily Sales'. Required when the store is
            not exempt or contribution_pct < 100.

    Returns:
        pd.DataFrame: The cleaned frame. If anything raises inside the
        function, the traceback is printed and an EMPTY frame with a fixed set
        of columns is returned instead.
    """
    try:
        # Create a copy to avoid modifying the original DataFrame
        df = df.copy()

        # Keep original column names but strip any extra whitespace
        df.columns = df.columns.str.strip()

        # Define required columns (using original case)
        required_columns = [
            'Brand', 'SKU', 'Nama', 'Toko', 'Stok',
            'Daily Sales', 'Max. Daily Sales', 'Lead Time',
            'Max. Lead Time', 'Min. Order', 'Sedang PO', 'HPP'
        ]
        
        # Find actual column names in the DataFrame (case-sensitive)
        available_columns = {col.strip(): col for col in df.columns}
        columns_to_keep = []
        
        for col in required_columns:
            if col in available_columns:
                columns_to_keep.append(available_columns[col])
            else:
                print(f"Warning: Column '{col}' not found in input data")
                # Add as empty column if it's required
                # NOTE(review): the placeholder is not appended to columns_to_keep,
                # so the selection below drops it again and the ValueError for
                # missing critical columns is raised anyway.
                if col in ['Brand', 'SKU', 'HPP']:  # These are critical
                    df[col] = ''

        # Select only the columns we need
        df = df[[col for col in columns_to_keep if col in df.columns]]

        # Check for missing required columns
        missing_columns = [col for col in ['Brand', 'SKU', 'HPP'] if col not in df.columns]
        if missing_columns:
            raise ValueError(
                f"Missing required columns: {missing_columns}. "
                f"Available columns: {df.columns.tolist()}"
            )

        # Clean brand column
        if 'Brand' in df.columns:
            df['Brand'] = df['Brand'].astype(str).str.strip()

        # Convert numeric columns with better error handling
        numeric_columns = [
            'Stok', 'Daily Sales', 'Max. Daily Sales', 'Lead Time',
            'Max. Lead Time', 'Sedang PO', 'HPP'
        ]

        for col in numeric_columns:
            if col in df.columns:
                try:
                    # First convert to string, clean, then to numeric
                    # NOTE(review): every comma becomes a decimal point, so a value
                    # that also uses '.' as thousands separator fails the float
                    # conversion and the whole column falls back to 0 below.
                    df[col] = (
                        df[col]
                        .astype(str)
                        .str.replace(r'[^\d.,-]', '', regex=True)  # Remove non-numeric except .,-
                        .str.replace(',', '.', regex=False)         # Convert commas to decimal points
                        .replace('', '0')                           # Empty strings to '0'
                        .astype(float)                              # Convert to float
                        .fillna(0)                                  # Fill any remaining NaNs with 0
                    )
                except Exception as e:
                    print(f"Warning: Could not convert column '{col}' to numeric: {str(e)}")
                    df[col] = 0  # Set to 0 if conversion fails

        # Add contribution percentage and calculate costs
        contribution_pct = float(contribution_pct)
        df['contribution_pct'] = contribution_pct
        df['contribution_ratio'] = contribution_pct / 100

        # Add default values for other required columns
        # (the default is an empty string, not a number)
        if 'Lead Time Sedang PO' not in df.columns:
            df['Lead Time Sedang PO'] = ''

        # Padang figures are used unless the store is exempt AND contributes 100%
        location_upper = location.upper()
        exempt_stores = {"PADANG", "SOETA", "BALIKPAPAN"}
        needs_padang_override = (location_upper not in exempt_stores) or (contribution_pct < 100)

        print(f"Processing store: {location} - {contribution_pct}%")

        # Add 'Is in Padang' column (1 when the SKU, compared as text, also exists in the Padang data)
        if padang_sales is not None:
            padang_skus = set(padang_sales['SKU'].astype(str).unique())
            df['Is in Padang'] = df['SKU'].astype(str).isin(padang_skus).astype(int)
        else:
            print("Warning: No Padang sales data provided. 'Is in Padang' will be set to 0 for all SKUs.")
            df['Is in Padang'] = 0

        if not needs_padang_override:
            # If no override needed, ensure we have the original sales columns
            if 'Daily Sales' not in df.columns and 'Orig Daily Sales' in df.columns:
                df['Daily Sales'] = df['Orig Daily Sales']
            if 'Max. Daily Sales' not in df.columns and 'Orig Max. Daily Sales' in df.columns:
                df['Max. Daily Sales'] = df['Orig Max. Daily Sales']
            return df

        if padang_sales is None:
            raise ValueError(
                "Padang sales data is required for stores outside Padang/Soeta/Balikpapan "
                "or any store with contribution < 100%."
            )

        # Process Padang sales data - keep original column names
        padang_df = padang_sales.copy()
        padang_df.columns = padang_df.columns.str.strip()  # Only strip whitespace
        
        # Ensure required columns exist in padang_df
        required_padang_cols = ['SKU', 'Daily Sales', 'Max. Daily Sales']
        missing_padang_cols = [col for col in required_padang_cols if col not in padang_df.columns]
        
        if missing_padang_cols:
            raise ValueError(
                f"Padang sales data is missing required columns: {missing_padang_cols}. "
                f"Available columns: {padang_df.columns.tolist()}"
            )

        # Save original sales columns if they exist
        if 'Daily Sales' in df.columns and 'Orig Daily Sales' not in df.columns:
            df = df.rename(columns={'Daily Sales': 'Orig Daily Sales'})
        if 'Max. Daily Sales' in df.columns and 'Orig Max. Daily Sales' not in df.columns:
            df = df.rename(columns={'Max. Daily Sales': 'Orig Max. Daily Sales'})

        print("Overriding with Padang sales data...")
        contribution_ratio = contribution_pct / 100

        # Merge with Padang's sales data using original column names
        # NOTE(review): this join uses the SKU columns with their current dtypes,
        # whereas the 'Is in Padang' flag above compares them as strings; it also
        # duplicates rows if an SKU appears more than once in the Padang data.
        df = df.merge(
            padang_df[['SKU', 'Daily Sales', 'Max. Daily Sales']].rename(columns={
                'Daily Sales': 'Padang Daily Sales',
                'Max. Daily Sales': 'Padang Max Daily Sales'
            }),
            on='SKU',
            how='left'
        )

        # Calculate adjusted sales based on contribution and 'Is in Padang' flag:
        # SKUs sold in Padang take the scaled Padang figure, the rest keep their own
        if 'Padang Daily Sales' in df.columns and 'Orig Daily Sales' in df.columns:
            df['Daily Sales'] = np.where(
                df['Is in Padang'] == 1,
                df['Padang Daily Sales'] * contribution_ratio,
                df['Orig Daily Sales']
            )
            
        if 'Padang Max Daily Sales' in df.columns and 'Orig Max. Daily Sales' in df.columns:
            df['Max. Daily Sales'] = np.where(
                df['Is in Padang'] == 1,
                df['Padang Max Daily Sales'] * contribution_ratio,
                df['Orig Max. Daily Sales']
            )

        # Drop intermediate columns
        columns_to_drop = [
            'Padang Daily Sales', 'Padang Max Daily Sales'
        ]
        df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

        return df

    except Exception as e:
        # The message names clean_po_data although this function is clean_po_data_v1
        print(f"Error in clean_po_data: {str(e)}")
        import traceback
        traceback.print_exc()
        
        # Return empty DataFrame with required columns if there's an error
        desired_columns = [
            'Brand', 'SKU', 'Nama', 'HPP', 'Toko', 'Stok', 
            'Daily Sales', 'Max. Daily Sales', 'Lead Time', 
            'Max. Lead Time', 'Sedang PO', 'contribution_pct',
            'emergency_po_cost', 'final_updated_regular_po_cost',
            'Is in Padang'  # Added new column
        ]
        return pd.DataFrame(columns=desired_columns)

def _to_float_series(series):
    """Coerce a Series to float64; unparseable, missing or infinite values become 0.

    Numeric (non-boolean) input is converted directly, so values whose string
    form uses scientific notation (e.g. ``1e-05``) are not mangled by text
    cleanup. Text input is parsed as-is first; only values that fail are
    stripped of everything except digits, '.', ',' and '-' and retried with
    ',' treated as the decimal mark.
    """
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if is_plain_numeric:
        numeric = pd.to_numeric(series, errors='coerce')
    else:
        text = series.astype(str).str.strip()
        direct = pd.to_numeric(text, errors='coerce')
        cleaned = (
            text
            .str.replace(r'[^\d.,-]', '', regex=True)  # Remove non-numeric except .,-
            .str.replace(',', '.', regex=False)         # Convert commas to decimal points
        )
        fallback = pd.to_numeric(cleaned, errors='coerce')
        # Positional combine (not index-aligned) so duplicate index labels are safe
        numeric = pd.Series(
            np.where(direct.notna(), direct, fallback),
            index=series.index,
        )
    return numeric.replace([np.inf, -np.inf], np.nan).fillna(0).astype(float)


def clean_po_data(df, location, contribution_pct=100, padang_sales=None):
    """Clean and prepare PO data with contribution calculations.

    Steps:
      1. Keep only the known PO columns (a warning is printed for each one
         that is absent; 'Brand', 'SKU' and 'HPP' are mandatory).
      2. Normalise 'Brand'/'SKU' to stripped strings and the numeric columns
         to float (bad values become 0 individually rather than zeroing the
         whole column).
      3. Add 'contribution_pct', 'contribution_ratio', 'Lead Time Sedang PO'
         (default 5) and the 'Is in Padang' flag (1 when the SKU appears in
         ``padang_sales``).
      4. For stores outside PADANG/SOETA/BALIKPAPAN, or any store with a
         contribution below 100, replace 'Daily Sales' and 'Max. Daily Sales'
         with the Padang figures scaled by the contribution ratio for SKUs
         that exist in Padang.

    Args:
        df: Raw PO DataFrame for one store.
        location: Store name; compared case-insensitively against the exempt set.
        contribution_pct: Store contribution in percent (converted with float()).
        padang_sales: Optional DataFrame with at least 'SKU', 'Daily Sales'
            and 'Max. Daily Sales'.

    Returns:
        The cleaned DataFrame, or None if any step fails (the error and a
        traceback are printed).
    """
    try:
        # Create a copy to avoid modifying the original DataFrame
        df = df.copy()

        # Keep original column names but strip any extra whitespace
        df.columns = df.columns.str.strip()

        required_columns = [
            'Brand', 'SKU', 'Nama', 'Toko', 'Stok',
            'Daily Sales', 'Max. Daily Sales', 'Lead Time',
            'Max. Lead Time', 'Min. Order', 'Sedang PO', 'HPP'
        ]
        critical_columns = ['Brand', 'SKU', 'HPP']

        columns_to_keep = []
        for col in required_columns:
            if col in df.columns:
                columns_to_keep.append(col)
            else:
                print(f"Warning: Column '{col}' not found in input data")

        # .copy() so the column assignments below act on an independent frame
        # instead of a slice (avoids SettingWithCopyWarning).
        df = df[columns_to_keep].copy()

        missing_columns = [col for col in critical_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(
                f"Missing required columns: {missing_columns}. "
                f"Available columns: {df.columns.tolist()}"
            )

        # Brand and SKU are guaranteed to exist past the check above
        df['Brand'] = df['Brand'].astype(str).str.strip()
        df['SKU'] = df['SKU'].astype(str).str.strip()

        numeric_columns = [
            'Stok', 'Daily Sales', 'Max. Daily Sales', 'Lead Time',
            'Max. Lead Time', 'Sedang PO', 'HPP', 'Min. Order'
        ]
        for col in numeric_columns:
            if col in df.columns:
                try:
                    df[col] = _to_float_series(df[col])
                except Exception as e:
                    print(f"Warning: Could not convert column '{col}' to numeric: {str(e)}")
                    df[col] = 0  # Set to 0 if conversion fails

        # Add contribution percentage and ratio
        contribution_pct = float(contribution_pct)
        df['contribution_pct'] = contribution_pct
        df['contribution_ratio'] = contribution_pct / 100

        # Add default values for other required columns
        if 'Lead Time Sedang PO' not in df.columns:
            df['Lead Time Sedang PO'] = 5  # Default value

        location_upper = location.upper()
        exempt_stores = {"PADANG", "SOETA", "BALIKPAPAN"}
        needs_padang_override = (location_upper not in exempt_stores) or (contribution_pct < 100)

        print(f"Processing store: {location} - {contribution_pct}%")

        # Normalise the Padang frame once and derive the 'Is in Padang' flag
        padang_df = None
        if padang_sales is not None:
            padang_df = padang_sales.copy()
            padang_df.columns = padang_df.columns.str.strip()
            padang_df['SKU'] = padang_df['SKU'].astype(str).str.strip()
            df['Is in Padang'] = df['SKU'].isin(set(padang_df['SKU'])).astype(int)
        else:
            print("Warning: No Padang sales data provided. 'Is in Padang' will be set to 0 for all SKUs.")
            df['Is in Padang'] = 0

        if not needs_padang_override:
            return df

        if padang_df is None:
            raise ValueError(
                "Padang sales data is required for stores outside Padang/Soeta/Balikpapan "
                "or any store with contribution < 100%."
            )

        # Ensure required columns exist
        required_cols = ['SKU', 'Daily Sales', 'Max. Daily Sales']
        missing_cols = [col for col in required_cols if col not in padang_df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns in Padang sales data: {missing_cols}")

        print("Overriding with Padang sales data...")

        # One row per SKU: a left merge against duplicated keys would otherwise
        # multiply the store's rows (and therefore its order quantities).
        padang_lookup = (
            padang_df[required_cols]
            .drop_duplicates(subset='SKU', keep='first')
            .rename(columns={
                'Daily Sales': 'Padang Daily Sales',
                'Max. Daily Sales': 'Padang Max Daily Sales'
            })
        )
        df = df.merge(padang_lookup, on='SKU', how='left')

        # SKUs present in Padang take the scaled Padang figure; all others keep
        # the store's own value.
        in_padang = df['Is in Padang'] == 1
        if 'Daily Sales' in df.columns:
            df['Daily Sales'] = np.where(
                in_padang,
                df['Padang Daily Sales'] * df['contribution_ratio'],
                df['Daily Sales']
            )
        if 'Max. Daily Sales' in df.columns:
            df['Max. Daily Sales'] = np.where(
                in_padang,
                df['Padang Max Daily Sales'] * df['contribution_ratio'],
                df['Max. Daily Sales']
            )

        # Drop intermediate columns
        df = df.drop(columns=['Padang Daily Sales', 'Padang Max Daily Sales'], errors='ignore')

        return df

    except Exception as e:
        print(f"Error in clean_po_data: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def get_store_name_from_filename(filename):
    """Extract the upper-cased store name from a PO filename.

    Recognised patterns (the extension is ignored, tokens are whitespace-separated):
      * "<prefix> Miss Glam <Store ...>"  -> "<STORE ...>"
      * "Miss Glam <Store ...>"           -> "<STORE ...>"
      * "<prefix> <Store ...>"            -> "<STORE ...>" (everything after the first token)
      * "<Store>"                         -> "<STORE>"

    Only the file's stem is inspected, so spaces in parent directories do not
    affect the result. Returns an empty string when the stem has no tokens.
    """
    name_parts = Path(filename).stem.split()
    if not name_parts:
        return ''

    lowered = [part.lower() for part in name_parts]

    # e.g. "002 Miss Glam Pekanbaru.csv" -> "PEKANBARU"
    if lowered[1:3] == ['miss', 'glam']:
        store_parts = name_parts[3:]
    # e.g. "Miss Glam Padang.csv" -> "PADANG"
    elif lowered[:2] == ['miss', 'glam']:
        store_parts = name_parts[2:]
    # Fallback: drop the leading token when there is more than one
    elif len(name_parts) > 1:
        store_parts = name_parts[1:]
    else:
        store_parts = name_parts

    return ' '.join(store_parts).strip().upper()

def read_csv_file(file_path):
    """Read a CSV whose separator and encoding are not known in advance.

    Each (separator, encoding) combination is tried in order with
    Indonesian-style numbers (',' decimal, '.' thousands). The first parse
    that yields more than one column is returned. A parse with a single
    column usually means the wrong separator was used, so it is only kept as
    a fallback and returned if no combination produces a multi-column frame.

    Args:
        file_path: Path to the CSV file.

    Returns:
        pd.DataFrame on success, or None if every attempt failed or produced
        an empty frame.
    """
    # List of (separator, encoding) combinations to try
    formats_to_try = [
        (',', 'utf-8'),      # Standard CSV with comma
        (';', 'utf-8'),      # Semicolon with UTF-8
        (',', 'latin1'),     # Comma with Latin1
        (';', 'latin1'),     # Semicolon with Latin1
        (',', 'cp1252'),     # Windows-1252 encoding
        (';', 'cp1252')
    ]

    single_column_fallback = None

    for sep, enc in formats_to_try:
        try:
            df = pd.read_csv(
                file_path,
                sep=sep,
                decimal=',',
                thousands='.',
                encoding=enc,
                engine='python'  # More consistent behavior with Python engine
            )
        except (UnicodeDecodeError, pd.errors.ParserError, pd.errors.EmptyDataError):
            continue  # Try next format
        except Exception as e:
            print(f"Unexpected error reading {file_path} with sep='{sep}', encoding='{enc}': {str(e)}")
            continue

        if df.empty:
            continue
        if df.shape[1] > 1:
            return df
        # Parsed, but everything landed in one column: remember the first such
        # result and keep looking for a separator that actually splits the rows.
        if single_column_fallback is None:
            single_column_fallback = df

    if single_column_fallback is not None:
        return single_column_fallback

    # If we get here, all attempts failed
    print(f"Failed to read {file_path} with any known format")
    return None

def process_po_file(file_path, supplier_df, store_contrib, df_padang):
    """Process a single PO file and return merged data and summary.

    The store name is derived from the filename, the workbook is read and
    cleaned, then passed through merge_with_suppliers and
    calculate_inventory_metrics.

    Args:
        file_path: Path object of the PO workbook (``.name`` is used for
            logging and store detection).
        supplier_df: Supplier data forwarded to merge_with_suppliers.
        store_contrib: Contribution lookup forwarded to get_contribution_pct.
        df_padang: Padang sales data forwarded to clean_po_data.

    Returns:
        (merged_df, summary) on success; (None, summary) on failure, where
        ``summary['status']`` holds the truncated error message.
    """
    print(f"\nProcessing PO file: {file_path.name} ....")

    # Reported in the error summary when a failure happens before these are known
    location = 'Unknown'
    contribution_pct = 0

    try:
        location = get_store_name_from_filename(file_path.name)
        print(f"  - Extracted location: {location}")  # Debug print

        contribution_pct = get_contribution_pct(location, store_contrib)

        try:
            df = read_excel_file(file_path)

            # read_excel_file returns None when the workbook cannot be parsed
            if df is None:
                raise ValueError("File could not be read")
            if df.empty:
                raise ValueError("File is empty")

            # Clean the data; clean_po_data returns None on failure
            df_clean = clean_po_data(df, location, contribution_pct, df_padang)
            if df_clean is None or df_clean.empty:
                raise ValueError("Data cleaning failed")

            # Merge with suppliers
            merged_df = merge_with_suppliers(df_clean, supplier_df)

            # Calculate PO metrics
            merged_df = calculate_inventory_metrics(merged_df)

            # Generate summary
            store_names = merged_df['Nama Store']
            is_padang = store_names == 'Miss Glam Padang'
            has_no_supplier = store_names == ''

            summary = {
                'file': file_path.name,
                'location': location,
                'contribution_pct': contribution_pct,
                'total_rows': len(merged_df),
                'padang_suppliers': int(is_padang.sum()),
                'other_suppliers': int((~is_padang & ~has_no_supplier).sum()),
                'no_supplier': int(has_no_supplier.sum()),
                'status': 'Success'
            }

            return merged_df, summary

        except Exception as e:
            # Chain the cause so the original traceback is not lost
            raise Exception(f"Error processing file data: {str(e)}") from e

    except Exception as e:
        error_msg = f"Error processing {file_path.name}: {str(e)}"
        print(f"  - {error_msg}")
        return None, {
            'file': file_path.name,
            'location': location,
            'contribution_pct': contribution_pct,
            'total_rows': 0,
            'padang_suppliers': 0,
            'other_suppliers': 0,
            'no_supplier': 0,
            'status': f"Error: {str(e)[:100]}"  # Truncate long error messages
        }

def load_padang_data_v(padang_path):
    """Load the Padang CSV (';' separated, ',' decimal, '.' thousands, UTF-8 with BOM).

    Args:
        padang_path: Path to the Padang CSV file.

    Returns:
        pd.DataFrame with the file contents.

    Raises:
        Exception: Wraps any read error with an "Error loading Padang data" prefix.
    """
    print("Parsing Padang data...")

    csv_options = {
        'sep': ';',
        'decimal': ',',
        'thousands': '.',
        'encoding': 'utf-8-sig',
    }

    try:
        padang_frame = pd.read_csv(padang_path, **csv_options)
    except Exception as e:
        raise Exception(f"Error loading Padang data: {str(e)}")

    print(f"Padang data loaded successfully..")
    return padang_frame

def load_padang_data(padang_path):
    """Load Padang data from either CSV or Excel file.

    CSV files are first read as ';'-separated UTF-8 (with BOM) using ','
    decimals and '.' thousands; on a decode/parse error they are re-read as
    ','-separated latin1. After loading, string cells and column names are
    stripped of surrounding whitespace and 'SKU' (if present) is converted to
    a stripped string.

    Args:
        padang_path: Path to the input file (CSV, XLSX or XLS)

    Returns:
        pd.DataFrame: Loaded and cleaned Padang data

    Raises:
        ValueError: If the file format is not supported or file cannot be read
    """
    print(f"Loading Padang data from {padang_path}...")

    # Extension without the leading dot, lower-cased
    file_ext = Path(str(padang_path)).suffix.lower().lstrip('.')

    def _strip_if_str(value):
        # Leave numbers/NaN untouched; Series.str.strip() would turn them into NaN
        return value.strip() if isinstance(value, str) else value

    try:
        if file_ext == 'csv':
            try:
                df = pd.read_csv(padang_path, sep=';', decimal=',', thousands='.', encoding='utf-8-sig')
            except (UnicodeDecodeError, pd.errors.ParserError):
                # Try with different separator/encoding if the first attempt fails
                df = pd.read_csv(padang_path, sep=',', decimal='.', thousands=',', encoding='latin1')

        elif file_ext == 'xlsx':
            df = pd.read_excel(padang_path, engine='openpyxl')
        elif file_ext == 'xls':
            # openpyxl cannot open the legacy .xls format; let pandas choose the engine
            df = pd.read_excel(padang_path)
        else:
            raise ValueError(f"Unsupported file format: {file_ext}. Please provide a CSV or Excel file.")

        # Basic data cleaning
        if not df.empty:
            # Strip whitespace from text cells only. Mixed-type columns (e.g. an
            # SKU column holding both numbers and text) keep their non-string
            # values instead of losing them.
            df = df.apply(
                lambda col: col.map(_strip_if_str)
                if (col.dtype == "object" or pd.api.types.is_string_dtype(col.dtype))
                else col
            )

            # Convert column names to standard format
            df.columns = df.columns.str.strip()

            # Ensure SKU column is string type
            if 'SKU' in df.columns:
                df['SKU'] = df['SKU'].astype(str).str.strip()

        print(f"Successfully loaded Padang data with {len(df)} rows")
        return df

    except Exception as e:
        raise ValueError(f"Error loading Padang data from {padang_path}: {str(e)}") from e

def format_number_for_csv(x):
    """Format numbers for CSV output with Indonesian locale (comma as decimal, dot as thousand).

    Whole numbers are rendered without decimals ("1.234.567"); other numbers
    with two decimals ("1.234,50"). Python and numpy numeric scalars are both
    handled. Missing values, empty strings, non-numeric values and values that
    cannot be converted (e.g. infinity) are returned unchanged.
    """
    try:
        if pd.isna(x) or x == '':
            return x
    except (TypeError, ValueError):
        # Non-scalar input (e.g. a list/array) has no single truth value
        return x

    if not isinstance(x, (int, float, np.integer, np.floating)):
        return x

    try:
        if x == int(x):  # Whole number
            return f"{int(x):,d}".replace(",", ".")
        # Decimal number: swap ',' and '.' via a temporary placeholder
        return f"{x:,.2f}".replace(",", "X").replace(".", ",").replace("X", ".")
    except (ValueError, OverflowError, TypeError):
        # int(inf) raises OverflowError; leave such values as they are
        return x

def save_to_csv(df, filename):
    """Write ``df`` to ``OUTPUT_DIR / filename`` as a ';'-separated UTF-8 (BOM) CSV.

    SKU values are wrapped as ="..." so they stay textual, and numeric columns
    are rendered with format_number_for_csv. The input frame is not modified.

    Args:
        df: DataFrame to export.
        filename: File name (relative to OUTPUT_DIR).
    """
    df_output = df.copy()

    # Ensure SKU stays textual
    if 'SKU' in df_output.columns:
        df_output['SKU'] = df_output['SKU'].apply(lambda x: f'="{x}"')

    # Only touch numeric columns
    numeric_cols = df_output.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        df_output[col] = df_output[col].apply(format_number_for_csv)

    # Create the target directory like the other save_* helpers do, so a fresh
    # checkout does not fail on the first write.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    output_path = OUTPUT_DIR / filename
    df_output.to_csv(output_path, index=False, sep=';', decimal=',', encoding='utf-8-sig')
    print(f"File saved to {output_path}")

def clean_and_convert(df):
    """Normalise every column of ``df`` to either a numeric or a clean text column.

    All values are first turned into strings and any value listed in
    NA_VALUES is blanked. Columns named in NUMERIC_COLUMNS are then parsed as
    numbers (unparseable values become 0); every other column is kept as
    stripped text with blanks represented by the empty string.

    Returns the input unchanged when it is None or empty; otherwise a new
    DataFrame.
    """
    if df is None or df.empty:
        return df

    missing_markers = list(NA_VALUES)

    # astype returns a new frame, so the caller's DataFrame is left untouched
    result = df.astype(str)

    for name in result.columns:
        # Markers are matched as literal values, not as regular expressions
        column = result[name].replace(missing_markers, '', regex=False)

        if name in NUMERIC_COLUMNS:
            column = pd.to_numeric(column, errors='coerce').fillna(0)
        else:
            column = column.astype(str).str.strip()
            column = column.replace('', np.nan).fillna('')

        result[name] = column

    return result

def read_excel_file(file_path):
    """
    Read the active sheet of an Excel workbook into a DataFrame.

    The first row supplies the headers (blank header cells are named
    ``Column_<n>``). Every data cell is converted to a stripped string; None
    and values whose upper-cased text is in NA_VALUES become ''. Rows that are
    entirely blank are skipped. The result is passed through clean_and_convert.

    Args:
        file_path: Path object of the workbook (``.name`` is used in log messages).

    Returns:
        pd.DataFrame on success, or None if anything fails (the error and a
        traceback are printed).
    """
    wb = None
    try:
        print(f"\nReading excel file: {file_path.name}...")

        # Read with openpyxl directly to control how each cell is converted
        from openpyxl import load_workbook

        wb = load_workbook(
            filename=file_path,
            read_only=True,    # Read-only mode is faster and uses less memory
            data_only=True,    # Get the stored value instead of the formula
            keep_links=False   # Don't load external links
        )

        # Get the first sheet
        ws = wb.active

        # Get headers from the first row
        first_row = next(ws.iter_rows(values_only=True), None)
        if first_row is None:
            raise ValueError("Worksheet has no header row")

        headers = [
            str(cell).strip() if cell not in (None, '') else f"Column_{idx + 1}"
            for idx, cell in enumerate(first_row)
        ]

        data = []
        for row in ws.iter_rows(min_row=2, values_only=True):  # Skip header row
            row_data = []
            for cell in row:
                if cell is None:
                    row_data.append('')
                    continue

                cell_str = str(cell).strip()
                row_data.append('' if cell_str.upper() in NA_VALUES else cell_str)

            # Only add row if it has data
            if any(cell != '' for cell in row_data):
                data.append(row_data)

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        # Normalize column data types
        df = clean_and_convert(df)

        print(f"✅ Successfully processed {file_path.name} with {len(df)} rows")
        return df

    except Exception as e:
        print(f"❌ Error processing {file_path.name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        # Read-only workbooks keep the underlying file open until closed explicitly
        if wb is not None:
            wb.close()

def save_file(df, file_path, file_format='csv', **kwargs):
    """
    Save DataFrame to file with consistent extension and content type.

    The output name always ends in ``.<file_format>``. An existing data
    extension (.csv/.xlsx/.xls) is replaced; anything else after a dot is
    treated as part of the name, so "1. Miss Glam Padang" becomes
    "1. Miss Glam Padang.csv" rather than "1.csv".

    Args:
        df: DataFrame to save (not modified)
        file_path: Path object or string for the output file
        file_format: 'csv' or 'xlsx'
        **kwargs: Additional arguments to pass to to_csv or to_excel

    Returns:
        Path: The path where the file was saved

    Raises:
        ValueError: If ``file_format`` is not 'csv' or 'xlsx'
    """
    if file_format not in ('csv', 'xlsx'):
        raise ValueError(f"Unsupported file format: {file_format}")

    # Ensure file_path is a Path object
    file_path = Path(file_path)

    # Ensure the directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Ensure the correct file extension
    target_suffix = f'.{file_format}'
    current_suffix = file_path.suffix.lower()
    if current_suffix != target_suffix:
        if current_suffix in ('.csv', '.xlsx', '.xls'):
            file_path = file_path.with_suffix(target_suffix)
        else:
            # Path.suffix is everything after the last dot, which for names
            # like "1. Miss Glam Padang" is most of the name; append instead.
            file_path = file_path.with_name(file_path.name + target_suffix)

    # Make a copy to avoid modifying the original
    df_output = df.copy()

    # Common preprocessing
    if 'SKU' in df_output.columns:
        df_output['SKU'] = df_output['SKU'].astype(str).str.strip()
        if file_format == 'xlsx':
            # For Excel, wrap SKU in ="..." to preserve leading zeros. Values a
            # caller already wrapped are left alone so they are not wrapped twice.
            sku = df_output['SKU']
            already_wrapped = sku.str.startswith('="') & sku.str.endswith('"')
            df_output['SKU'] = sku.where(already_wrapped, '="' + sku + '"')

    # Save based on format
    if file_format == 'csv':
        # Render numbers with Indonesian separators before writing
        numeric_cols = df_output.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            df_output[col] = df_output[col].apply(format_number_for_csv)

        df_output.to_csv(
            file_path,
            index=False,
            sep=';',
            decimal=',',
            encoding='utf-8-sig',
            **kwargs
        )
    else:
        # Local import so this function does not depend on a module-level
        # `numbers` name (FORMAT_TEXT lives in openpyxl.styles.numbers).
        from openpyxl.styles import numbers

        with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
            df_output.to_excel(writer, index=False, **kwargs)

            # Format SKU column as text in Excel
            if 'SKU' in df_output.columns:
                ws = writer.sheets[list(writer.sheets.keys())[0]]
                sku_col_idx = df_output.columns.get_loc("SKU") + 1
                for row in ws.iter_rows(
                    min_row=2,  # Skip header
                    max_row=ws.max_row,
                    min_col=sku_col_idx,
                    max_col=sku_col_idx
                ):
                    for cell in row:
                        cell.number_format = numbers.FORMAT_TEXT

    print(f"File saved to {file_path}")
    return file_path

def save_to_complete_format(df, filename, file_format='csv', **kwargs):
    """
    Save Complete format file with consistent extension.

    Only rows with ``final_updated_regular_po_qty`` > 0 are written, to
    ``OUTPUT_DIR / filename``.

    Args:
        df: Input DataFrame
        filename: Output filename (with or without extension)
        file_format: 'csv' or 'xlsx'
        **kwargs: Additional arguments for save_file

    Returns:
        The value returned by save_file (the path that was written).
    """
    # Filter to only include rows with regular PO qty > 0
    df_output = df[df['final_updated_regular_po_qty'] > 0].copy()

    # Ensure SKU stays textual. save_file applies the ="..." wrapper itself for
    # xlsx, so pre-wrapping is limited to the other formats to avoid ="="...""
    if 'SKU' in df_output.columns and file_format != 'xlsx':
        df_output['SKU'] = df_output['SKU'].apply(lambda x: f'="{x}"')

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save with consistent extension
    output_path = OUTPUT_DIR / filename
    return save_file(df_output, output_path, file_format=file_format, **kwargs)

def save_to_m2_format(df, filename, file_format='csv', **kwargs):
    """
    Write the M2 export for one store.

    The export contains the 'Toko', 'SKU', 'HPP' and
    'final_updated_regular_po_qty' columns for rows whose regular PO quantity
    is greater than zero, saved under OUTPUT_M2_DIR.

    Args:
        df: Input DataFrame
        filename: Output filename (with or without extension)
        file_format: 'csv' or 'xlsx'
        **kwargs: Additional arguments for save_file

    Returns:
        Whatever save_file returns for the written file.
    """
    m2_columns = ['Toko', 'SKU', 'HPP', 'final_updated_regular_po_qty']

    # Keep only the rows that actually carry a regular PO quantity
    has_regular_po = df['final_updated_regular_po_qty'] > 0
    df_output = df.loc[has_regular_po, m2_columns]

    # Make sure the destination folder is there before writing
    OUTPUT_M2_DIR.mkdir(parents=True, exist_ok=True)

    return save_file(
        df_output,
        OUTPUT_M2_DIR / filename,
        file_format=file_format,
        **kwargs
    )

def save_to_emergency_po_format(df, filename, file_format='csv', **kwargs):
    """
    Save emergency PO format file with consistent extension.

    Only rows with ``emergency_po_qty`` > 0 are written, restricted to the
    Brand/SKU/Nama/Toko/HPP/emergency quantity and cost columns, under
    OUTPUT_EMERGENCY_DIR.

    Args:
        df: Input DataFrame
        filename: Output filename (with or without extension)
        file_format: 'csv' or 'xlsx'
        **kwargs: Additional arguments for save_file

    Returns:
        The value returned by save_file (the path that was written).
    """
    emergency_columns = [
        'Brand', 'SKU', 'Nama', 'Toko', 'HPP',
        'emergency_po_qty', 'emergency_po_cost'
    ]

    # Filter to only include rows with emergency PO qty > 0. The explicit copy
    # makes the SKU assignment below act on an independent frame instead of a
    # column slice (which triggers SettingWithCopyWarning).
    df_output = df.loc[df['emergency_po_qty'] > 0, emergency_columns].copy()

    # Ensure SKU stays textual. save_file applies the ="..." wrapper itself for
    # xlsx, so pre-wrapping is limited to the other formats to avoid ="="...""
    if file_format != 'xlsx':
        df_output['SKU'] = df_output['SKU'].apply(lambda x: f'="{x}"')

    # Ensure output directory exists
    OUTPUT_EMERGENCY_DIR.mkdir(parents=True, exist_ok=True)

    # Save with consistent extension
    output_path = OUTPUT_EMERGENCY_DIR / filename
    return save_file(df_output, output_path, file_format=file_format, **kwargs)

def main():
    """Run the PO pipeline for every workbook in RAWPO_XLSX_DIR.

    Loads the supplier, store-contribution and Padang reference data, then for
    each ``*.xlsx`` file (in sorted order) processes it and writes the
    complete, M2 and emergency-PO exports. A summary row is collected per
    file, including files that failed inside process_po_file, and shown at
    the end together with the first rows of the last successfully processed
    file.
    """
    # Load data
    supplier_df = load_supplier_data(SUPPLIER_PATH)
    store_contrib = load_store_contribution(STORE_CONTRIBUTION_PATH)
    all_summaries = []
    last_merged_df = None  # Last frame that made it through the whole pipeline

    # get padang df first
    df_padang = load_padang_data('data/rawpo/xlsx/1. Miss Glam Padang.xlsx')

    # Process each PO file
    for file_path in sorted(RAWPO_XLSX_DIR.glob('*.xlsx')):
        print(f"Reading file path: {file_path}")
        try:
            merged_df, summary = process_po_file(file_path, supplier_df, store_contrib, df_padang)

            if merged_df is None:
                # process_po_file already printed the error; keep its summary
                # row (status = "Error: ...") and move on without saving.
                all_summaries.append(summary)
                continue

            # The complete-format path is the one reported in the summary
            output_path = save_to_complete_format(merged_df, file_path.stem)
            save_to_m2_format(merged_df, file_path.stem)
            save_to_emergency_po_format(merged_df, file_path.stem)

            summary['output_path'] = str(output_path)

            # Print progress
            print(f"  - Location: {summary['location']}")
            print(f"  - Contribution: {summary['contribution_pct']}%")
            print(f"  - Rows processed: {summary['total_rows']}")
            print(f"  - 'Miss Glam Padang' suppliers: {summary['padang_suppliers']} rows")
            print(f"  - Other suppliers: {summary['other_suppliers']} rows")
            print(f"  - No supplier data: {summary['no_supplier']} rows")
            print(f"  - Saved to: {output_path}")

            all_summaries.append(summary)
            last_merged_df = merged_df

        except Exception as e:
            print(f"Error processing {file_path.name}: {str(e)}")
            continue

    # Display final summary
    if all_summaries:
        print("\nProcessing complete! Summary:")
        summary_df = pd.DataFrame(all_summaries)
        display(summary_df)

        # Show a sample of the last successfully processed file
        if last_merged_df is not None:
            print("\nSample of the last processed file:")
            display(last_merged_df.head())
    else:
        print("\nNo files were processed successfully.")

# Entry point: run the full PO pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
