# CedarSim Pre-Removal Analysis
*Comprehensive Impact Analysis Before Data Cleaning*

## Objectives
1. Analyze SKUs with missing lead times (298 SKUs)
2. Analyze unmapped SKUs (197 SKUs)
3. Ensure 229 validation SKUs are preserved
4. Assess business impact of data removals
5. Generate comprehensive impact report


In [2]:
# Import required libraries
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

print("Libraries imported successfully")


Libraries imported successfully


In [3]:
# Load all data sources.
# Produces three frames used by the later cells:
#   sku_data        - sheet '01. Data (Department Rollup)' of the inventory workbook
#   demand_data     - sheet '02. Full Data' of the same workbook
#   validation_data - first sheet of the validation sample workbook
data_dir = Path('data')
inventory_file = data_dir / '2025-07-14_MDRH_Inventory_Storage_Burn_Rates_V3.xlsx'
validation_file = data_dir / '2025-08-04_MDRH_Inventory_Safety_Stock_Sample_Items.xlsx'

# Sheet names are kept in one place so a workbook revision only needs one edit.
SKU_SHEET = '01. Data (Department Rollup)'
DEMAND_SHEET = '02. Full Data'

print(f"Loading data from: {data_dir}")
print(f"Inventory file exists: {inventory_file.exists()}")
print(f"Validation file exists: {validation_file.exists()}")

# Both inventory sheets live in the same workbook. Passing a list of sheet
# names makes pandas open the file once and return a {sheet_name: DataFrame}
# dict, instead of re-opening the workbook for every sheet.
inventory_sheets = pd.read_excel(inventory_file, sheet_name=[SKU_SHEET, DEMAND_SHEET])

# Load main inventory data
sku_data = inventory_sheets[SKU_SHEET]
print(f"SKU data shape: {sku_data.shape}")

# Load historical demand data
demand_data = inventory_sheets[DEMAND_SHEET]
print(f"Demand data shape: {demand_data.shape}")

# Load validation data (default sheet_name=0, i.e. the first sheet)
validation_data = pd.read_excel(validation_file)
print(f"Validation data shape: {validation_data.shape}")


Loading data from: data
Inventory file exists: True
Validation file exists: True
SKU data shape: (6372, 28)
Demand data shape: (86411, 16)
Validation data shape: (229, 27)


In [4]:
# Identify every Department Rollup row whose 'Avg_Lead Time' is null and
# write an audit record of those rows before anything is removed.
print("=== FINDING SKUs WITH MISSING LEAD TIMES ===")

total_sku_rows = len(sku_data)
print(f"Total SKUs in Department Rollup: {total_sku_rows:,}")

# Boolean mask: True on rows where the lead time is absent
lead_time_is_missing = sku_data['Avg_Lead Time'].isnull()
missing_lead_time_skus = sku_data[lead_time_is_missing]
print(f"SKUs with missing lead times: {len(missing_lead_time_skus):,}")

# Audit record: identifying columns plus why/when the row was flagged.
# .assign() returns a new frame, so the source rows are left untouched.
record_columns = [
    'Oracle Item Number',
    'Item Description',
    'Department Name',
    'Supplier Name',
    'Avg Daily Burn Rate',
]
missing_skus_record = missing_lead_time_skus[record_columns].assign(
    Removal_Reason='Missing Lead Time',
    Removal_Date=pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
)

departments_affected = missing_skus_record['Department Name'].nunique()
suppliers_affected = missing_skus_record['Supplier Name'].nunique()

print(f"\n=== MISSING LEAD TIME SKUs RECORD ===")
print(f"Total SKUs to be removed: {len(missing_skus_record)}")
print(f"Departments affected: {departments_affected}")
print(f"Suppliers affected: {suppliers_affected}")

# Preview the first ten flagged rows
print(f"\nSample of SKUs being removed:")
print(missing_skus_record.head(10))

# Persist the audit record next to the notebook
missing_skus_record.to_csv('missing_lead_time_skus_record.csv', index=False)
print(f"\nRecord saved to: missing_lead_time_skus_record.csv")


=== FINDING SKUs WITH MISSING LEAD TIMES ===
Total SKUs in Department Rollup: 6,372
SKUs with missing lead times: 298

=== MISSING LEAD TIME SKUs RECORD ===
Total SKUs to be removed: 298
Departments affected: 20
Suppliers affected: 35

Sample of SKUs being removed:
    Oracle Item Number                                   Item Description  \
254             683374                       Cannula Inner 10mm Old Style   
274             803852             Catheter Follower Phillips Woven  16fr   
309             803918                    Kit Cath Suction 6Fr With Glove   
356             804011                     Support Lumbar-Sacrl Med 30-36   
544             605852                          Fltr Air Rear Pnl Sta Sys   
621             677644        Test Kit Alere Influenza A&B Latex Free 24T   
643             677737  Test BinaxNow Malaria Latex Free Alere 10 per ...   
660             714790    Kit Collection SWAB Genexpertobgyn STR 50 Tests   
661             714803           Diagnost

In [5]:
# Drop every demand record whose 'Oracle Item Number' belongs to a SKU that
# was flagged for a missing lead time, then save the filtered demand data.
# NOTE(review): removal here is by item number, while the SKU sheet is
# filtered row-by-row on 'Avg_Lead Time' - presumably item numbers are unique
# in the rollup sheet, otherwise the two outputs can disagree; TODO confirm.
print("=== CLEANING DEMAND RECORDS ===")

records_before = len(demand_data)
print(f"Total demand records before cleaning: {records_before:,}")

# Item numbers of the flagged SKUs
skus_to_remove = missing_lead_time_skus['Oracle Item Number'].tolist()
print(f"SKUs to remove from demand data: {len(skus_to_remove)}")

# Keep only the rows that do NOT reference a flagged item number
belongs_to_flagged_sku = demand_data['Oracle Item Number'].isin(skus_to_remove)
demand_data_cleaned = demand_data.loc[~belongs_to_flagged_sku]
records_after = len(demand_data_cleaned)
print(f"Demand records after removing missing lead time SKUs: {records_after:,}")
print(f"Demand records removed: {records_before - records_after:,}")

# The demand sheet carries its own 'Avg_Lead Time' column; count what is still null
remaining_missing_lead_times = demand_data_cleaned['Avg_Lead Time'].isna().sum()
print(f"Remaining missing lead times in demand data: {remaining_missing_lead_times}")

# Write the filtered demand data to disk
demand_data_cleaned.to_csv('demand_data_cleaned.csv', index=False)
print(f"Cleaned demand data saved to: demand_data_cleaned.csv")


=== CLEANING DEMAND RECORDS ===
Total demand records before cleaning: 86,411
SKUs to remove from demand data: 298
Demand records after removing missing lead time SKUs: 85,603
Demand records removed: 808
Remaining missing lead times in demand data: 0
Cleaned demand data saved to: demand_data_cleaned.csv


In [6]:
# Build the cleaned SKU inventory (rows with a non-null 'Avg_Lead Time'),
# save it, and print a before/after summary for both SKU and demand data.
print("=== CREATING CLEAN SKU INVENTORY DATA ===")

# dropna(subset=...) returns a new frame holding only rows with a lead time
sku_data_cleaned = sku_data.dropna(subset=['Avg_Lead Time'])

sku_rows_before = len(sku_data)
sku_rows_after = len(sku_data_cleaned)
sku_rows_removed = sku_rows_before - sku_rows_after
print(f"SKU data before cleaning: {sku_rows_before:,}")
print(f"SKU data after cleaning: {sku_rows_after:,}")
print(f"SKUs removed: {sku_rows_removed:,}")

# Sanity check: count of null lead times left in the cleaned frame
remaining_missing_lead_times_sku = sku_data_cleaned['Avg_Lead Time'].isna().sum()
print(f"Remaining missing lead times in SKU data: {remaining_missing_lead_times_sku}")

# Persist the cleaned SKU data
sku_data_cleaned.to_csv('sku_data_cleaned.csv', index=False)
print(f"Cleaned SKU data saved to: sku_data_cleaned.csv")

# Summary report: share of rows retained, as a percentage of the original
sku_completeness_pct = sku_rows_after / sku_rows_before * 100
print(f"\n=== CLEANING SUMMARY REPORT ===")
print(f"Original SKUs: {sku_rows_before:,}")
print(f"SKUs removed (missing lead times): {sku_rows_removed:,}")
print(f"Clean SKUs remaining: {sku_rows_after:,}")
print(f"Data completeness: {sku_completeness_pct:.1f}%")

demand_rows_before = len(demand_data)
demand_rows_after = len(demand_data_cleaned)
demand_completeness_pct = demand_rows_after / demand_rows_before * 100
print(f"\nOriginal demand records: {demand_rows_before:,}")
print(f"Demand records removed: {demand_rows_before - demand_rows_after:,}")
print(f"Clean demand records remaining: {demand_rows_after:,}")
print(f"Demand data completeness: {demand_completeness_pct:.1f}%")


=== CREATING CLEAN SKU INVENTORY DATA ===
SKU data before cleaning: 6,372
SKU data after cleaning: 6,074
SKUs removed: 298
Remaining missing lead times in SKU data: 0
Cleaned SKU data saved to: sku_data_cleaned.csv

=== CLEANING SUMMARY REPORT ===
Original SKUs: 6,372
SKUs removed (missing lead times): 298
Clean SKUs remaining: 6,074
Data completeness: 95.3%

Original demand records: 86,411
Demand records removed: 808
Clean demand records remaining: 85,603
Demand data completeness: 99.1%


In [None]:
# Create simulation-ready Excel file with clean data.
# Writes four sheets into one workbook: cleaned SKU inventory, cleaned demand
# data, the validation sample (for reference) and the removal audit record.
print("=== CREATING SIMULATION-READY EXCEL FILE ===")

output_file = 'CedarSim_Simulation_Ready_Data.xlsx'

# Single source of truth for the workbook layout:
#   sheet name -> (frame to write, unit label used in the printed summary).
# Driving both the writer and the summary from this mapping keeps the
# reported sheet names from drifting away from what is actually written.
output_sheets = {
    '01_SKU_Inventory_Clean': (sku_data_cleaned, 'SKUs'),
    '02_Demand_Data_Clean': (demand_data_cleaned, 'demand records'),
    '03_Validation_Sample': (validation_data, 'validation SKUs'),
    '04_Removed_SKUs_Record': (missing_skus_record, 'removed SKUs'),
}

# The context manager saves and closes the workbook on exit
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for sheet_name, (frame, _unit) in output_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Simulation-ready Excel file created: {output_file}")
print("Sheets created:")
for sheet_name, (frame, unit) in output_sheets.items():
    print(f"  - {sheet_name}: {len(frame):,} {unit}")

# Verify the file was created (os is imported in the notebook's import cell)
if os.path.exists(output_file):
    file_size = os.path.getsize(output_file) / (1024*1024)  # Size in MB
    print(f"\nFile size: {file_size:.2f} MB")
    print("✅ Simulation-ready data file created successfully!")
else:
    print("❌ Error creating file")


=== CREATING SIMULATION-READY EXCEL FILE ===
Simulation-ready Excel file created: CedarSim_Simulation_Ready_Data.xlsx
Sheets created:
  - 01_SKU_Inventory_Clean: 6,074 SKUs
  - 02_Demand_Data_Clean: 85,603 demand records
  - 03_Validation_Sample: 229 validation SKUs
  - 04_Removed_SKUs_Record: 298 removed SKUs

File size: 5.59 MB
✅ Simulation-ready data file created successfully!


: 